<a href="https://colab.research.google.com/github/ErenCodeTitan/ErenCodeTitan/blob/main/Automate_detection_and_recognition_of_grammatical_errors_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
# Load the "en_core_web_sm" pipeline.
# NOTE: this rebinds the name `spacy` from the module to the loaded pipeline
# object. Later cells call `spacy(text)` and read `spacy.pipe_names`, so the
# name has to keep pointing at the pipeline.
spacy = spacy.load("en_core_web_sm")

In [None]:
# Show the names of the components in the loaded pipeline.
spacy.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; other cells read their input files
# through the relative path 'gdrive/My Drive/...'.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read the sample text inside a `with` block so the file handle is closed as
# soon as the read finishes instead of lingering until garbage collection.
# The path is relative, so this presumably relies on the working directory
# being the folder that contains the `gdrive` mount point -- TODO confirm.
with open('gdrive/My Drive/Colab Notebooks/text/text.txt') as text_file:
  docx = text_file.read()
# Run the loaded pipeline on the raw text; from here on `docx` is the parsed
# document rather than the string.
docx = spacy(docx)
# Show the first token of the parsed document.
print(docx[0])

SpaCy


In [None]:
# Materialise the document's sentences as a list so the cell displays them.
list(docx.sents)

[SpaCy is an open-source software library for advanced NLP in Python. ,
 It is designed specifically for production use and helps you build applications 
 that process and understand large volumes of text.]

In [None]:
# Map each token's `pos` value to the matching `pos_` string.
all_tags = dict((tok.pos, tok.pos_) for tok in docx)

# Print every token of the first sentence next to its `tag_`.
first_sentence = list(docx.sents)[0]
for word in first_sentence:
  print(word, word.tag_)

SpaCy NNP
is VBZ
an DT
open JJ
- HYPH
source NN
software NN
library NN
for IN
advanced JJ
NLP NNP
in IN
Python NNP
. .

 _SP


In [None]:
# Collect the text of every noun chunk in the document.
[chunk.text for chunk in docx.noun_chunks]

['SpaCy',
 'an open-source software library',
 'advanced NLP',
 'Python',
 'It',
 'production use',
 'you',
 'applications',
 'that process',
 'large volumes',
 'text']

In [None]:
from spacy import displacy
# Render the document twice: once with the 'dep' style, once with the 'ent' style.
displacy.render(docx, style='dep')
displacy.render(docx, style='ent')

In [None]:
#define some parameters
# Coarse POS tags (token.pos_) that mark a token as noise. spaCy's coarse tags
# follow the Universal POS tag set, where proper nouns are "PROPN"; "PROP" is
# not a tag in that set, so a list containing it can never match a token.
noisy_pos_tags = ["PROPN"]
# Tokens whose text has this many characters or fewer count as noise.
min_token_length = 2

#Function to check if the token is a noise or not
def isNoise(token):
  """Return True when `token` should be excluded from the word counts.

  A token is noise if any of the following holds:
    * its coarse POS tag (`token.pos_`) is listed in `noisy_pos_tags`;
    * it is flagged as a stop word (`token.is_stop`);
    * its text (`str(token)`) is at most `min_token_length` characters long.
  """
  return (
    token.pos_ in noisy_pos_tags
    or bool(token.is_stop)
    or len(str(token)) <= min_token_length
  )
def cleanup(token, lower = True):
  """Return `token` without surrounding whitespace, lower-cased unless `lower` is falsy."""
  text = token.lower() if lower else token
  return text.strip()

#top unigrams used in the reviews
from collections import Counter
cleaned_list = [cleanup(str(word)) for word in docx if not isNoise(word)]
# Print explicitly: a bare expression that is not the last statement of a cell
# is evaluated and then thrown away, so the counts would otherwise never appear.
print(Counter(cleaned_list).most_common(5))

# Entity Detection
print("\nEntity Detection...\n")
label = {w.label_ for w in docx.ents}
for i in label:
  # Unique surface forms of the entities carrying label `i` (original casing kept).
  entities = list({cleanup(str(e), lower=False) for e in docx.ents if i == e.label_})
  print(i, entities)

#Displaying tokens
# One line per token: text, tag_, text of its syntactic head, dependency label.
for token in docx:
  print (token.text, token.tag_, token.head.text, token.dep_)
#  Displaying Noun Phrases
print("\nDisplaying Noun Phrases")
# The loop variable is deliberately not named `np`, so it cannot clobber the
# numpy alias that another cell of this notebook imports under that name.
for chunk in docx.noun_chunks:
  print(chunk.text, chunk.root.dep_, chunk.root.head.text)




Entity Detection...

ORG ['NLP']
GPE ['Python']
SpaCy NNP is nsubj
is VBZ is ROOT
an DT library det
open JJ source amod
- HYPH source punct
source NN library compound
software NN library compound
library NN is attr
for IN library prep
advanced JJ NLP amod
NLP NNP for pobj
in IN NLP prep
Python NNP in pobj
. . is punct

 _SP . dep
It PRP designed nsubjpass
is VBZ designed auxpass
designed VBN designed ROOT
specifically RB designed advmod
for IN designed prep
production NN use compound
use NN for pobj
and CC designed cc
helps VBZ designed conj
you PRP build nsubj
build VB helps ccomp
applications NNS build dobj

 _SP applications dep
that DT process det
process NN build dobj
and CC build cc
understand VB build conj
large JJ volumes amod
volumes NNS understand dobj
of IN volumes prep
text NN of pobj
. . designed punct

Displaying Noun Phrases
SpaCy nsubj is
an open-source software library attr is
advanced NLP pobj for
Python pobj in
It nsubjpass designed
production use pobj for
you nsubj

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import string
# All ASCII punctuation characters as a single string; spacy_tokenizer checks
# tokens against it with `in`.
punctuations = string.punctuation

from spacy.lang.en import English
# Instantiate spaCy's `English` language class.
# NOTE(review): `pars` is not referenced by any cell visible in this notebook.
pars = English()

#Custom transformer that normalises raw text before it is vectorized
from sklearn.base import BaseEstimator

class predictor(TransformerMixin, BaseEstimator):
  """Stateless pipeline step that applies `clean_text` to every input string.

  The class derives from BaseEstimator (mixin listed first, as scikit-learn
  recommends) so that it gets the standard estimator machinery -- get_params,
  set_params, cloning support and estimator tags -- instead of only the
  fit_transform helper provided by TransformerMixin. Because there are no
  constructor arguments, the inherited get_params() returns an empty dict.
  """
  def fit(self, X, y=None, **fit_params):
    """Learn nothing; return self so the step can be chained inside a Pipeline."""
    return self
  def transform(self, X, **transform_params):
    """Return a list holding `clean_text(text)` for each element of X."""
    return [clean_text(text) for text in X]

#Basic Utility function to clean the text

def clean_text(text):
  """Strip surrounding whitespace from `text`, then lower-case what is left."""
  stripped = text.strip()
  return stripped.lower()

def spacy_tokenizer(sentence):
  """Tokenize `sentence` with the loaded pipeline and return filtered lemmas.

  Each token contributes its lower-cased, stripped lemma, except tokens whose
  lemma is the "-PRON-" placeholder, which contribute their lower-cased text.
  Entries found in `sklearn_stop_words` or in the `punctuations` string are
  then dropped.
  """
  lemmas = []
  for tok in spacy(sentence):
    if tok.lemma_ != "-PRON-":
      lemmas.append(tok.lemma_.lower().strip())
    else:
      lemmas.append(tok.lower_)
  # `punctuations` is a string, so `in` is a substring test against it.
  return [lemma for lemma in lemmas
          if not (lemma in sklearn_stop_words or lemma in punctuations)]

#create vectorizer object to generate feature vectors, we will use custom spacy tokenizer
# ngram_range=(1,1) restricts the features to single tokens (unigrams).
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
classifier = LinearSVC()

#create the pipeline to clean, tokenize, vectorize and classify
# NOTE(review): `pipe` is not fitted or used in the cells visible here; another
# cell builds a separate `pipe_line` from the same kinds of steps.
pipe = Pipeline([("cleaner", predictor()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])




In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import FunctionTransformer


In [None]:
# (name, estimator) pairs handed to the voting ensemble below.
base_classifiers = [
    ('knn', KNeighborsClassifier()),
    ('tree', DecisionTreeClassifier()),
    ('nb' , GaussianNB()),
   # NOTE(review): second default KNeighborsClassifier, configured exactly like
   # 'knn'; presumably this just gives KNN two votes -- confirm it is intended.
   ('KNC' , KNeighborsClassifier()),
    # NOTE(review): named 'xgb', but the estimator is scikit-learn's
    # GradientBoostingClassifier.
    ('xgb' , GradientBoostingClassifier()),
    ('svc',LinearSVC())
]
from sklearn.ensemble import VotingClassifier
voting_classifier = VotingClassifier(base_classifiers)

# Pipeline: clean text -> count vectors -> x.toarray() -> voting ensemble.
# NOTE(review): the `vectorizer` object is the same instance used by `pipe`, so
# fitting one pipeline refits the vectorizer seen by the other.
pipe2 = Pipeline([("cleaner", predictor()),
                 ('vectorizer', vectorizer),
                 ('dense_transformer', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
                 ('classifier',voting_classifier)
])



In [None]:
import pandas as pd
import numpy as np

# Load the training data from the Drive folder (relative path).
df_train = pd.read_csv("gdrive/My Drive/Colab Notebooks/datasets/training_data.csv")

In [None]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,input,labels
0,I am reading score of Mahler is Symphony No . .,0
1,I am not interested in cars or electric applia...,1
2,This is my homework for my English class .,0
3,"In comparison , Canada is catches increased an...",0
4,"Fortunately , my older sister is friend is a d...",1


In [None]:
# Cast the label column to object dtype before writing strings into it.
# Assigning 'incorrect' into a numeric column relies on silent upcasting, which
# pandas has deprecated (FutureWarning) and intends to turn into an error.
df_train['labels'] = df_train['labels'].astype(object)
# Relabel class 0 as 'incorrect'.
df_train.loc[df_train['labels']==0, 'labels']='incorrect'

  df_train.loc[df_train['labels']==0, 'labels']='incorrect'


In [None]:
# Relabel class 1 as 'correct'.
df_train.loc[df_train['labels']==1, 'labels']='correct'

In [None]:
# Build (first column, second column) pairs for every row.
# Selecting whole columns by position with .iloc[:, k] avoids the integer-key
# lookup on a label-indexed row Series (`df.iloc[i][0]`), which pandas has
# deprecated, and avoids constructing one Series per row.
train = list(zip(df_train.iloc[:, 0], df_train.iloc[:, 1]))

  train=[(df_train.iloc[i][0],df_train.iloc[i][1]) for i in range(len(df_train))]


In [None]:
# Show only the first few pairs; displaying the whole list floods the notebook.
train[:5]

[('I am reading score of Mahler is Symphony No . .', 'incorrect'),
 ('I am not interested in cars or electric appliances .', 'correct'),
 ('This is my homework for my English class .', 'incorrect'),
 ('In comparison , Canada is catches increased and decreased at steadier rate , although similarly to US .',
  'incorrect'),
 ('Fortunately , my older sister is friend is a dentist .', 'correct'),
 ('Also , some recycling methods for plastics use more oil to use machinery than making new one from oil .',
  'correct'),
 ('Comparison Contrast', 'correct'),
 ('See you .', 'incorrect'),
 ('Talking with Kyouke from about to .', 'correct'),
 ('biodiversity n .', 'incorrect'),
 ('I understand it is a dream .', 'correct'),
 ('I request many things to her so often .', 'incorrect'),
 ('Working - Holiday is a system for youngmen to taravel abroad with working in some country .',
  'incorrect'),
 ('Nice to meet you .', 'incorrect'),
 ('Is not that easier than the Taiwanese one ?', 'correct'),
 ('I coul

In [None]:
# Load the validation data from the Drive folder (relative path); it is used as
# the test set in the cells that follow.
df_test = pd.read_csv("gdrive/My Drive/Colab Notebooks/datasets/validation_data.csv")


In [None]:
# Cast the label column to object dtype before writing strings into it.
# Assigning 'incorrect' into a numeric column relies on silent upcasting, which
# pandas has deprecated (FutureWarning) and intends to turn into an error.
df_test['labels'] = df_test['labels'].astype(object)
# Relabel class 0 as 'incorrect'.
df_test.loc[df_test['labels']==0, 'labels']='incorrect'

  df_test.loc[df_test['labels']==0, 'labels']='incorrect'


In [None]:
# Relabel class 1 as 'correct'.
df_test.loc[df_test['labels']==1, 'labels']='correct'

In [None]:
# Build (first column, second column) pairs for every row.
# Selecting whole columns by position with .iloc[:, k] avoids the integer-key
# lookup on a label-indexed row Series (`df.iloc[i][0]`), which pandas has
# deprecated, and avoids constructing one Series per row.
test = list(zip(df_test.iloc[:, 0], df_test.iloc[:, 1]))

  test = [(df_test.iloc[i][0],df_test.iloc[i][1]) for i in range(len(df_test))]


In [None]:
# Show only the first few pairs; displaying the whole list floods the notebook.
test[:5]

[('It was the same thing .', 'incorrect'),
 ('I can study idioms a lot ', 'correct'),
 ('I just bet some coins for numbers or colors .', 'incorrect'),
 ('Yesterday , I was checking some e - mails on business matters with my own laptop at home since I was commanded to wait at home by my boss .',
  'correct'),
 ('I regret that I have not played the piano very much for the last weeks .',
  'correct'),
 ('I will try to write a diary everyday to record things happened in my daily life and share it with net friend here .',
  'incorrect'),
 ('A TV performer said on a TV program before that when he wore a T - shirt which was a heavy metal band is on his trip to Australia , he was surrounded by men who looked very strong .',
  'correct'),
 ('She is Five years old .', 'incorrect'),
 ('A silicon steamer , which a friend of my ex - colleagues gave me as a wedding gift , will help me to cook vegetables .',
  'correct'),
 ('I do not imagine whether my life is long or short .', 'correct'),
 ('What a 

In [None]:
# Split the (text, label) pairs once instead of rebuilding the lists per call.
train_texts = [x[0] for x in train]
train_labels = [x[1] for x in train]
test_texts = [x[0] for x in test]
test_labels = [x[1] for x in test]

# Clean -> vectorize -> linear SVM.
pipe_line = Pipeline([
    ("cleaner", predictor()),
    ('vectorizer', vectorizer),
    ('classifier', LinearSVC())
])

pipe_line.fit(train_texts, train_labels)
pred_data = pipe_line.predict(test_texts)

# Print a bounded preview of (sample, prediction) pairs. Printing one line per
# test sample produces more output than the notebook front end keeps.
PREVIEW_ROWS = 20
for (sample, pred) in zip(test[:PREVIEW_ROWS], pred_data):
    print(sample, pred)

print("Accuracy:", accuracy_score(test_labels, pred_data))




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
('so i want you to check my draft , ', 'incorrect') correct
('Driving Japanese car , I feel it trustworthy .', 'incorrect') incorrect
(' - Vacation Most of the learners may have a long vacation within three months and it can be an obstacle from continuing to learn .', 'incorrect') correct
('Good morning .', 'correct') incorrect
('You are very good at writing English .', 'incorrect') incorrect
('Over the sugar completed !', 'correct') correct
('Recently Japanese prefers to eat light sweet jam .', 'incorrect') incorrect
('Find a partner to talk in English .', 'correct') correct
('So , I will be happy if you become my friend .', 'incorrect') correct
('next year .', 'correct') correct
('We filled the remaining time by going to a museum to appreciate art .', 'incorrect') incorrect
('when you will talk to children or friends who are not need to be young', 'incorrect') correct
('They came my house before I married .', 'incorrect

In [None]:
# Report the accuracy of `pred_data` against the test labels in a cell of its
# own, away from the per-sample output.
print("Accuracy:", accuracy_score([x[1] for x in test], pred_data))

Accuracy: 0.6137
