# Models to try
- kNN
- SVM
- Random Forest

In [106]:
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from transformers import AutoTokenizer

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

nltk.download('stopwords', download_dir='../../Resources')
nltk.data.path.append('../../Resources')

[nltk_data] Downloading package stopwords to ../../Resources...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
STOPWORDS = set(stopwords.words('english'))
ps = PorterStemmer()

def remove_stopwords(text):
    return ' '.join([word for word in str(text).split() if word not in STOPWORDS])

def data_preprocess(text, remove_punctuation=True):
    if remove_punctuation: 
        text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = text.strip()
    text = re.sub(r'\n', '', text)
    text = text.lower()
    text = ps.stem(text)
    text = remove_stopwords(text)
    return text

In [6]:
orig_data = pd.read_csv('../Data/train.csv')
orig_data

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


In [7]:
aug_data = pd.read_csv('../Data/aug_data.csv')
aug_data

Unnamed: 0,full_text
0,"Dear local newspaper, I think effects computer..."
1,"Dear Generic_Name Generic_Name, I believe that..."
2,"Dear, Generic_Name Generic_Name Generic_Name M..."
3,"Dear Local Newspaper, Generic_Name I have foun..."
4,"Dear LOCATION_NAME, I know having computers ha..."
...,...
5870,In most stories mothers and daughters are eit...
5871,I never understood the meaning laughter is th...
5872,"When you laugh, is Generic_Name out of habit, ..."
5873,Trippin' on fen...


In [15]:
orig_data = orig_data[['full_text']].copy()

In [16]:
orig_data.loc[:, 'full_text'] = orig_data['full_text'].apply(data_preprocess)
aug_data.loc[:, 'full_text'] = aug_data['full_text'].apply(data_preprocess)

In [20]:
# make sure data is balanced
aug_data = aug_data.sample(orig_data.shape[0], random_state=42)

In [21]:
orig_data.shape, aug_data.shape

((3911, 1), (3911, 1))

In [22]:
orig_data['target'] = 0
aug_data['target'] = 1

In [23]:
data = pd.concat([orig_data, aug_data], axis=0).reset_index(drop=True)
data

Unnamed: 0,full_text,target
0,"think students would benefit learning home,bec...",0
1,problem change let best matter happening chang...,0
2,"dear, principalif u change school policy grade...",0
3,best time life become yourself. agree greatest...,0
4,small act kindness impact people change people...,0
...,...,...
7817,yes children reading anything thats like that....,1
7818,example remember fun laughter life generic_nam...,1
7819,"many resaons taking book, movie, anything else...",1
7820,"dear generic_name, believe computers benefitua...",1


In [24]:
# shuffle
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data

Unnamed: 0,full_text,target
0,good behavior trying influence per swaying oth...,0
1,"libraies able remove books, movies, magizines ...",1
2,writing students work schools homes. people th...,0
3,agree churchill statement important role failu...,0
4,"dear generic_name, ""generic_name generic_name ...",1
...,...,...
7817,"censorship, sensitive topic. generic_name many...",1
7818,"dear local newspaper, generic_name big agument...",1
7819,teenagers really complicated day's trying extr...,0
7820,ed censorship certain media controversial topi...,1


In [25]:
X, y = data[['full_text']], data['target']

In [27]:
MODEL_NAME = 'distilbert-base-uncased'

In [29]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [90]:
def tokenize(example):
    return np.array(tokenizer(example['full_text'], 
                    max_length=512, 
                    padding='max_length', 
                    truncation=True, 
                    add_special_tokens=True,
                    return_tensors=None)['input_ids'])

In [97]:
X = pd.DataFrame(list(X.apply(tokenize, axis=1).values), columns=list(range(512)))

In [98]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,101,2204,5248,2667,3747,2566,21826,2500,1029,11082,...,0,0,0,0,0,0,0,0,0,0
1,101,5622,10024,3111,2583,6366,2808,1010,5691,1010,...,0,0,0,0,0,0,0,0,0,0
2,101,3015,2493,2147,2816,5014,1012,2111,2228,2190,...,0,0,0,0,0,0,0,0,0,0
3,101,5993,10888,4861,2590,2535,4945,3248,8463,3112,...,0,0,0,0,0,0,0,0,0,0
4,101,6203,12391,1035,2171,1010,1000,12391,1035,2171,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7817,101,15657,1010,7591,8476,1012,12391,1035,2171,2116,...,0,0,0,0,0,0,0,0,0,0
7818,101,6203,2334,3780,1010,12391,1035,2171,2502,12943,...,0,0,0,0,0,0,0,0,0,0
7819,101,12908,2428,8552,2154,1005,1055,2667,4469,1016,...,0,0,0,0,0,0,0,0,0,0
7820,101,3968,15657,3056,2865,6801,8476,1012,3251,3056,...,0,0,0,0,0,0,0,0,0,0


In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# kNN

In [116]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

In [117]:
knn.score(X_test, y_test)

0.5615801704105344

# SVM

In [104]:
svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))
svm.fit(X_train, y_train)

In [105]:
svm.score(X_test, y_test)

0.6250968241673122

# Random Forest

In [120]:
rf = RandomForestClassifier(max_depth=25, random_state=42)
rf.fit(X_train, y_train)

In [121]:
rf.score(X_test, y_test)

0.7358636715724245