## TFIDF First Simple Model
Logistic Regression out of the box, n-grams=1

In [139]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline

import re

import warnings
warnings.filterwarnings('ignore')

In [65]:
df = pd.read_csv('processed_text')
df.head()

Unnamed: 0,Grade,Text,lemmatized,grammarized
0,1,I had just gone to Chobot Space and Science Ce...,i have just go to chobot space and science cen...,I had just gone to + chobot + space and + scie...
1,1,My cat is fluffy. His name is Buzz. He is my f...,my cat be fluffy his name be buzz he be my fav...,+ my cat is fluffy . + his name is + buzz . + ...
2,1,Spring is sweet because we can go boat riding ...,spring be sweet because we can go boat riding ...,+ spring is sweet because we can go boat ridin...
3,1,One day baby Josh came home. He was in a yello...,one day baby josh come home he be in a yellow ...,+ one day baby + josh came home . + he was in ...
4,1,One time I went to Mexico. It was a blast! I m...,one time i go to mexico it be a blast i meet p...,+ one time I went to + mexico . + it was a bla...


In [158]:
def assess_model(model, scores, X_train, y_train, ngram_range=(1,3)):
    for ngram in range(ngram_range[0],ngram_range[1]+1):
        lr_count_pipe = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,ngram))),
                                  ('logreg', model)])

        lemma_scores = [round(x,3)*100 for x in cross_val_score(lr_count_pipe, X_train['lemmatized'], y_train, cv=3)]
        grammar_scores = [round(x,3)*100 for x in cross_val_score(lr_count_pipe, X_train['grammarized'], y_train, cv=3)]
        scores = scores.append({'model':'Decision Tree Classifier',
                        'encoding':'Count Vectors',
                        'ngram':ngram,
                       'lemmas':lemma_scores,
                      'grammar':grammar_scores},
                      ignore_index=True)

        lr_tfidf_pipe = Pipeline([('vectorizer', TfidfVectorizer(ngram_range=(1,ngram))),
                              ('logreg', model)])

        lemma_scores = [round(x,3)*100 for x in cross_val_score(lr_tfidf_pipe, X_train['lemmatized'], y_train, cv=3)]
        grammar_scores = [round(x,3)*100 for x in cross_val_score(lr_tfidf_pipe, X_train['grammarized'], y_train, cv=3)]
        scores = scores.append({'model':'Decision Tree Classifier',
                        'encoding':'TF-IDF Vectors',
                        'ngram':ngram,
                       'lemmas':lemma_scores,
                      'grammar':grammar_scores},
                      ignore_index=True)
        print('finished ngram', ngram)
        
    return scores
                 

In [152]:
X = df[['lemmatized', 'grammarized']]
y = df.Grade
## Split lemma train and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=11)
scores = pd.DataFrame()

In [154]:
scores = assess_model(DecisionTreeClassifier(), scores, X_train, y_train, ngram_range=(1,3))

finished ngram 1
finished ngram 2
finished ngram 3


In [156]:
scores = assess_model(LogisticRegression(), scores, X_train, y_train, ngram_range=(1,3))

finished ngram 1
finished ngram 2
finished ngram 3


In [157]:
scores

Unnamed: 0,encoding,grammar,lemmas,model,ngram
0,Count Vectors,"[0.259, 0.296, 0.212]","[0.259, 0.321, 0.175]",Decision Tree Classifier,1.0
1,TF-IDF Vectors,"[0.198, 0.185, 0.25]","[0.173, 0.21, 0.238]",Decision Tree Classifier,1.0
2,Count Vectors,"[0.272, 0.272, 0.2]","[0.136, 0.284, 0.188]",Decision Tree Classifier,2.0
3,TF-IDF Vectors,"[0.185, 0.247, 0.225]","[0.136, 0.173, 0.238]",Decision Tree Classifier,2.0
4,Count Vectors,"[0.259, 0.333, 0.225]","[0.173, 0.284, 0.188]",Decision Tree Classifier,3.0
5,TF-IDF Vectors,"[0.136, 0.222, 0.225]","[0.16, 0.222, 0.212]",Decision Tree Classifier,3.0
6,Count Vectors,"[0.235, 0.272, 0.188]","[0.222, 0.185, 0.175]",Decision Tree Classifier,1.0
7,TF-IDF Vectors,"[0.198, 0.259, 0.188]","[0.235, 0.235, 0.2]",Decision Tree Classifier,1.0
8,Count Vectors,"[0.247, 0.259, 0.175]","[0.222, 0.185, 0.162]",Decision Tree Classifier,2.0
9,TF-IDF Vectors,"[0.198, 0.198, 0.15]","[0.21, 0.185, 0.162]",Decision Tree Classifier,2.0


In [159]:
df.isna().sum()

Grade          0
Text           0
lemmatized     0
grammarized    0
dtype: int64