<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/dave-updates/code/dave_dummy_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_csv('drive/MyDrive/696/text_difficulty_features.csv',sep='\t')
df.head(5)

Unnamed: 0,original_text,lemmatized_text,d_chall_score,aoa_mean,aoa_min,aoa_max,conc_rating_mean,conc_rating_min,conc_rating_max,num_lemmas,label
0,There is manuscript evidence that Austen conti...,There be manuscript evidence that Austen conti...,8.533221,5.80931,3.57,12.12,2.495517,1.33,4.57,38.0,1
1,"In a remarkable comparative analysis , Mandaea...","In a remarkable comparative analysis , Mandaea...",12.320171,7.402308,2.89,11.94,2.334286,1.46,4.93,21.0,1
2,"Before Persephone was released to Hermes , who...","Before Persephone be release to Hermes , who h...",5.9315,5.231351,2.78,11.17,2.556486,1.43,4.86,40.0,1
3,Cogeneration plants are commonly found in dist...,Cogeneration plant be commonly find in distric...,7.015012,6.742,3.56,11.53,3.369655,1.52,4.93,32.0,1
4,Geneva is the second-most-populous city in Swi...,Geneva be the second-most-populous city in Swi...,7.550745,5.455,3.69,12.62,2.399333,1.43,4.79,22.0,1


In [4]:
df.dtypes

original_text        object
lemmatized_text      object
d_chall_score       float64
aoa_mean            float64
aoa_min             float64
aoa_max             float64
conc_rating_mean    float64
conc_rating_min     float64
conc_rating_max     float64
num_lemmas          float64
label                 int64
dtype: object

In [5]:
# Extract only the columns we'll use for the classifier
df = df[df.columns[:]]

In [6]:
train = pd.read_csv('drive/Shareddrives/Milestone 2/Training_set.csv', 
                      sep='\t',index_col=0)
test = pd.read_csv('drive/Shareddrives/Milestone 2/Testing_set.csv', sep='\t',
                     index_col=0)

In [7]:
X_train = train[train.columns[:-1]]
y_train = train[train.columns[-1]]

X_test = test[test.columns[:-1]]
y_test = test[test.columns[-1]]

In [8]:
metrics_df = {"model_name":[],
              "model_instance":[],
              "train_accuracy": [],
              "train_precision": [],
              "train_recall": [],
              "train_aucprc":[],
              "train_aucroc": [],
              "test_accuracy":[],
              "test_precision":[],
              "test_recall":[],
              "test_aucprc":[],
              "test_aucroc": []
              }

In [9]:
from sklearn. metrics import precision_score, recall_score, precision_recall_curve, roc_auc_score, auc

In [10]:
def compute_metrics(model,model_name,X_train_data,y_train_data, X_test_data, y_test_data):
    # extract the model's hyperparameters and make that part of the identifier for the model 

    metrics_df['model_name'].append(model_name)
    metrics_df['model_instance'].append(model)
   
    # run train data
    y_pred_train = model.predict(X_train_data)
    acc = model.score(X_train_data, y_train)
    recall = recall_score(y_train_data,y_pred_train)
    precision = precision_score(y_train_data, y_pred_train)
    y_proba_train = model.predict_proba(X_train_data)[:,1]
    prc,rec,thres = precision_recall_curve(y_train_data, y_proba_train)
    auc_prc = auc(rec,prc)
    roc = roc_auc_score(y_train_data, y_proba_train)
   
   
    metrics_df['train_accuracy'].append(acc)
    metrics_df['train_recall'].append(recall)
    metrics_df['train_precision'].append(precision)
    metrics_df['train_aucprc'].append(auc_prc)
    metrics_df['train_aucroc'].append(roc)


    #run test data
    y_pred = model.predict(X_test_data)
    acc = model.score(X_test_data, y_test)
    recall = recall_score(y_test,y_pred)
    precision = precision_score(y_test, y_pred)
    y_proba = model.predict_proba(X_test_data)[:,1]
    prc,rec,thres = precision_recall_curve(y_test, y_proba)
    auc_prc = auc(rec,prc)
    roc = roc_auc_score(y_test, y_proba)

    metrics_df['test_accuracy'].append(acc)
    metrics_df['test_recall'].append(recall)
    metrics_df['test_precision'].append(precision)
    metrics_df['test_aucprc'].append(auc_prc)
    metrics_df['test_aucroc'].append(roc)








### **Dummy Classifier with Tfidf Vectorizer**

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [28]:
pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', DummyClassifier())
                    ])

parameters = {
    'tfidf__min_df': [25,30],
    'tfidf__stop_words': ['english'],
    'tfidf__ngram_range': [(1, 2)],
    'clf__strategy': ['most_frequent', 'prior', 'stratified', 'uniform']
}

In [29]:
# X_train_vec = tf_vec.fit_transform(X_train['lemmatized_text'])

In [25]:
from sklearn.model_selection import GridSearchCV


In [30]:
grid_dc_f1 = GridSearchCV(pipeline,
                           param_grid=parameters,
                           scoring='f1',
                           n_jobs=-1,
                           cv=3)
grid_dc_f1.fit(X_train['lemmatized_text'],y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', DummyClassifier())]),
             n_jobs=-1,
             param_grid={'clf__strategy': ['most_frequent', 'prior',
                                           'stratified', 'uniform'],
                         'tfidf__min_df': [25, 30],
                         'tfidf__ngram_range': [(1, 2)],
                         'tfidf__stop_words': ['english']},
             scoring='f1')

In [33]:
grid_dc_f1.best_params_

{'clf__strategy': 'most_frequent',
 'tfidf__min_df': 25,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [35]:
model_name = 'Tf-idf DC most_freq min_df 25'

compute_metrics(grid_dc_f1.best_estimator_,model_name,
                X_train['lemmatized_text'],
                y_train,X_test['lemmatized_text'], y_test)



In [36]:
metrics_df

{'model_name': ['Tf-idf DC most_freq min_df 25'],
 'model_instance': [Pipeline(steps=[('tfidf',
                   TfidfVectorizer(min_df=25, ngram_range=(1, 2),
                                   stop_words='english')),
                  ('clf', DummyClassifier(strategy='most_frequent'))])],
 'train_accuracy': [0.5189033099771687],
 'train_precision': [0.5189033099771687],
 'train_recall': [1.0],
 'train_aucprc': [0.7594516549885844],
 'train_aucroc': [0.5],
 'test_accuracy': [0.518915909608523],
 'test_precision': [0.518915909608523],
 'test_recall': [1.0],
 'test_aucprc': [0.7594579548042615],
 'test_aucroc': [0.5]}

# Mads Features

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [38]:
cols = ['d_chall_score','aoa_mean','aoa_min','aoa_max','conc_rating_mean','conc_rating_min', 'conc_rating_max','num_lemmas']
X_train_mads_features = X_train.iloc[:][cols]
X_test_mads_features = X_test.iloc[:][cols]

In [41]:
mads_pipeline = Pipeline([('imputer', SimpleImputer(strategy='mean')),
                          ('scaler', StandardScaler()),
                          ('clf', DummyClassifier())
                         ])

parameters = {
    'clf__strategy': ['most_frequent', 'prior', 'stratified', 'uniform']
}

In [42]:
grid_dc_mads_f1 = (GridSearchCV(mads_pipeline,
                                  param_grid=parameters,
                                  scoring='f1',
                                  n_jobs=-1, 
                                  cv=3).fit(X_train_mads_features,y_train))

In [44]:
grid_dc_mads_f1.best_estimator_

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('clf', DummyClassifier(strategy='most_frequent'))])

In [46]:
model_name = 'MADS DC most_freq stdsclr'
compute_metrics(grid_dc_mads_f1.best_estimator_,
                model_name,
                X_train_mads_features,
                y_train,
                X_test_mads_features,
                y_test)


# tfidf + Mads Features

In [47]:
import scipy
from scipy.sparse import hstack
import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import KBinsDiscretizer

In [49]:
vectorizer = Pipeline([
              ('tfidf', TfidfVectorizer(min_df=25,
                                        stop_words='english',
                                        ngram_range=(1, 2)))
              ])

mads_pipe = Pipeline([
              ('impute_mean',SimpleImputer(strategy='mean')),
              ('scaler',St()),
              ('bin',KBinsDiscretizer(n_bins=4))
              ])

preprocessor = ColumnTransformer(transformers=[
    ('mads',mads_pipe,cols),
    ('text', vectorizer, 'lemmatized_text')
    ])


              
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('clf', DummyClassifier())  # classifier
                     ])

In [55]:
parameters = {
    'clf__strategy': ['most_frequent', 'prior', 'stratified', 'uniform']
}


grid_dc_all_f1= GridSearchCV(pipeline,
                              param_grid=parameters,
                              scoring='f1',
                              n_jobs=-1,
                              cv=3).fit(X_train,y_train)


  "decreasing the number of bins." % jj


In [58]:
grid_dc_all_f1.best_params_

{'clf__strategy': 'most_frequent'}

In [59]:
model_name = 'Tf-idf + Mads DC most freq'

# compute metrics from the test split data
compute_metrics(grid_dc_all_f1.best_estimator_,
                model_name,
                X_train,y_train,
                X_test,y_test)

# Sentence Embeddings

In [60]:
from numpy import loadtxt
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [61]:
embeddings = loadtxt('drive/Shareddrives/Milestone 2/embeddings.csv', delimiter=',')
embeddings

array([[-0.10521097,  0.04652381,  0.09932816, ..., -0.53146267,
         0.24081262,  0.04764764],
       [-0.09595645,  0.28408318,  0.0416585 , ..., -0.6049515 ,
         0.23389882,  0.2116979 ],
       [ 0.00624134,  0.04496894,  0.28006756, ..., -0.21483139,
         0.40460399, -0.12047322],
       ...,
       [-0.05062665,  0.11346165,  0.37695   , ..., -0.35318942,
         0.46983531, -0.04529605],
       [-0.13691955,  0.46627818, -0.0528408 , ..., -0.55561154,
         0.56975963,  0.25214309],
       [-0.23494626,  0.30814424,  0.28275521, ..., -0.56363898,
         0.44196188,  0.01724233]])

In [62]:
embeddings.shape

(395169, 100)

In [63]:
# Combine the embeddings and the original_text + Mads features
emb = pd.DataFrame(embeddings)
new_df = pd.concat([df,emb], axis=1)
new_df.head()

Unnamed: 0,original_text,lemmatized_text,d_chall_score,aoa_mean,aoa_min,aoa_max,conc_rating_mean,conc_rating_min,conc_rating_max,num_lemmas,...,90,91,92,93,94,95,96,97,98,99
0,There is manuscript evidence that Austen conti...,There be manuscript evidence that Austen conti...,8.533221,5.80931,3.57,12.12,2.495517,1.33,4.57,38.0,...,-0.000574,-0.282132,0.048765,0.281763,-0.562752,-0.018829,-0.266238,-0.531463,0.240813,0.047648
1,"In a remarkable comparative analysis , Mandaea...","In a remarkable comparative analysis , Mandaea...",12.320171,7.402308,2.89,11.94,2.334286,1.46,4.93,21.0,...,0.142222,-0.103547,-0.090528,0.095581,-0.276107,0.028346,-0.250517,-0.604951,0.233899,0.211698
2,"Before Persephone was released to Hermes , who...","Before Persephone be release to Hermes , who h...",5.9315,5.231351,2.78,11.17,2.556486,1.43,4.86,40.0,...,-0.042911,-0.127006,-0.027232,0.002538,-0.531988,-0.153132,-0.162387,-0.214831,0.404604,-0.120473
3,Cogeneration plants are commonly found in dist...,Cogeneration plant be commonly find in distric...,7.015012,6.742,3.56,11.53,3.369655,1.52,4.93,32.0,...,0.11304,-0.48043,-0.121593,0.049611,-0.675361,0.358477,-0.145952,-0.454261,0.61885,0.002099
4,Geneva is the second-most-populous city in Swi...,Geneva be the second-most-populous city in Swi...,7.550745,5.455,3.69,12.62,2.399333,1.43,4.79,22.0,...,-0.003146,-0.214928,-0.110112,0.408002,-0.512044,0.292142,-0.294712,-0.306517,0.631877,-0.067776


In [64]:
X_train_indexes = X_train.index
X_train_emb = new_df.iloc[X_train_indexes, 11:]

# Extract just the embedding data from test set
X_test_indexes = X_test.index
X_test_emb = new_df.iloc[X_test_indexes, 11:]

X_train_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
91224,-0.372383,0.221127,0.418656,-0.075037,0.278601,0.282382,-0.17906,0.063454,-0.416448,0.017981,...,-0.318357,0.0595,-0.174866,0.281088,-0.411315,-0.156737,-0.283365,-0.481805,0.567332,0.058942
117728,0.016294,-0.113769,0.293257,-0.168054,0.222634,0.146599,0.070255,0.014296,-0.186127,0.057335,...,-0.083942,-0.180034,-0.202958,0.465693,-0.490053,0.123509,-0.455105,-0.099749,0.47342,-0.061285
316017,-0.0735,0.113797,0.029153,-0.429477,0.063771,0.34738,-0.124431,0.282229,-0.075735,0.068369,...,0.054707,-0.032472,-0.121287,0.100465,-0.510452,0.222725,-0.246233,-0.182647,0.495054,0.045244
7423,0.029224,-0.239776,0.132184,0.049248,0.145883,0.229183,-0.007984,0.01491,-0.091277,-0.008269,...,0.064875,-0.315753,-0.039297,0.541282,-0.457863,0.255201,-0.281852,-0.214106,0.434917,-0.177841
201052,-0.046389,-0.126177,0.413811,-0.052786,0.227497,0.208976,0.11618,0.549721,0.018257,0.237596,...,0.318063,-0.234182,-0.21874,0.573339,-0.420712,0.340783,-0.341281,-0.248862,0.539164,-0.11174


In [65]:
embed_pipeline = Pipeline([('scaler', StandardScaler()),
                          ('clf',DummyClassifier())
                         ])



grid_dc_emb_f1 = (GridSearchCV(embed_pipeline,
                                 param_grid=parameters,
                                 scoring='f1',
                                 n_jobs=-1, 
                                 cv=3).fit(X_train_emb,y_train))

In [66]:
grid_dc_emb_f1.best_params_

{'clf__strategy': 'most_frequent'}

In [67]:
model_name = 'embedding DC most_freq'
# X_test_emb_mm = scaler.transform(X_test_emb)
compute_metrics(grid_dc_emb_f1.best_estimator_,
                model_name,
                X_train_emb,
                y_train,
                X_test_emb,
                y_test)

In [None]:
dc_emb = DummyClassifier(strategy='most_frequent').fit(X_train_emb_std, y_train)
print(dc_emb.score(X_train_emb_std, y_train))

0.5189033099771687


In [None]:
grid_dc_emb_acc = GridSearchCV(DummyClassifier(), param_grid=param_grid)
grid_dc_emb_acc.fit(X_train_emb_std,y_train)

GridSearchCV(estimator=DummyClassifier(),
             param_grid={'strategy': ['most_frequent', 'prior', 'stratified',
                                      'uniform']})

**Sentence Embeddings + Mads Features**

In [68]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, Normalizer

In [69]:
X_train_emb_mads = new_df.iloc[X_train.index].copy()
X_test_emb_mads = new_df.iloc[X_test.index].copy()

X_train_emb_mads.drop(['original_text','lemmatized_text','label'],inplace=True,axis=1)
X_test_emb_mads.drop(['original_text','lemmatized_text','label'],inplace=True,axis=1)

In [70]:
mads_pipe = Pipeline([
              ('impute_mean',SimpleImputer(strategy='mean')),
              ])

preprocessor = ColumnTransformer(transformers=[
    ('mads',mads_pipe,cols)
    ])


              
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('scaler',StandardScaler()),
                     ('clf', DummyClassifier())  # classifier
                     ])

In [71]:
grid_dc_embmads_f1 = GridSearchCV(pipeline,
                                    param_grid=parameters,
                                    scoring='f1',
                                    n_jobs=-1,
                                    cv=3)
grid_dc_embmads_f1.fit(X_train_emb_mads,y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('mads',
                                                                         Pipeline(steps=[('impute_mean',
                                                                                          SimpleImputer())]),
                                                                         ['d_chall_score',
                                                                          'aoa_mean',
                                                                          'aoa_min',
                                                                          'aoa_max',
                                                                          'conc_rating_mean',
                                                                          'conc_rating_min',
                                                                          'conc_rating_max',
  

In [72]:
grid_dc_embmads_f1.best_params_

{'clf__strategy': 'most_frequent'}

In [73]:
model_name += 'emb + Mads dc most_freq'
compute_metrics(grid_dc_embmads_f1.best_estimator_,
                model_name,
                X_train_emb_mads,y_train, X_test_emb_mads, y_test)

In [None]:
grid_dc_embmads_acc = GridSearchCV(DummyClassifier(), param_grid=param_grid)
grid_dc_embmads_acc.fit(X_train_emb_mads,y_train)

GridSearchCV(estimator=DummyClassifier(),
             param_grid={'strategy': ['most_frequent', 'prior', 'stratified',
                                      'uniform']})

In [None]:
grid_dc_embmads_acc.best_params_

{'strategy': 'most_frequent'}

In [None]:
X_test_emb_mads = np.hstack((X_test_emb_std,
                     X_test_mads_features_std))

model_name = 'embedding + Mads most_freq'
compute_metrics(grid_dc_embmads_acc.best_estimator_,
                model_name,
                X_test_emb_mads,
                y_test)

In [74]:
metrics = pd.DataFrame(metrics_df)

In [75]:
metrics

Unnamed: 0,model_name,model_instance,train_accuracy,train_precision,train_recall,train_aucprc,train_aucroc,test_accuracy,test_precision,test_recall,test_aucprc,test_aucroc
0,Tf-idf DC most_freq min_df 25,"(TfidfVectorizer(min_df=25, ngram_range=(1, 2)...",0.518903,0.518903,1.0,0.759452,0.5,0.518916,0.518916,1.0,0.759458,0.5
1,MADS DC most_freq stdsclr,"(SimpleImputer(), StandardScaler(), DummyClass...",0.518903,0.518903,1.0,0.759452,0.5,0.518916,0.518916,1.0,0.759458,0.5
2,Tf-idf + Mads DC most freq,"(ColumnTransformer(transformers=[('mads',\n ...",0.518903,0.518903,1.0,0.759452,0.5,0.518916,0.518916,1.0,0.759458,0.5
3,embedding DC most_freq,"(StandardScaler(), DummyClassifier(strategy='m...",0.518903,0.518903,1.0,0.759452,0.5,0.518916,0.518916,1.0,0.759458,0.5
4,embedding DC most_freqemb + Mads dc most_freq,"(ColumnTransformer(transformers=[('mads',\n ...",0.518903,0.518903,1.0,0.759452,0.5,0.518916,0.518916,1.0,0.759458,0.5


In [76]:
import pickle

In [77]:
best_model = metrics.iloc[0]['model_instance']

In [78]:
f = open('drive/Shareddrives/Milestone 2/dummy_clf_model.pkl','wb')
pickle.dump(best_model,f)