<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/tim-updates/tim-lr-pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the CSV paths used further down
# in this notebook all live under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [52]:
import pandas as pd

train_path = '/content/drive/MyDrive/milestone-ii/Training_set.csv'
test_path = '/content/drive/MyDrive/milestone-ii/Testing_set.csv'

# Both splits are read the same way: tab-separated values, with the first
# column of the file used as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, sep='\t', index_col=0)
    for csv_path in (train_path, test_path)
)

# Show a single row as a quick sanity check of the columns that were loaded.
df_train.head(1)

Unnamed: 0,lemmatized_text,d_chall_score,aoa_mean,aoa_min,aoa_max,conc_rating_mean,conc_rating_min,conc_rating_max,num_lemmas,label
91224,For instance the number of k-permutations of n...,4.5427,5.14,3.94,9.57,2.5775,1.43,4.55,12.0,1


In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, KBinsDiscretizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [140]:
# Numeric columns routed to the numeric branch of the preprocessor,
# grouped by feature family (order is significant: it determines the
# column order of the transformed matrix).
num_features = [
    'd_chall_score',
    'aoa_mean', 'aoa_min', 'aoa_max',
    'conc_rating_mean', 'conc_rating_min', 'conc_rating_max',
    'num_lemmas',
]

# Raw-text column routed to the tf-idf branch of the preprocessor.
text_features = ['lemmatized_text']

def get_text_list(x):
  """Return the first column of the DataFrame ``x`` as a 1-d array.

  The column transformer hands the text branch a 2-d, single-column frame;
  this flattens it so each element is one document.
  """
  as_matrix = x.values
  first_column = as_matrix[:, 0]
  return first_column
# Wrapping the plain function in a FunctionTransformer gives it the
# fit/transform interface that a sklearn Pipeline step must have.
list_trf = FunctionTransformer(func=get_text_list)

# Text branch: flatten the single text column from 2-d to 1-d (which is the
# shape the tf-idf vectorizer expects), then build tf-idf features from
# unigrams up to trigrams.
text_pipe = Pipeline(steps=[
    ('select', list_trf),
    ('tfidf', TfidfVectorizer(ngram_range=(1, 3), stop_words='english', min_df=25)),
])

# Numeric branch: mean-impute, standardize, then discretize every column
# into (at most) 4 bins. The binning is an experiment meant to help the
# solver converge when dense numeric columns are mixed with sparse tf-idf
# columns.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
    ('binner', KBinsDiscretizer(n_bins=4)),
])

# Only the listed columns reach the model; with ColumnTransformer's default
# remainder='drop' every other column of the input frame (including
# `label`) is discarded.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, num_features),
    ('text', text_pipe, text_features),
])

# Build the full pipeline and fit it on the training split in one go;
# fit() returns the fitted pipeline itself.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(n_jobs=-1)),
]).fit(df_train, df_train['label'])

  "decreasing the number of bins." % jj


In [141]:
from sklearn import metrics

# Hard class predictions and the predicted probability of class 1
# (second column of predict_proba) on the held-out test split.
y_hat = clf.predict(df_test)
y_proba = clf.predict_proba(df_test)[:, 1]

accuracy = metrics.accuracy_score(y_true=df_test['label'], y_pred=y_hat)
f1 = metrics.f1_score(y_true=df_test['label'], y_pred=y_hat)
# ROC AUC is computed from the probabilities, not the hard predictions.
roc_auc = metrics.roc_auc_score(y_true=df_test['label'], y_score=y_proba)

for caption, score in (('Accuracy Score:', accuracy),
                       ('F1 Score:', f1),
                       ('ROC AUC:', roc_auc)):
    print(caption, score)

Accuracy Score: 0.6839841081053724
F1 Score: 0.7013868962219034
ROC AUC: 0.7544625319021621


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Parameters of a pipeline step are addressed as
# "<step name>__<parameter name>"; here `clf__C` tunes the `C` parameter
# of the pipeline step named `clf` (the logistic regression).
param_grid = {'clf__C': [10, 50, 100]}

# 5-fold stratified cross-validation, candidates ranked by ROC AUC.
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
    verbose=3,
)

grid.fit(df_train, df_train['label'])

In [108]:
# Parameter setting that scored best (by ROC AUC, the `scoring` used for
# the search) among the candidates tried by the grid search fitted above.
grid.best_params_

{'clf__C': 10}

In [155]:
# Recover the output column names of each fitted preprocessor branch,
# looking the branches up by the names they were registered under
# ('num' and 'text') rather than by position.
num_features_trf = (
    clf['preprocessor']
    .named_transformers_['num']['binner']
    .get_feature_names_out(num_features)
)

text_features_trf = (
    clf['preprocessor']
    .named_transformers_['text']['tfidf']
    .get_feature_names_out()
)

# Same order as the columns of the transformed matrix: the numeric branch
# is listed first in the ColumnTransformer, the text branch second.
all_features_trf = [*num_features_trf, *text_features_trf]

In [162]:
import numpy as np

# One row per transformed feature: its name, the first row of the fitted
# logistic-regression coefficient matrix, and that coefficient's magnitude.
df_coef = (
    pd.DataFrame({'feature': all_features_trf,
                  'coef': clf['clf'].coef_[0]})
    .assign(abscoef=lambda frame: np.abs(frame['coef']))
)

# The 10 features with the largest absolute coefficient.
df_coef.sort_values(by='abscoef', ascending=False).head(10)

Unnamed: 0,feature,coef,abscoef
14686,ndash,14.151836,14.151836
8971,footballer,7.075945,7.075945
5652,commune region,-6.457823,6.457823
6800,department north france,-6.166001,6.166001
15057,north france,-6.103729,6.103729
22800,western france,5.948519,5.948519
15095,northern france,5.638217,5.638217
6133,county iowa united,5.56121,5.56121
8955,football player,-5.561145,5.561145
13049,lot,-5.407976,5.407976


In [163]:
# How much weight do the binned numeric (MADS) features carry? Keep only the
# rows whose feature name came from the numeric branch and rank them by
# absolute coefficient.
(
    df_coef
    .loc[lambda frame: frame['feature'].isin(num_features_trf)]
    .sort_values(by='abscoef', ascending=False)
    .head(10)
)

Unnamed: 0,feature,coef,abscoef
27,num_lemmas_0.0,-0.5105,0.5105
30,num_lemmas_3.0,0.3782,0.3782
0,d_chall_score_0.0,-0.374541,0.374541
3,d_chall_score_3.0,0.319486,0.319486
12,aoa_max_0.0,-0.28999,0.28999
15,aoa_max_3.0,0.187149,0.187149
4,aoa_mean_0.0,-0.179706,0.179706
29,num_lemmas_2.0,0.154688,0.154688
7,aoa_mean_3.0,0.118883,0.118883
2,d_chall_score_2.0,0.113302,0.113302
