<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/tim-updates/notebooks/feature-ablation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, ParameterGrid, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Mount Google Drive so the dataset files under /content/drive/MyDrive can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Folder on the mounted Drive that holds the project's data files.
repo_path = '/content/drive/MyDrive/milestone-ii/'
train_path = repo_path + 'Training_set.csv'
test_path = repo_path + 'Testing_set.csv'


def _read_split(csv_path):
  """Read one tab-separated split indexed by `ix`, replacing missing lemmatized text with ''."""
  split = pd.read_csv(csv_path, sep='\t', index_col='ix')
  split['lemmatized_text'] = split['lemmatized_text'].fillna('')
  return split


df_train = _read_split(train_path)
df_test = _read_split(test_path)

# Both splits are stacked into a single frame (train rows first, then test rows).
df = pd.concat([df_train, df_test])
print(df.shape)
df.head(2)

(416768, 11)


Unnamed: 0_level_0,original_text,lemmatized_text,d_chall_score,aoa_mean,aoa_min,aoa_max,conc_rating_mean,conc_rating_min,conc_rating_max,num_lemmas,label
ix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,There is manuscript evidence that Austen conti...,there be manuscript evidence that austen conti...,8.236551,5.80931,3.57,12.12,2.495517,1.33,4.57,37.0,1
1,"In a remarkable comparative analysis , Mandaea...",in a remarkable comparative analysis mandaean ...,12.320171,7.499286,2.89,11.94,2.251429,1.46,3.77,21.0,1


In [4]:
# Numeric readability columns routed to the numeric branch of the preprocessor.
readability_features = [
    'd_chall_score',
    'aoa_mean',
    'aoa_min',
    'aoa_max',
    'conc_rating_mean',
    'conc_rating_min',
    'conc_rating_max',
]

# Kept as a bare string (not a list): ColumnTransformer then passes the
# column as a 1-D sequence of documents, which is what TfidfVectorizer expects.
text_features = 'lemmatized_text'

def build_clf(tfidf_kwargs=None, clf_kwargs=None, include_num_features=True):
  """Assemble an unfitted preprocessing + LinearSVC pipeline.

  Args:
    tfidf_kwargs: Keyword arguments forwarded to `TfidfVectorizer`. `None`
      (the default) forwards nothing.
    clf_kwargs: Keyword arguments forwarded to `LinearSVC`. `None` (the
      default) forwards nothing.
    include_num_features: When True, the columns listed in
      `readability_features` are mean-imputed, standardised, discretised
      into 4 bins and combined with the TF-IDF features. When False only
      the `text_features` column is used.

  Returns:
    An unfitted `Pipeline` with the steps `'preprocessor'` (a
    `ColumnTransformer`) and `'clf'` (a `LinearSVC`).
  """
  # None sentinels replace the former `{}` defaults: a mutable default is a
  # single object shared by every call of the function.
  tfidf_kwargs = {} if tfidf_kwargs is None else tfidf_kwargs
  clf_kwargs = {} if clf_kwargs is None else clf_kwargs

  transformers = []

  if include_num_features:
    num_pipe = Pipeline([
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
        ('binner', KBinsDiscretizer(n_bins=4)),
    ])
    # The numeric branch is listed first so its columns precede the TF-IDF ones.
    transformers.append(('num', num_pipe, readability_features))

  text_pipe = Pipeline([
      ('tfidf', TfidfVectorizer(**tfidf_kwargs)),
  ])
  transformers.append(('text', text_pipe, text_features))

  preprocessor = ColumnTransformer(transformers)

  return Pipeline([
      ('preprocessor', preprocessor),
      ('clf', LinearSVC(**clf_kwargs)),
  ])

In [9]:
# Fit the uncapped vectorizer on all rows to see how many TF-IDF columns
# min_df=50 with 1- to 3-grams produces.
full_tfidf = TfidfVectorizer(min_df=50, ngram_range=(1, 3))
full_tfidf.fit_transform(df['lemmatized_text']).shape

(416768, 29036)

In [22]:
# LinearSVC settings shared by every ablation step.
clf_kwargs = dict(dual=False, C=.5, penalty='l1', loss='squared_hinge', random_state=99)

# TF-IDF settings shared by every step; capped steps only add `max_features`.
base_tfidf_kwargs = dict(min_df=50, ngram_range=(1, 3))


def _text_only_clf(max_features=None):
  """Build a text-only pipeline, optionally capping the TF-IDF vocabulary size."""
  tfidf_kwargs = dict(base_tfidf_kwargs)
  if max_features is not None:
    tfidf_kwargs['max_features'] = max_features
  return build_clf(tfidf_kwargs=tfidf_kwargs,
                   clf_kwargs=clf_kwargs,
                   include_num_features=False)


all_feature_clf = build_clf(tfidf_kwargs=dict(base_tfidf_kwargs),
                            clf_kwargs=clf_kwargs)
text_clf = _text_only_clf()
text_5000_clf = _text_only_clf(5000)
text_2500_clf = _text_only_clf(2500)
text_1000_clf = _text_only_clf(1000)
text_500_clf = _text_only_clf(500)
text_100_clf = _text_only_clf(100)

# (pipeline, step title) pairs in the order the ablation is run.
clf_list = [
    (all_feature_clf, 'All Features (Readability + 29,036 TFIDF Features)'),
    (text_clf, 'Remove Readability Features'),
    (text_5000_clf, 'Reduce to Top 5000 TFIDF Features'),
    (text_2500_clf, 'Reduce to Top 2500 TFIDF Features'),
    (text_1000_clf, 'Reduce to Top 1000 TFIDF Features'),
    (text_500_clf, 'Reduce to Top 500 TFIDF Features'),
    (text_100_clf, 'Reduce to Top 100 TFIDF Features'),
]

In [None]:
def get_feature_ablation_results(clfs=None, data=None, target='label', n_splits=10):
  """Cross-validate every ablation step and collect the results by step title.

  Args:
    clfs: Iterable of `(estimator, step_title)` pairs. Defaults to the
      module-level `clf_list`, looked up when the function is called.
    data: DataFrame holding the feature columns and the target column.
      Defaults to the module-level `df`, looked up when the function is
      called.
    target: Name of the target column in `data`.
    n_splits: Number of folds passed to `StratifiedKFold`.

  Returns:
    Dict mapping each step title to the dict returned by `cross_validate`
    (scored with ROC AUC, accuracy and F1).
  """
  clfs = clf_list if clfs is None else clfs
  data = df if data is None else data

  # Keep the target out of the feature frame. The pipelines only pick their
  # own columns today, but leaving the label inside X would turn into label
  # leakage if a preprocessor ever passed unlisted columns through.
  X = data.drop(columns=[target])
  y = data[target]

  cv = StratifiedKFold(n_splits=n_splits)
  scoring = ['roc_auc', 'accuracy', 'f1']

  cv_result_dict = {}
  for clf, step_title in clfs:
    cv_result_dict[step_title] = cross_validate(
        clf, X, y, n_jobs=-1, scoring=scoring, cv=cv)

  return cv_result_dict

# Cross-validate every pipeline in clf_list; the results are keyed by step title.
cv_result_dict = get_feature_ablation_results()

In [None]:
import pickle  # NOTE(review): consider moving this up to the imports cell

# Store the cross-validation results in the project folder on Drive.
filename = 'feature-ablation-results'
file_path = repo_path + filename

with open(file_path, mode='wb') as results_file:
  pickle.dump(cv_result_dict, results_file)

In [None]:
# Sanity check: reload the pickled results and list the step titles they contain.
# NOTE(review): encoding='bytes' only matters for pickles written by Python 2,
# so it is probably unnecessary here — confirm before removing.
with open(file_path, mode='rb') as results_file:
  saved_results = pickle.load(results_file, encoding='bytes')
print(saved_results.keys())