<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/dave-updates/code/dave-eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import collections
import warnings

import nltk
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_auc_score, f1_score,precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset under
# MyDrive can be read by path. force_remount=True remounts even when the drive
# is already mounted in this session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Load processed.csv from the mounted Drive folder.
# NOTE(review): hard-coded absolute Colab path — the notebook only runs with
# this exact Drive layout.
train_df = pd.read_csv('/content/drive/MyDrive/696/processed.csv')

In [4]:
# Keep only the columns used in this notebook. `.copy()` yields an independent
# frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning (assignment into a slice of the loaded frame).
train_df = train_df[['original_text','processed_text','label']].copy()
# Force every text to a unicode string. Missing values are replaced with ''
# first; casting NaN directly would produce the literal token 'nan'.
train_df['processed_text'] = train_df['processed_text'].fillna('').astype('U')

In [5]:
# Preview the first rows of the trimmed frame.
train_df.head()

Unnamed: 0,original_text,processed_text,label
0,There is manuscript evidence that Austen conti...,there is manuscript evidence that austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",in a remarkable comparative analysis mandaean...,1
2,"Before Persephone was released to Hermes , who...",before persephone was released to hermes who ...,1
3,Cogeneration plants are commonly found in dist...,cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",geneva is the second-most-populous ci...,1


In [6]:
# Bigram frequency tables, one per class; they are filled in a later cell.
easy_bigrams, diff_bigrams = collections.Counter(), collections.Counter()

In [7]:
# Small hand-picked stop-word list (plus the empty string) removed before the
# bigram counts. Built once here rather than once per word inside a lambda.
STOP_WORDS = frozenset(['', 'in', 'and', 'of', 'the', 'is', 'on', 'a'])


def tokenize(text, stop_words=STOP_WORDS):
    """Split `text` on whitespace and drop every token found in `stop_words`.

    Parameters
    ----------
    text : str
        A single processed document.
    stop_words : collection of str, optional
        Tokens to discard; defaults to the notebook's STOP_WORDS.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    return [word for word in text.split() if word not in stop_words]


train_df['token_text'] = train_df['processed_text'].apply(tokenize)
train_df.head()

Unnamed: 0,original_text,processed_text,label,token_text
0,There is manuscript evidence that Austen conti...,there is manuscript evidence that austen conti...,1,"[there, manuscript, evidence, that, austen, co..."
1,"In a remarkable comparative analysis , Mandaea...",in a remarkable comparative analysis mandaean...,1,"[remarkable, comparative, analysis, mandaean, ..."
2,"Before Persephone was released to Hermes , who...",before persephone was released to hermes who ...,1,"[before, persephone, was, released, to, hermes..."
3,Cogeneration plants are commonly found in dist...,cogeneration plants are commonly found in dist...,1,"[cogeneration, plants, are, commonly, found, d..."
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",geneva is the second-most-populous ci...,1,"[geneva, second-most-populous, city, switzerla..."


In [8]:
def CheckBiGrams(tokens,cntr):
    """Add every adjacent token pair in `tokens` to the counter `cntr`.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of a single text, in order.
    cntr : collections.Counter
        Counter updated in place; its keys are (first, second) token tuples.
    """
    cntr.update(nltk.bigrams(tokens))

# Start from empty counters so that re-running this cell does not double-count
# (the counters themselves are created in an earlier cell).
diff_bigrams.clear()
easy_bigrams.clear()

diff_train = train_df[train_df['label']==1]
easy_train = train_df[train_df['label']==0]

# Plain loops over the token column: the calls are made only for their side
# effect on the counters, so there is no need for a row-wise DataFrame.apply
# or a throwaway `_` result (which would also shadow IPython's `_`).
for tokens in diff_train['token_text']:
    CheckBiGrams(tokens, diff_bigrams)
for tokens in easy_train['token_text']:
    CheckBiGrams(tokens, easy_bigrams)

In [9]:
# 50 most common bigrams in the difficult texts (label == 1), shown as
# (bigram, count) pairs from most to least frequent
diff_bigrams.most_common(50)

[(('united', 'states'), 5610),
 (('known', 'as'), 5173),
 (('it', 'was'), 4222),
 (('to', 'be'), 3977),
 (('such', 'as'), 3766),
 (('he', 'was'), 3423),
 (('an', 'american'), 2521),
 (('as', 'well'), 2150),
 (('has', 'been'), 2128),
 (('can', 'be'), 2056),
 (('was', 'an'), 1935),
 (('well', 'as'), 1917),
 (('northern', 'france'), 1795),
 (('th', 'century'), 1789),
 (('also', 'known'), 1630),
 (('from', 'to'), 1630),
 (('new', 'york'), 1600),
 (('have', 'been'), 1587),
 (('was', 'born'), 1573),
 (('referred', 'to'), 1541),
 (('region', 'france'), 1460),
 (('for', 'his'), 1440),
 (('according', 'to'), 1409),
 (('football', 'player'), 1396),
 (('pas-de-calais', 'department'), 1384),
 (('commune', 'pas-de-calais'), 1382),
 (('there', 'are'), 1356),
 (('they', 'are'), 1354),
 (('as', 'an'), 1318),
 (('used', 'to'), 1309),
 (('due', 'to'), 1295),
 (('nord-pas-de-calais', 'region'), 1286),
 (('department', 'nord-pas-de-calais'), 1282),
 (('known', 'for'), 1282),
 (('to', 'as'), 1251),
 (('was

In [10]:
# 50 most common bigrams in the easy-to-understand texts (label == 0), shown as
# (bigram, count) pairs from most to least frequent
easy_bigrams.most_common(50)

[(('it', 'found'), 5362),
 (('found', 'region'), 5190),
 (('united', 'states'), 4985),
 (('it', 'was'), 4977),
 (('he', 'was'), 3915),
 (('football', 'player'), 3374),
 (('to', 'be'), 3019),
 (('known', 'as'), 2751),
 (('north', 'france'), 2320),
 (('department', 'north'), 2313),
 (('such', 'as'), 2295),
 (('can', 'be'), 2030),
 (('commune', 'it'), 1994),
 (('there', 'are'), 1847),
 (('an', 'american'), 1842),
 (('they', 'are'), 1622),
 (('was', 'an'), 1524),
 (('has', 'been'), 1496),
 (('was', 'born'), 1470),
 (('was', 'first'), 1393),
 (('new', 'york'), 1367),
 (('it', 'has'), 1334),
 (('used', 'to'), 1324),
 (('northwest', 'france'), 1284),
 (('department', 'northwest'), 1282),
 (('pas-de-calais', 'department'), 1240),
 (('th', 'century'), 1238),
 (('region', 'nord-pas-de-calais'), 1227),
 (('nord-pas-de-calais', 'pas-de-calais'), 1227),
 (('have', 'been'), 1145),
 (('from', 'to'), 1110),
 (('as', 'well'), 1095),
 (('de', 'la'), 1082),
 (('aisne', 'department'), 1075),
 (('region', 

In [11]:
# Compare the mean number of tokens per text between the easy (label 0) and
# difficult (label 1) classes.
tokens_per_text = train_df['token_text'].str.len()
tokens_per_text.groupby(train_df['label']).mean()

label
0    11.765145
1    16.005403
Name: token_text, dtype: float64

In [12]:
# There's a clear difference in length: easy-to-understand texts average about 11.8 tokens, while the more difficult texts average about 16 tokens

In [13]:
# Evaluating what "features" are most important in our classification task...exploring what data we get back for tf-idf vectorizers

In [14]:
# tf-idf feature extraction settings, collected in one place for readability.
tfidf_settings = {
    'min_df': 20,             # drop terms found in fewer than 20 documents
    'max_df': .995,           # drop terms found in more than 99.5% of documents
    'max_features': 5000,     # cap the vocabulary at 5000 terms
    'ngram_range': (1, 3),    # unigrams, bigrams and trigrams
    'stop_words': 'english',  # scikit-learn's built-in English stop-word list
}
vectorizer = TfidfVectorizer(**tfidf_settings)

In [15]:
# Fit the tf-idf vectorizer on every row of train_df (no hold-out yet) and
# vectorize it; this matrix is what the mutual-information ranking below uses.
vec_data = vectorizer.fit_transform(train_df.processed_text)

In [16]:
# Target vector: the label column (1 = difficult, 0 = easy to understand).
y_labels = train_df.label

In [17]:
# Using mutual_info_classif to score each tf-idf feature against the label over
# the entire training set (vec_data was fit on all rows, not on a train split).
# NOTE(review): discrete_features=True treats the tf-idf weights as discrete
# values — confirm that this is the intended treatment for this matrix.
#
# Warnings are silenced only inside this block, so later cells still surface
# theirs (a global filterwarnings('ignore') would hide them for the whole run).
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    feature_names = (vectorizer.get_feature_names_out()
                     if hasattr(vectorizer, 'get_feature_names_out')
                     else vectorizer.get_feature_names())
    # Map each feature name to its mutual-information score.
    result = dict(zip(feature_names,
                      mutual_info_classif(vec_data,
                                          y_labels,
                                          discrete_features=True)))

In [18]:
# Order the (feature, score) pairs by mutual-information score, lowest first.
sorted_result = sorted(result.items(), key=lambda item: item[1])

In [19]:
# Top 20 features (words / n-grams) most informative for classification.
# The list is sorted ascending, so the strongest feature is printed last.
sorted_result[-20:]

[('time', 0.012053018296130796),
 ('people', 0.012232722733924437),
 ('south', 0.012259152842987195),
 ('football', 0.014174188345892378),
 ('called', 0.014542851377924122),
 ('united states', 0.015462789170423914),
 ('world', 0.015636974477164053),
 ('american', 0.015675827779041106),
 ('used', 0.015823359744921977),
 ('north', 0.016469491454107817),
 ('new', 0.016631391644847573),
 ('states', 0.01796370503739088),
 ('region', 0.019575067111825103),
 ('commune', 0.019755155277451853),
 ('united', 0.020034728942348038),
 ('known', 0.0213453263959197),
 ('city', 0.02212504588623921),
 ('department', 0.022516699950628324),
 ('france', 0.026594631233516845),
 ('born', 0.028090085442300523)]

In [20]:
# Hold out 25% of the texts for evaluation. The seed is fixed so the split —
# and every metric computed from it below — is identical on each
# Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(train_df.processed_text, y_labels, test_size=.25, random_state=SPLIT_SEED)

In [21]:
# Refit the same tf-idf vectorizer on the training split only, so the learned
# vocabulary no longer includes the test rows.
# NOTE: this rebinds vec_data, which previously held the full-dataset matrix.
vec_data = vectorizer.fit_transform(X_train)

In [22]:
# Multinomial naive Bayes trained on the tf-idf training matrix.
mnb = MultinomialNB().fit(vec_data, y_train)

In [23]:
# Accuracy on the training data itself (not a held-out estimate).
mnb.score(vec_data, y_train)

0.6523661445536445

In [24]:
# Score the tf-idf model on the held-out split.
X_test_vec = vectorizer.transform(X_test)
y_predict = mnb.predict(X_test_vec)

accuracy = mnb.score(X_test_vec, y_test)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)

# Printed on one line, space-separated, in the order accuracy / recall / precision.
metrics_report = {'accuracy': accuracy, 'recall': recall, 'precision': precision}
print(*(f'{metric_name}: {metric_value}'
        for metric_name, metric_value in metrics_report.items()))



accuracy: 0.6354614557739557 recall: 0.6507933473523226 precision: 0.6332638863052941


In [25]:
# Using CountVectorizer instead of tf-idf

In [26]:
# Raw-count feature extraction settings, collected in one place for readability.
count_settings = {
    'min_df': 20,             # drop terms found in fewer than 20 documents
    'max_df': .995,           # drop terms found in more than 99.5% of documents
    'max_features': 5000,     # cap the vocabulary at 5000 terms
    'ngram_range': (1, 3),    # unigrams, bigrams and trigrams
    'stop_words': 'english',  # scikit-learn's built-in English stop-word list
}
cnt_vectorizer = CountVectorizer(**count_settings)

In [27]:
# Learn the count vocabulary from the training split and vectorize it.
X_train_cnt_vec = cnt_vectorizer.fit_transform(X_train)

In [28]:
# Fit naive Bayes on the raw term counts.
# NOTE: this rebinds mnb, replacing the tf-idf model fitted earlier.
mnb = MultinomialNB().fit(X_train_cnt_vec , y_train)

In [29]:
# Accuracy of the count-based model on its own training data.
mnb.score(X_train_cnt_vec, y_train)

0.6428324631449631

In [30]:
# Score the count-based model on the held-out split.
X_test_cnt_vec = cnt_vectorizer.transform(X_test)
y_predict = mnb.predict(X_test_cnt_vec)

accuracy = mnb.score(X_test_cnt_vec, y_test)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)

# Printed on one line, space-separated, in the order accuracy / recall / precision.
metrics_report = {'accuracy': accuracy, 'recall': recall, 'precision': precision}
print(*(f'{metric_name}: {metric_value}'
        for metric_name, metric_value in metrics_report.items()))


accuracy: 0.6274761977886978 recall: 0.6724526859109157 precision: 0.6186858027297031


In [31]:
# There appears to be a slight improvement in recall when using the CountVectorizer vs. the tf-idf vectorizer, at the cost of slightly lower accuracy and precision in this run