<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/josh-updates/josh-AoA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [58]:
# import sys
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import clean_wiki as cw  # custom cleaning module

# installers
# !{sys.executable} -m pip install pyspark -q
# !{sys.executable} -m pip install -U spacy -q
# !{sys.executable} -m spacy download en_core_web_lg -q
# !{sys.executable} -m pip install sklearn -q

In [59]:
n = 10000  # sample size: row count if >= 1, otherwise a fraction of the rows

# School level by age.
# Each entry is (min_age, max_age, grade_code, grade_name). Both bounds are
# inclusive and the first matching entry wins, so an age sitting on a shared
# boundary (e.g. 5) resolves to the earlier band.
grade_level = [
    (3, 4, 0, 'Foundation'),
    (4, 5, 0, 'Foundation'),
    (5, 6, 1, 'Primary'),
    (6, 7, 1, 'Primary'),
    (7, 8, 1, 'Primary'),
    (8, 9, 1, 'Primary'),
    (9, 10, 1, 'Primary'),
    (10, 11, 1, 'Primary'),
    (11, 12, 2, 'MiddleSchool'),
    (12, 13, 2, 'MiddleSchool'),
    (13, 14, 2, 'MiddleSchool'),
    (14, 15, 3, 'IGCSE'),
    (15, 16, 3, 'IGCSE'),
    (16, 17, 3, 'IB'),
    (17, 18, 3, 'IB'),
    # Starts at 18 (not 19) so that ages strictly between 18 and 19 are not
    # left without a band.
    (18, 200, 3, 'College'),
]

def grade_finder(age):
  """Map an age in years to its ordinal grade code.

  Args:
    age: numeric age; may be NaN.

  Returns:
    The grade code of the first band in `grade_level` whose inclusive range
    contains `age`, or -1 when no band matches (ages below 3, ages above
    200, and NaN, for which every comparison is False).
  """
  for low, high, code, _name in grade_level:
    if low <= age <= high:
      return code
  return -1

## Dataset

In [60]:
# Read the training CSV through the custom clean_wiki module (imported as cw)
# and preview the first rows of the result.
df = cw.process_file('WikiLarge_Train.csv')
df.head()

root
 |-- original_text: string (nullable = true)
 |-- label: string (nullable = true)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|original_text                                                                                                                                                                                                                                           |label|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|There is manuscript evidence that Austen continued to work on these pieces as late as the period 1809 â '' 11 , and that her niece and nephe

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [61]:
# Draw the working sample: n >= 1 is treated as a row count, n < 1 as a
# fraction of the rows. The seed is fixed so that a fresh run of the notebook
# selects the same rows (and therefore reproduces the same downstream numbers).
df_sample = (
  df.sample(n, random_state=42) if n >= 1
  else df.sample(frac=n, random_state=42)
)

## Lemmatize and Load Age-of-Acquisition (AoA) Norms

In [62]:
# Run every sampled sentence through the large English spaCy pipeline
# (4 worker processes) and keep the lemma of each token.
# `tokens` ends up as one list of lemma strings per row of df_sample.
nlp = spacy.load('en_core_web_lg')
tokens = [
  [tok.lemma_ for tok in doc]
  for doc in nlp.pipe(df_sample['original_text'], n_process=4)
]

In [63]:
df_AoA = pd.read_csv('AoA_51715_words.csv', encoding= 'unicode_escape')

# Load words and ages into a dictionary: {spelling: lemma-level AoA}.
# Columns are addressed by name rather than by position inside each row
# (integer keys on a label-indexed row are deprecated in recent pandas), and
# the three columns are walked in parallel instead of building a Series per
# row with iterrows(). Rows are visited in file order, so a later row still
# overwrites an earlier one that shares a spelling.
age = {}
for word, alt_spelling, aoa in zip(df_AoA['Word'],
                                   df_AoA['Alternative.spelling'],
                                   df_AoA['AoA_Kup_lem']):
  age[word] = aoa
  if word != alt_spelling:
    # register the alternative spelling under the same age
    age[alt_spelling] = aoa

df_AoA.sample(10)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
33748,pistons,pistons,0.92,Noun,7,7,2,piston,,,10.37,1.0,,,,12.53
19522,gifts,gifts,14.35,Noun,5,5,1,gift,,,5.05,1.0,,5.24,4.79,
38975,rompers,rompers,0.06,Noun,7,6,2,romper,,,13.28,0.95,,,,
15704,epidemiological,epidemiological,0.04,Adjective,15,15,8,epidemiological,15.33,0.95,15.33,0.95,,,,
28869,misunderstand,misunderstand,3.29,Verb,13,12,4,misunderstand,8.52,1.0,8.52,1.0,,,,
49339,vacancy,vacancy,0.88,Noun,7,7,3,vacancy,8.05,1.0,8.05,1.0,,,,
42294,snorer,snorer,0.02,Noun,6,5,2,snorer,7.79,1.0,7.79,1.0,,,,
12379,derma,derma,0.1,Noun,5,4,2,derma,13.93,0.83,13.93,0.83,,,,
1338,amends,amends,1.84,Noun,6,6,2,amends,10.16,1.0,10.16,1.0,,,,
43684,stellar,stellar,0.8,Adjective,7,5,2,stellar,11.53,1.0,11.53,1.0,,,,11.49


## Basic Features

In [64]:
# Preview the first rows of the sampled frame.
df_sample.head()

Unnamed: 0,original_text,label
25699,"In 2006 , her alma mater , Ku-ring-gai High Sc...",1
90632,He became -LRB- after 1787 -RRB- a student of ...,1
216968,God gave many laws to the Israelites through M...,0
379788,1770 - 14-year old Marie Antoinette marries 15...,0
145262,"Gregorian chant was organized , codified , and...",1


In [65]:
# Basic stats
def _aoa_stats(lemmas):
  """Summarise the age-of-acquisition values of one lemmatised sentence.

  Args:
    lemmas: iterable of lemma strings, looked up in the `age` dictionary.

  Returns:
    A tuple (min_age, mean_age, max_age, num_listed). The three ages ignore
    lemmas without a usable age and are NaN when the sentence has none
    (including an empty sentence); num_listed counts lemmas whose age is > 0.
  """
  ages = np.array([age.get(w, np.nan) for w in lemmas], dtype=float)
  known = ages[~np.isnan(ages)]
  if known.size == 0:
    # np.nanmin / np.nanmax raise on empty input, and all three nan-reducers
    # emit a RuntimeWarning on all-NaN input; return NaN explicitly instead.
    return np.nan, np.nan, np.nan, 0
  return np.nanmin(ages), np.nanmean(ages), np.nanmax(ages), int((known > 0).sum())

# One dictionary pass per sentence instead of one per feature.
stats = [_aoa_stats(s) for s in tokens]

df_sample['num_lemmas'] = [len(s) for s in tokens]
df_sample['min_age'] = [st[0] for st in stats]
df_sample['mean_age'] = [st[1] for st in stats]
df_sample['max_age'] = [st[2] for st in stats]
df_sample['num_listed'] = [st[3] for st in stats]
df_sample['num_unlisted'] = df_sample['num_lemmas'] - df_sample['num_listed']

# Grade level (grade_finder returns -1 for NaN or out-of-range ages)
df_sample['grade_min_age'] = df_sample['min_age'].apply(grade_finder)
df_sample['grade_mean_age'] = df_sample['mean_age'].apply(grade_finder)
df_sample['grade_max_age'] = df_sample['max_age'].apply(grade_finder)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


## Feature Statistics

In [66]:
# Column-wise mean and sample standard deviation. numeric_only=True skips the
# string column (original_text), which otherwise triggers a warning on older
# pandas and a TypeError on pandas >= 2.
means = df_sample.mean(numeric_only=True)
sdevs = df_sample.std(numeric_only=True)

# Plain {column name: statistic} dictionaries. to_dict() keeps one entry per
# column even when two columns share the same value or a value is NaN, which
# a reverse lookup by value cannot do.
mdict = means.to_dict()
sdict = sdevs.to_dict()

  """Entry point for launching an IPython kernel.
  


## Z-Score Features

In [67]:
# Standardise the six basic features as (value - mean) / standard deviation.
# The columns are named explicitly rather than taken as a positional slice,
# so the selection does not silently shift if the frame's layout changes.
z_columns = ['num_lemmas', 'min_age', 'mean_age', 'max_age', 'num_listed', 'num_unlisted']
for c in z_columns:
  df_sample[f'z_{c}'] = (df_sample[c] - means[c]) / sdevs[c]

df_sample.head()

Unnamed: 0,original_text,label,num_lemmas,min_age,mean_age,max_age,num_listed,num_unlisted,grade_min_age,grade_mean_age,grade_max_age,z_num_lemmas,z_min_age,z_mean_age,z_max_age,z_num_listed,z_num_unlisted
25699,"In 2006 , her alma mater , Ku-ring-gai High Sc...",1,21,3.57,4.501429,6.0,7,14,0,0,1,-0.183434,0.283971,-1.298178,-1.573985,-0.809459,0.760749
90632,He became -LRB- after 1787 -RRB- a student of ...,1,13,2.89,4.675714,6.0,7,6,-1,0,1,-0.827485,-0.666035,-1.09333,-1.573985,-0.809459,-0.455015
216968,God gave many laws to the Israelites through M...,0,10,3.95,5.16,8.33,6,4,0,1,1,-1.069004,0.814857,-0.524121,-0.670369,-0.920581,-0.758956
379788,1770 - 14-year old Marie Antoinette marries 15...,0,24,3.72,4.813,5.85,10,14,0,0,1,0.058085,0.493531,-0.93197,-1.632158,-0.476092,0.760749
145262,"Gregorian chant was organized , codified , and...",1,48,3.37,6.062778,16.65,36,12,0,1,3,1.990239,0.004558,0.536966,2.556277,2.413086,0.456808


## Feature Importance

In [68]:
# NaNs break the model, so drop every row that has a NaN in any column.
df_clean = df_sample.dropna()

In [69]:
# Features: every column from position 2 onwards. Target: the `label` column.
X = df_clean.iloc[:, 2:]
y = df_clean.loc[:, 'label']

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
  X, y,
  test_size=0.2,
  random_state=42,
)

In [70]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [71]:
# List the parameter names a default Lasso estimator exposes.
Lasso().get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [72]:
# Tune the Lasso penalty `alpha` over 0.1, 0.2, ... with 5-fold
# cross-validation, scored by negative mean squared error.
pipeline = Pipeline(steps=[('model', Lasso(random_state=42))])
search = GridSearchCV(
  estimator=pipeline,
  param_grid={'model__alpha': np.arange(0.1, 10, 0.1)},
  cv=5,
  scoring="neg_mean_squared_error",
  verbose=0,
)
search.fit(X_train, y_train)
search.best_params_

{'model__alpha': 0.1}

In [73]:
# Take the fitted Lasso step from the best pipeline found by the grid search
# and use the absolute value of each coefficient as that feature's importance.
coefficients = search.best_estimator_.named_steps['model'].coef_
importance = np.abs(coefficients)
print(importance)

[0.00643487 0.         0.         0.01120401 0.         0.00203461
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]


In [74]:
# Names of the feature columns whose Lasso importance is non-zero.
df_clean.columns[2:].to_numpy()[importance > 0]

array(['num_lemmas', 'max_age', 'num_unlisted'], dtype=object)

## Logistic Regression and PCA

In [75]:
# Fit a logistic-regression classifier on the training split and report
# precision / recall / F1 on the held-out split.
lr = LogisticRegression(max_iter=10000).fit(X_train, y_train)
pred = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.62      0.56      0.59       956
           1       0.62      0.67      0.64      1014

    accuracy                           0.62      1970
   macro avg       0.62      0.61      0.61      1970
weighted avg       0.62      0.62      0.61      1970



In [76]:
# Fit the standardiser on the training features, then apply it to them.
scaler = StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)

In [77]:
# Keep as many principal components as there are scaled features, project the
# training data onto them, and show the cumulative explained variance in %.
pca = PCA(n_components=X_scaled.shape[1], random_state=42).fit(X_scaled)
X_pca = pca.transform(X_scaled)
(pca.explained_variance_ratio_ * 100).cumsum()

array([ 38.34544673,  65.91754406,  82.27511743,  90.4744043 ,
        94.74181632,  97.73826903,  98.98298085, 100.        ,
       100.        , 100.        , 100.        , 100.        ,
       100.        , 100.        , 100.        ])

In [78]:
# Training points projected onto the first two principal components,
# coloured by label. Axis labels and a title are set so the figure can be
# read on its own, matching the labelled 3-D plot of the same projection.
fig, ax = plt.subplots(figsize=(10,7))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], s=70, hue=y_train,
                palette=['green','blue'], ax=ax)
ax.set(xlabel='PCA1', ylabel='PCA2',
       title='Training data on the first two principal components')
plt.show()

In [79]:
# 3-D view of the training points on the first three principal components,
# coloured by label, drawn with the darkgrid style but grid lines switched off.
sns.set_style("darkgrid", {'axes.grid' : False})

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# Transposing the first three columns yields the x, y and z coordinate vectors.
ax.scatter(*X_pca[:, :3].T, c=y_train, marker='o')
ax.set(xlabel='PCA1', ylabel='PCA2', zlabel='PCA3')

plt.show()