<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/josh-updates/josh-AoA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [34]:
# import sys
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import clean_wiki as cw  # custom cleaning module

# installers
# !{sys.executable} -m pip install pyspark -q
# !{sys.executable} -m pip install -U spacy -q
# !{sys.executable} -m spacy download en_core_web_lg -q
# !{sys.executable} -m pip install sklearn -q

In [17]:
n = 10000  # sample size (number if > 1 else fraction)

grade_level = [   # school level by age
    (3, 4, 0, 'Foundation'),
    (4, 5, 0, 'Foundation'),
    (5, 6, 1, 'Primary'),
    (6, 7, 1, 'Primary'),
    (7, 8, 1, 'Primary'),
    (8, 9, 1, 'Primary'),
    (9, 10, 1, 'Primary'),
    (10, 11, 1, 'Primary'),
    (11, 12, 2, 'MiddleSchool'),
    (12, 13, 2, 'MiddleSchool'),
    (13, 14, 2, 'MiddleSchool'),
    (14, 15, 3, 'IGCSE'),
    (15, 16, 3, 'IGCSE'),
    (16, 17, 3, 'IB'),
    (17, 18, 3, 'IB'),
    (19, 200, 3, 'College')
]

def grade_finder(age):
  for i in grade_level:
    if i[0] <= age <= i[1]:
      return i[2]
  return -1

## Dataset

In [18]:
df = cw.process_file('WikiLarge_Train.csv')
df.head()

root
 |-- original_text: string (nullable = true)
 |-- label: string (nullable = true)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|original_text                                                                                                                                                                                                                                           |label|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|There is manuscript evidence that Austen continued to work on these pieces as late as the period 1809 â '' 11 , and that her niece and nephe

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [19]:
df_sample = df.sample(n) if n >= 1 else df.sample(frac=n)

## Lemmatize (and then some)

In [20]:
nlp = spacy.load('en_core_web_lg')
tokens = []
for doc in nlp.pipe(df_sample['original_text'], n_process=4):
  tokens.append([w.lemma_ for w in doc])

In [21]:
df_AoA = pd.read_csv('AoA_51715_words.csv', encoding= 'unicode_escape')

# load words and ages into dictionary
age = {}
for _, w in df_AoA.iterrows():
  age[w[0]] = w[10]
  if w[0] != w[1]:
    age[w[1]] = w[10]

df_AoA.sample(10)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
31249,orangutan,orang-utan,0.57,Noun,9,8,4,orangutan,7.83,0.86,7.83,0.86,,,,
23534,indolent,indolent,0.08,Adjective,8,8,3,indolent,11.36,0.61,11.36,0.61,,,,
36796,realtors,realtors,0.27,Noun,8,7,3,realtor,,,11.71,1.0,,,,
7895,cider,cider,1.9,Noun,5,4,2,cider,5.83,0.95,5.83,0.95,,8.56,,7.72
17548,fiscal,fiscal,0.76,Adjective,6,6,2,fiscal,14.89,1.0,14.89,1.0,,,,12.53
45455,takeoffs,takeoffs,0.16,Noun,8,6,2,takeoff,,,7.35,1.0,,,,
27468,maniacal,maniacal,0.53,Adjective,8,8,4,maniacal,13.29,0.94,13.29,0.94,,,,
13881,dominion,dominion,1.14,Noun,8,8,3,dominion,12.44,0.89,12.44,0.89,,,,
38110,rescuer,rescuer,0.27,Noun,7,7,3,rescuer,8.25,1.0,8.25,1.0,,,,
31486,outfitters,outfitters,0.06,Noun,10,7,3,outfitter,,,14.16,0.95,,,,


## Basic Features

In [22]:
df_sample.head()

Unnamed: 0,original_text,label
335824,Each 15 seconds that passed without the secret...,0
207180,The Tiger-Cats of the Canadian Football League...,0
276128,"You will get a warning , above the edit box , ...",0
355858,"While they were at it , some chose to change t...",0
366038,"Praeger , 2001 online version Further reading ...",0


In [23]:
# Basic stats
df_sample['num_lemmas'] = [len(s) for s in tokens]
df_sample['min_age'] = [np.nanmin([age.get(w, np.nan) for w in s]) for s in tokens]
df_sample['mean_age'] = [np.nanmean([age.get(w, np.nan) for w in s]) for s in tokens]
df_sample['max_age'] = [np.nanmax([age.get(w, np.nan) for w in s]) for s in tokens]
df_sample['num_listed'] = [len([w for w in s if age.get(w,0) > 0]) for s in tokens]
df_sample['num_unlisted'] = df_sample['num_lemmas'] - df_sample['num_listed']

# Grade level
df_sample['grade_min_age'] = df_sample['min_age'].apply(lambda x: grade_finder(x))
df_sample['grade_mean_age'] = df_sample['mean_age'].apply(lambda x: grade_finder(x))
df_sample['grade_max_age'] = df_sample['max_age'].apply(lambda x: grade_finder(x))

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


## Feature Statistics

In [24]:
means = df_sample.mean()
sdevs = df_sample.std()

mdict = {}
for m in means:
  mdict[means[means==m].index[0]] = m

sdict = {}
for s in sdevs:
  sdict[sdevs[sdevs==s].index[0]] = s

  """Entry point for launching an IPython kernel.
  


## Z-Score Features

In [25]:
for c in df_sample.columns[2:8]:
  df_sample[f'z_{c}'] = (df_sample[c] - means[c]) / sdevs[c]

df_sample.head()

Unnamed: 0,original_text,label,num_lemmas,min_age,mean_age,max_age,num_listed,num_unlisted,grade_min_age,grade_mean_age,grade_max_age,z_num_lemmas,z_min_age,z_mean_age,z_max_age,z_num_listed,z_num_unlisted
335824,Each 15 seconds that passed without the secret...,0,29,2.89,5.1095,8.21,20,9,-1,1,1,0.450132,-0.608494,-0.56945,-0.748722,0.626612,0.001193
207180,The Tiger-Cats of the Canadian Football League...,0,15,3.98,4.13,4.55,5,10,0,0,0,-0.681138,0.817037,-1.684099,-2.178923,-1.063067,0.152159
276128,"You will get a warning , above the edit box , ...",0,21,2.89,5.49875,10.33,16,5,-1,1,1,-0.196308,-0.608494,-0.126492,0.079701,0.176031,-0.602672
355858,"While they were at it , some chose to change t...",0,16,3.95,5.276429,7.79,14,2,0,1,1,-0.600333,0.777802,-0.379489,-0.912843,-0.04926,-1.05557
366038,"Praeger , 2001 online version Further reading ...",0,25,3.98,5.83,10.25,8,17,0,1,1,0.126912,0.817037,0.250463,0.04844,-0.725131,1.208921


## Feature Importance

In [26]:
# Nan's break the model
df_clean = df_sample.dropna()

In [27]:
X = df_clean[df_clean.columns[2:]]
y = df_clean['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [29]:
Lasso().get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [30]:
pipeline = Pipeline([('model',Lasso(random_state=42))])
search = GridSearchCV(pipeline,{'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error", verbose=0)
search.fit(X_train,y_train)
search.best_params_

{'model__alpha': 0.1}

In [31]:
coefficients = search.best_estimator_.named_steps['model'].coef_
importance = np.abs(coefficients)
print(importance)

[0.00775473 0.         0.         0.01428164 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]


In [32]:
np.array(df_clean.columns[2:])[importance > 0]

array(['num_lemmas', 'max_age'], dtype=object)

## Regression

In [33]:
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.67      0.57      0.62      1004
           1       0.61      0.70      0.65       968

    accuracy                           0.64      1972
   macro avg       0.64      0.64      0.63      1972
weighted avg       0.64      0.64      0.63      1972



In [35]:
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)

In [38]:
pca = PCA(n_components=X_scaled.shape[1], random_state=42)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
np.cumsum(pca.explained_variance_ratio_ * 100)

array([ 37.55513408,  65.95255979,  82.06777289,  90.16526132,
        94.54790249,  97.63377173,  98.93012519, 100.        ,
       100.        , 100.        , 100.        , 100.        ,
       100.        , 100.        , 100.        ])

In [52]:
plt.figure(figsize=(10,7))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], s=70, hue=y_train, palette=['green','blue'])
plt.show()

In [55]:
sns.set_style("darkgrid", {'axes.grid' : False})

fig = plt.figure(figsize=(10,10))

ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_pca[:,0], X_pca[:,1], X_pca[:,2], c=y_train, marker='o')
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
ax.set_zlabel('PCA3')

plt.show()