<a href="https://colab.research.google.com/github/Tstrebe2/predicting-text-difficulty/blob/josh-updates/josh-AoA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [111]:
# import sys
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
import clean_wiki as cw  # custom cleaning module

# installers
# !{sys.executable} -m pip install pyspark -q
# !{sys.executable} -m pip install -U spacy -q
# !{sys.executable} -m spacy download en_core_web_lg -q
# !{sys.executable} -m pip install sklearn -q

In [None]:
n = 10000  # sample size handed to df.sample() when df_sample is built

## Dataset

In [112]:
# Read the training file through the project's cleaning module (imported as cw).
df = cw.process_file('WikiLarge_Train.csv')
# Preview the first rows of the result.
df.head()

root
 |-- original_text: string (nullable = true)
 |-- label: string (nullable = true)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|original_text                                                                                                                                                                                                                                           |label|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|There is manuscript evidence that Austen continued to work on these pieces as late as the period 1809 â '' 11 , and that her niece and nephe

Unnamed: 0,original_text,label
0,There is manuscript evidence that Austen conti...,1
1,"In a remarkable comparative analysis , Mandaea...",1
2,"Before Persephone was released to Hermes , who...",1
3,Cogeneration plants are commonly found in dist...,1
4,"Geneva -LRB- , ; , ; , ; ; -RRB- is the second...",1


In [113]:
# Draw the working sample.
#   * n below 1  -> n is a fraction of the rows (df.sample(frac=...)).
#   * otherwise  -> n is a row count (df.sample(n=...)), capped at the number
#                   of rows available so a small dataset cannot make it fail.
# The original had the two branches swapped, so a count was passed as `frac`
# and a fraction as `n`. A fixed seed keeps the sample stable across reruns.
df_sample = (
  df.sample(frac=n, random_state=42)
  if n < 1
  else df.sample(n=min(int(n), len(df)), random_state=42)
)

## Lemmatize (and then some)

In [114]:
# Load the large English spaCy pipeline, then collect one list of lemmas per
# sampled sentence (every token the pipeline yields contributes its lemma).
nlp = spacy.load('en_core_web_lg')
tokens = [
  [tok.lemma_ for tok in parsed]
  for parsed in nlp.pipe(df_sample['original_text'])
]

In [115]:
# Load the age-of-acquisition (AoA) word list.
df_AoA = pd.read_csv('AoA_51715_words.csv', encoding='unicode_escape')

# Build a word -> age lookup from three columns selected by position.
# Positions 0, 1 and 10 are the ones the original row[0] / row[1] / row[10]
# lookups used; in the sample displayed below they appear to be Word,
# Alternative.spelling and AoA_Kup_lem -- TODO confirm against the CSV header.
# .iloc makes the positional intent explicit (integer keys on a string-labelled
# row rely on deprecated fallback behaviour) and zip avoids building a Series
# per row the way iterrows() does.
primary_words = df_AoA.iloc[:, 0]
alternate_words = df_AoA.iloc[:, 1]
acquisition_ages = df_AoA.iloc[:, 10]

age = {}
for word, alternate, acquired_at in zip(primary_words, alternate_words, acquisition_ages):
  age[word] = acquired_at
  # Register the alternative spelling too when it differs from the main one.
  if word != alternate:
    age[alternate] = acquired_at

df_AoA.sample(10)

Unnamed: 0,Word,Alternative.spelling,Freq_pm,Dom_PoS_SUBTLEX,Nletters,Nphon,Nsyll,Lemma_highest_PoS,AoA_Kup,Perc_known,AoA_Kup_lem,Perc_known_lem,AoA_Bird_lem,AoA_Bristol_lem,AoA_Cort_lem,AoA_Schock
964,aircraftsman,aircraftsman,0.02,Noun,12,12,4,aircraftsman,11.67,0.95,11.67,0.95,,,,
29707,mutter,mutter,0.43,Verb,6,4,2,mutter,9.5,1.0,9.5,1.0,,,,
21731,hiking,hiking,2.65,Verb,6,5,2,hike,,,9.4,1.0,7.96,,6.6,
8919,commodity,commodity,1.65,Noun,9,8,4,commodity,13.53,1.0,13.53,1.0,,,,
32539,pasting,pasting,0.22,Verb,7,6,2,paste,,,4.84,1.0,,6.41,5.82,
20511,guardianship,guardianship,0.49,Noun,12,10,4,guardianship,9.48,1.0,9.48,1.0,,,,
12071,deliverymen,deliverymen,0.0,,11,11,5,deliveryman,,,7.76,0.94,,,,
21707,highness,highness,15.82,Noun,8,5,2,highness,7.35,1.0,7.35,1.0,,,,
38540,reverts,reverts,0.12,Verb,7,6,2,revert,,,13.11,1.0,11.63,,,10.9
3017,backtracking,backtracking,0.18,Verb,12,9,3,backtrack,,,11.75,1.0,,,,


## Basic Features

In [116]:
def _nan_safe(values, reducer):
  """Apply a NaN-ignoring reducer, returning NaN when there is nothing to reduce.

  `values` is a 1-D float array of ages (NaN = word not in the AoA list).
  np.nanmin / np.nanmax raise ValueError on an empty array and all three
  reducers emit a RuntimeWarning on an all-NaN array; both cases simply mean
  "no known word in this sentence", so they are mapped to NaN quietly.
  """
  if np.isnan(values).all():  # also True for an empty array
    return np.nan
  return reducer(values)


# Look each lemma up once per sentence instead of once per feature.
sentence_ages = [
  np.array([age.get(w, np.nan) for w in s], dtype=float) for s in tokens
]

df_sample['num_lemmas'] = [len(s) for s in tokens]
df_sample['min_age'] = [_nan_safe(a, np.nanmin) for a in sentence_ages]
df_sample['mean_age'] = [_nan_safe(a, np.nanmean) for a in sentence_ages]
df_sample['max_age'] = [_nan_safe(a, np.nanmax) for a in sentence_ages]
# A lemma counts as "listed" when it has a known, positive age; NaN entries are
# filtered out first so the comparison never touches a NaN.
df_sample['num_listed'] = [
  int(np.count_nonzero(a[~np.isnan(a)] > 0)) for a in sentence_ages
]
df_sample['num_unlisted'] = df_sample['num_lemmas'] - df_sample['num_listed']

  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


## Feature Statistics

In [None]:
# Per-column mean and standard deviation. numeric_only=True skips the text
# column explicitly; without it newer pandas raises TypeError instead of
# silently dropping non-numeric columns.
means = df_sample.mean(numeric_only=True)
sdevs = df_sample.std(numeric_only=True)

# Plain dicts keyed by column name. The previous reverse lookup by value
# (`means[means == m].index[0]`) returned the wrong column whenever two columns
# shared a value and raised IndexError when a statistic was NaN.
mdict = means.to_dict()
sdict = sdevs.to_dict()

## Z-Score Features

In [None]:
# Standardise every feature column (everything after the first two columns)
# into a companion `z_<name>` column using the statistics computed earlier.
# Columns that are already z-scores are skipped so the cell can be re-run
# safely: otherwise a second run would look up means['z_...'] and fail.
feature_cols = [c for c in df_sample.columns[2:] if not str(c).startswith('z_')]
for c in feature_cols:
  df_sample[f'z_{c}'] = (df_sample[c] - means[c]) / sdevs[c]

df_sample.head()

## Feature Importance

In [None]:
# The models cannot handle NaNs, so keep only rows where every column is present.
df_clean = df_sample.dropna(axis=0, how='any')

In [None]:
# Predictors are every column after the first two; the target is `label`.
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
feature_names = df_clean.columns[2:]
X = df_clean.loc[:, feature_names]
y = df_clean.loc[:, 'label']
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [None]:
# Show the parameter names a default Lasso estimator exposes.
Lasso().get_params().keys()

In [None]:
# Grid-search the Lasso penalty strength over 0.1, 0.2, ... (step 0.1, stopping
# before 10) with 5-fold cross-validation, scored by negative mean squared error.
alpha_grid = {'model__alpha': np.arange(0.1, 10, 0.1)}
pipeline = Pipeline(steps=[('model', Lasso(random_state=42))])
search = GridSearchCV(
  estimator=pipeline,
  param_grid=alpha_grid,
  scoring="neg_mean_squared_error",
  cv=5,
  verbose=0,
)
search.fit(X_train, y_train)
search.best_params_

In [None]:
# Pull the fitted Lasso step out of the best pipeline and use the magnitude of
# each coefficient as that feature's importance.
best_lasso = search.best_estimator_.named_steps['model']
coefficients = best_lasso.coef_
importance = np.absolute(coefficients)
print(importance)

In [None]:
# Names of the features whose Lasso coefficient is non-zero.
nonzero_mask = importance > 0
np.asarray(df_clean.columns[2:])[nonzero_mask]

## Regression

In [None]:
# Fit a logistic-regression classifier on the training split and print the
# per-class precision/recall/F1 report for the held-out split.
lr = LogisticRegression(max_iter=10000).fit(X_train, y_train)
pred = lr.predict(X_test)
report = classification_report(y_test, pred)
print(report)