In [2]:
import pandas as pd
import numpy as np
import pickle

import re
from nltk.tokenize import word_tokenize

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# preprocessing functions

def clean_text(x):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', x)
    return x

def clean_numbers(x):
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x

def get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re

def replace_contractions(text):
    def replace(match):
        return contractions[match.group(0)]
    return contractions_re.sub(replace, text)

# Load Embeddings
def load_glove(word_index):
    EMBEDDING_FILE = "./data/misc/glove6B/glove.6B.300d.txt"
    
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding='utf-8'))
    
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index)+1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
        else:
            embedding_vector = embeddings_index.get(word.capitalize())
            if embedding_vector is not None: 
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [4]:
tatoeba_df = pd.read_pickle('./data/misc/tatoeba_level_annotated_sentences.pkl')
sentences_corpus = tatoeba_df

In [5]:
count_df = sentences_corpus[['Sentences','Labels']].groupby('Labels').aggregate({'Sentences':'count'}).reset_index().sort_values('Sentences',ascending=False)
count_df.head()

Unnamed: 0,Labels,Sentences
0,A,106682
1,B,82270
2,C,69983


In [6]:
# obtain the max length of our sentences in order to compromise the model in such a length
sentences_corpus['len'] = sentences_corpus['Sentences'].apply(lambda s : len(s))
max_len = sentences_corpus.len.tolist()
maxlen = max(max_len)

# hyperparameters

# how big is each word vector
embed_size = 300 

# how many unique words to use (i.e num rows in embedding vector)
max_features = 120000 

# max number of words in a sentence to use for padding
maxlen = maxlen 

# how many samples to process at once
batch_size = 128

# how many times to iterate over all samples
n_epochs = 7

# learning rate
learn_rate = 0.001

# Number of K-fold Splits
n_splits = 5 

# network hidden size
hid_size = 64

# dropout 
dropout = 0.8

SEED = 10
debug = 0

# this is a list of expanded abbreviations
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

contractions, contractions_re = get_contractions(contraction_dict)

# Balance the data according to the minimum class value
num_of_levels = 69900
shuffled = sentences_corpus.reindex(np.random.permutation(sentences_corpus.index))

A = shuffled[shuffled['Labels'] == 'A'][:num_of_levels]
B = shuffled[shuffled['Labels'] == 'B'][:num_of_levels]
C = shuffled[shuffled['Labels'] == 'C'][:num_of_levels]

balanced_df = pd.concat([A,B,C], ignore_index=True)

#Shuffle the dataset
balanced_df = balanced_df.reindex(np.random.permutation(balanced_df.index))
balanced_df.reset_index(drop=True)

#  lower the text
balanced_df["Sentences"] = balanced_df["Sentences"].apply(lambda x: x.lower())

# Clean the text
balanced_df["Sentences"] = balanced_df["Sentences"].apply(lambda x: clean_text(x))

# Clean numbers
balanced_df["Sentences"] = balanced_df["Sentences"].apply(lambda x: clean_numbers(x))

# Clean Contractions
balanced_df["Sentences"] = balanced_df["Sentences"].apply(lambda x: replace_contractions(x))

In [7]:
train_X, test_X, train_y, test_y = train_test_split(balanced_df['Sentences'], balanced_df['Labels'],
                                                    stratify=balanced_df['Labels'], 
                                                    test_size=0.25)

print("Train shape : ",train_X.shape)
print("Test shape : ",test_X.shape)

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

le = LabelEncoder()
train_y = le.fit_transform(train_y.values)
test_y = le.transform(test_y.values)

# missing entries in the embedding are set using np.random.normal so we have to seed here too
if debug:
    embedding_matrix = np.random.randn(120000,300)
else:
    embedding_matrix = load_glove(tokenizer.word_index)

Train shape :  (157275,)
Test shape :  (52425,)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
class BiLSTM(nn.Module):
    
    def __init__(self):
        super(BiLSTM, self).__init__()
        self.hidden_size = hid_size
        drp = dropout
        n_classes = len(le.classes_)
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_size, self.hidden_size, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(self.hidden_size*4 , 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(drp)
        self.out = nn.Linear(64, n_classes)

    def forward(self, x):
        h_embedding = self.embedding(x)
        h_lstm, _ = self.lstm(h_embedding)
        avg_pool = torch.mean(h_lstm, 1)
        max_pool, _ = torch.max(h_lstm, 1)
        conc = torch.cat(( avg_pool, max_pool), 1)
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        return out

model = BiLSTM()
loss_fn = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learn_rate)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Load train and test in CUDA Memory
x_train = torch.tensor(train_X, dtype=torch.long).to(device)
y_train = torch.tensor(train_y, dtype=torch.long).to(device)
x_cv = torch.tensor(test_X, dtype=torch.long).to(device)
y_cv = torch.tensor(test_y, dtype=torch.long).to(device)

# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

train_loss = []
valid_loss = []

for epoch in range(n_epochs):

    # Set model to train configuration
    model.train()
    avg_loss = 0.  
    
    for i, (x_batch, y_batch) in enumerate(train_loader):
        # Predict/Forward Pass
        y_pred = model(x_batch)
        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
    
    # Set model to validation configuration -Doesn't get trained here
    model.eval()        
    avg_val_loss = 0.
    val_preds = np.zeros((len(x_cv),len(le.classes_)))
    
    for i, (x_batch, y_batch) in enumerate(valid_loader):
        y_pred = model(x_batch).detach()
        avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        # keep/store predictions
        val_preds[i * batch_size:(i+1) * batch_size] =F.softmax(y_pred).cpu().numpy()
    
    # Check Accuracy
    val_accuracy = sum(val_preds.argmax(axis=1)==test_y)/len(test_y)
    train_loss.append(avg_loss)
    valid_loss.append(avg_val_loss)
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f}'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, val_accuracy))



Epoch 1/7 	 loss=76.5460 	 val_loss=42.2324  	 val_acc=0.8741
Epoch 2/7 	 loss=43.4965 	 val_loss=32.4633  	 val_acc=0.9095
Epoch 3/7 	 loss=34.1802 	 val_loss=27.9225  	 val_acc=0.9252
Epoch 4/7 	 loss=28.2110 	 val_loss=24.5541  	 val_acc=0.9371
Epoch 5/7 	 loss=23.6511 	 val_loss=23.0467  	 val_acc=0.9443
Epoch 6/7 	 loss=20.7079 	 val_loss=21.5434  	 val_acc=0.9478
Epoch 7/7 	 loss=18.1706 	 val_loss=22.2631  	 val_acc=0.9497


In [10]:
# function for the final prediction
def predict_single(x):
      
    # lower the text
    x = x.lower()
    # Clean the text
    x =  clean_text(x)
    # Clean numbers
    x =  clean_numbers(x)
    # Clean Contractions
    x = replace_contractions(x)
    # tokenize
    x = tokenizer.texts_to_sequences([x])
    # pad
    x = pad_sequences(x, maxlen=maxlen)
    # create dataset
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    x = torch.tensor(x, dtype=torch.long).to(device)

    pred = model(x).detach()
    pred = F.softmax(pred).cpu().numpy()

    pred = pred.argmax(axis=1)

    pred = le.classes_[pred]
    return pred[0]

## Parse with difficulty level the Wikipedia corpus

In [11]:
wiki_corpus = pd.read_pickle('./data/misc/wikipedia_sentences_parsed_selection.pkl')

In [12]:
wiki_corpus

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head
0,Reuven Rivlin has been the President since Jul...,"[Reuven, Rivlin, has, been, the, President, si...","[reuven, rivlin, have, be, the, president, sin...","[PROPN, PROPN, AUX, AUX, DET, NOUN, ADP, PROPN...","[NNP, NNP, VBZ, VBN, DT, NN, IN, NNP, CD, .]","[(president, nsubj), (reuven, flat), (presiden...","[Number=Sing, Number=Sing, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[6, 1, 6, 6, 6, 0, 8, 6, 8, 6]"
1,The volcanic soil of the islands proved to be ...,"[The, volcanic, soil, of, the, islands, proved...","[the, volcanic, soil, of, the, island, prove, ...","[DET, ADJ, NOUN, ADP, DET, NOUN, VERB, PART, A...","[DT, JJ, NN, IN, DT, NNS, VBD, TO, VB, JJ, IN,...","[(soil, det), (soil, amod), (proved, nsubj), (...","[Definite=Def|PronType=Art, Degree=Pos, Number...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[3, 3, 7, 6, 6, 3, 0, 10, 10, 7, 13, 13, 10, 7]"
2,"After the Sharpeville Massacre, the UN tried t...","[After, the, Sharpeville, Massacre, ,, the, UN...","[after, the, sharpeville, massacre, ,, the, un...","[ADP, DET, ADJ, NOUN, PUNCT, DET, PROPN, VERB,...","[IN, DT, JJ, NN, ,, DT, NNP, VBD, TO, VB, JJ, ...","[(massacre, case), (massacre, det), (massacre,...","[_, Definite=Def|PronType=Art, Degree=Pos, Num...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 8, 8, 7, 8, 0, 10, 8, 12, 10, 14, 10..."
3,The paws have three soft toe pads and retracti...,"[The, paws, have, three, soft, toe, pads, and,...","[the, paw, have, three, soft, toe, pad, and, r...","[DET, NOUN, VERB, NUM, ADJ, NOUN, NOUN, CCONJ,...","[DT, NNS, VBP, CD, JJ, NN, NNS, CC, JJ, NNS, .]","[(paws, det), (have, nsubj), (root, root), (pa...","[Definite=Def|PronType=Art, Number=Plur, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[2, 3, 0, 7, 7, 7, 3, 10, 10, 7, 3]"
4,The stone is on the ice in front of the foot i...,"[The, stone, is, on, the, ice, in, front, of, ...","[the, stone, be, on, the, ice, in, front, of, ...","[DET, NOUN, AUX, ADP, DET, NOUN, ADP, NOUN, AD...","[DT, NN, VBZ, IN, DT, NN, IN, NN, IN, DT, NN, ...","[(stone, det), (ice, nsubj), (ice, cop), (ice,...","[Definite=Def|PronType=Art, Number=Sing, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 6, 6, 6, 6, 0, 8, 6, 11, 11, 8, 14, 14, 8, 6]"
...,...,...,...,...,...,...,...,...,...
216178,"In a few other dictatorships, such as Saudi Ar...","[In, a, few, other, dictatorships, ,, such, as...","[in, a, few, other, dictatorship, ,, such, as,...","[ADP, DET, ADJ, ADJ, NOUN, PUNCT, ADJ, ADP, AD...","[IN, DT, JJ, JJ, NNS, ,, JJ, IN, JJ, NNP, ,, D...","[(dictatorships, case), (dictatorships, det), ...","[_, Definite=Ind|PronType=Art, Degree=Pos, Deg...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 5, 5, 5, 16, 5, 10, 7, 10, 5, 16, 14, 14, ..."
216179,Abstract art is modern art which does not repr...,"[Abstract, art, is, modern, art, which, does, ...","[abstract, art, be, modern, art, which, do, no...","[ADJ, NOUN, AUX, ADJ, NOUN, PRON, AUX, PART, V...","[JJ, NN, VBZ, JJ, NN, WDT, VBZ, RB, VB, NNS, I...","[(art, amod), (art, nsubj), (art, cop), (art, ...","[Degree=Pos, Number=Sing, Mood=Ind|Number=Sing...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 5, 5, 5, 0, 9, 9, 9, 5, 9, 14, 14, 14, 10, 5]"
216181,This is a casual relationship is usually only ...,"[This, is, a, casual, relationship, is, usuall...","[this, be, a, casual, relationship, be, usuall...","[PRON, AUX, DET, ADJ, NOUN, AUX, ADV, ADV, ADP...","[DT, VBZ, DT, JJ, NN, VBZ, RB, RB, IN, NN, CC,...","[(relationship, nsubj), (relationship, cop), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 5, 5, 5, 10, 10, 10, 10, 10, 0, 15, 15, 15..."
216182,It also cost about $3.9 billion.,"[It, also, cost, about, $, 3.9, billion, .]","[it, also, cost, about, $, 3.9, billion, .]","[PRON, ADV, VERB, ADV, SYM, NUM, NUM, PUNCT]","[PRP, RB, VBD, RB, $, CD, CD, .]","[(cost, nsubj), (cost, advmod), (root, root), ...",[Case=Nom|Gender=Neut|Number=Sing|Person=3|Pro...,"[1, 2, 3, 4, 5, 6, 7, 8]","[3, 3, 0, 5, 3, 7, 5, 3]"


In [13]:
wiki_level_predictions = []
for i, row in wiki_corpus.iterrows():
    wiki_level_predictions.append(predict_single(row['Sentence']))

wiki_corpus['Level'] = wiki_level_predictions



In [14]:
wiki_corpus

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head,Level
0,Reuven Rivlin has been the President since Jul...,"[Reuven, Rivlin, has, been, the, President, si...","[reuven, rivlin, have, be, the, president, sin...","[PROPN, PROPN, AUX, AUX, DET, NOUN, ADP, PROPN...","[NNP, NNP, VBZ, VBN, DT, NN, IN, NNP, CD, .]","[(president, nsubj), (reuven, flat), (presiden...","[Number=Sing, Number=Sing, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[6, 1, 6, 6, 6, 0, 8, 6, 8, 6]",A
1,The volcanic soil of the islands proved to be ...,"[The, volcanic, soil, of, the, islands, proved...","[the, volcanic, soil, of, the, island, prove, ...","[DET, ADJ, NOUN, ADP, DET, NOUN, VERB, PART, A...","[DT, JJ, NN, IN, DT, NNS, VBD, TO, VB, JJ, IN,...","[(soil, det), (soil, amod), (proved, nsubj), (...","[Definite=Def|PronType=Art, Degree=Pos, Number...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[3, 3, 7, 6, 6, 3, 0, 10, 10, 7, 13, 13, 10, 7]",B
2,"After the Sharpeville Massacre, the UN tried t...","[After, the, Sharpeville, Massacre, ,, the, UN...","[after, the, sharpeville, massacre, ,, the, un...","[ADP, DET, ADJ, NOUN, PUNCT, DET, PROPN, VERB,...","[IN, DT, JJ, NN, ,, DT, NNP, VBD, TO, VB, JJ, ...","[(massacre, case), (massacre, det), (massacre,...","[_, Definite=Def|PronType=Art, Degree=Pos, Num...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 8, 8, 7, 8, 0, 10, 8, 12, 10, 14, 10...",B
3,The paws have three soft toe pads and retracti...,"[The, paws, have, three, soft, toe, pads, and,...","[the, paw, have, three, soft, toe, pad, and, r...","[DET, NOUN, VERB, NUM, ADJ, NOUN, NOUN, CCONJ,...","[DT, NNS, VBP, CD, JJ, NN, NNS, CC, JJ, NNS, .]","[(paws, det), (have, nsubj), (root, root), (pa...","[Definite=Def|PronType=Art, Number=Plur, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]","[2, 3, 0, 7, 7, 7, 3, 10, 10, 7, 3]",B
4,The stone is on the ice in front of the foot i...,"[The, stone, is, on, the, ice, in, front, of, ...","[the, stone, be, on, the, ice, in, front, of, ...","[DET, NOUN, AUX, ADP, DET, NOUN, ADP, NOUN, AD...","[DT, NN, VBZ, IN, DT, NN, IN, NN, IN, DT, NN, ...","[(stone, det), (ice, nsubj), (ice, cop), (ice,...","[Definite=Def|PronType=Art, Number=Sing, Mood=...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 6, 6, 6, 6, 0, 8, 6, 11, 11, 8, 14, 14, 8, 6]",B
...,...,...,...,...,...,...,...,...,...,...
216178,"In a few other dictatorships, such as Saudi Ar...","[In, a, few, other, dictatorships, ,, such, as...","[in, a, few, other, dictatorship, ,, such, as,...","[ADP, DET, ADJ, ADJ, NOUN, PUNCT, ADJ, ADP, AD...","[IN, DT, JJ, JJ, NNS, ,, JJ, IN, JJ, NNP, ,, D...","[(dictatorships, case), (dictatorships, det), ...","[_, Definite=Ind|PronType=Art, Degree=Pos, Deg...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 5, 5, 5, 16, 5, 10, 7, 10, 5, 16, 14, 14, ...",B
216179,Abstract art is modern art which does not repr...,"[Abstract, art, is, modern, art, which, does, ...","[abstract, art, be, modern, art, which, do, no...","[ADJ, NOUN, AUX, ADJ, NOUN, PRON, AUX, PART, V...","[JJ, NN, VBZ, JJ, NN, WDT, VBZ, RB, VB, NNS, I...","[(art, amod), (art, nsubj), (art, cop), (art, ...","[Degree=Pos, Number=Sing, Mood=Ind|Number=Sing...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 5, 5, 5, 0, 9, 9, 9, 5, 9, 14, 14, 14, 10, 5]",B
216181,This is a casual relationship is usually only ...,"[This, is, a, casual, relationship, is, usuall...","[this, be, a, casual, relationship, be, usuall...","[PRON, AUX, DET, ADJ, NOUN, AUX, ADV, ADV, ADP...","[DT, VBZ, DT, JJ, NN, VBZ, RB, RB, IN, NN, CC,...","[(relationship, nsubj), (relationship, cop), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[5, 5, 5, 5, 10, 10, 10, 10, 10, 0, 15, 15, 15...",B
216182,It also cost about $3.9 billion.,"[It, also, cost, about, $, 3.9, billion, .]","[it, also, cost, about, $, 3.9, billion, .]","[PRON, ADV, VERB, ADV, SYM, NUM, NUM, PUNCT]","[PRP, RB, VBD, RB, $, CD, CD, .]","[(cost, nsubj), (cost, advmod), (root, root), ...",[Case=Nom|Gender=Neut|Number=Sing|Person=3|Pro...,"[1, 2, 3, 4, 5, 6, 7, 8]","[3, 3, 0, 5, 3, 7, 5, 3]",A


In [16]:
wiki_corpus.reset_index(drop=True, inplace=True)

In [18]:
wiki_corpus.to_pickle('./data/wikipedia_corpus_complete.pkl')  

## Parse with difficulty level the BNC corpus

In [19]:
bnc_corpus = pd.read_pickle('./data/misc/bnc_sentences_parsed_selection.pkl')

In [20]:
bnc_corpus

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head
0,The interaction of long chain molecules with l...,"[The, interaction, of, long, chain, molecules,...","[the, interaction, of, long, chain, molecule, ...","[DET, NOUN, ADP, ADJ, NOUN, NOUN, ADP, NOUN, A...","[DT, NN, IN, JJ, NN, NNS, IN, NNS, VBZ, IN, JJ...","[(interaction, det), (interest, nsubj), (molec...","[Definite=Def|PronType=Art, Number=Sing, _, De...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 12, 6, 6, 6, 2, 8, 6, 12, 12, 12, 0, 19, 1..."
1,When an amorphous polymer is mixed with a suit...,"[When, an, amorphous, polymer, is, mixed, with...","[when, a, amorphous, polymer, be, mix, with, a...","[SCONJ, DET, ADJ, NOUN, AUX, VERB, ADP, DET, A...","[WRB, DT, JJ, NN, VBZ, VBN, IN, DT, JJ, NN, ,,...","[(mixed, mark), (polymer, det), (polymer, amod...","[PronType=Int, Definite=Ind|PronType=Art, Degr...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 4, 4, 6, 6, 13, 10, 10, 10, 6, 13, 13, 0, ..."
2,"In a ' poor ' solvent, the interactions are fe...","[In, a, ', poor, ', solvent, ,, the, interacti...","[in, a, ', poor, ', solvent, ,, the, interacti...","[ADP, DET, PUNCT, ADJ, PUNCT, NOUN, PUNCT, DET...","[IN, DT, ``, JJ, '', NN, ,, DT, NNS, VBP, JJR,...","[(solvent, case), (solvent, det), (solvent, pu...","[_, Definite=Ind|PronType=Art, _, Degree=Pos, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 6, 6, 6, 6, 11, 11, 9, 11, 11, 0, 18, 14, ..."
3,The fundamental thermodynamic equation used to...,"[The, fundamental, thermodynamic, equation, us...","[the, fundamental, thermodynamic, equation, us...","[DET, ADJ, ADJ, NOUN, VERB, PART, VERB, DET, N...","[DT, JJ, JJ, NN, VBN, TO, VB, DT, NNS, VBZ, DT...","[(equation, det), (equation, amod), (equation,...","[Definite=Def|PronType=Art, Degree=Pos, Degree...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 10, 4, 7, 5, 9, 7, 0, 16, 16, 16, 15..."
4,This is valid only for components of comparabl...,"[This, is, valid, only, for, components, of, c...","[this, be, valid, only, for, component, of, co...","[PRON, AUX, ADJ, ADV, ADP, NOUN, ADP, ADJ, NOU...","[DT, VBZ, JJ, RB, IN, NNS, IN, JJ, NN, ,, CC, ...","[(valid, nsubj), (valid, cop), (root, root), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[3, 3, 0, 6, 6, 3, 9, 9, 6, 24, 24, 24, 15, 15..."
...,...,...,...,...,...,...,...,...,...
197201,Nonetheless I must stand my ground and restate...,"[Nonetheless, I, must, stand, my, ground, and,...","[nonetheless, I, must, stand, my, ground, and,...","[ADV, PRON, AUX, VERB, PRON, NOUN, CCONJ, VERB...","[RB, PRP, MD, VB, PRP$, NN, CC, VB, IN, RB, RB...","[(stand, advmod), (stand, nsubj), (stand, aux)...","[_, Case=Nom|Number=Sing|Person=1|PronType=Prs...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 0, 6, 4, 8, 4, 14, 14, 14, 14, 14, 8..."
197202,How much longer will everyone ignore this phen...,"[How, much, longer, will, everyone, ignore, th...","[how, much, long, will, everyone, ignore, this...","[ADV, ADV, ADV, AUX, PRON, VERB, DET, NOUN, PU...","[WRB, RB, RBR, MD, NN, VB, DT, NN, .]","[(much, advmod), (longer, advmod), (ignore, ad...","[PronType=Int, Degree=Pos, Degree=Cmp, VerbFor...","[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 3, 6, 6, 6, 0, 8, 6, 6]"
197204,"If so, it shows the town suffering more than i...","[If, so, ,, it, shows, the, town, suffering, m...","[if, so, ,, it, show, the, town, suffer, more,...","[SCONJ, ADV, PUNCT, PRON, VERB, DET, NOUN, VER...","[IN, RB, ,, PRP, VBZ, DT, NN, VBG, JJR, IN, PR...","[(so, mark), (shows, advcl), (shows, punct), (...","[_, _, _, Case=Nom|Gender=Neut|Number=Sing|Per...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 5, 5, 5, 0, 7, 5, 7, 8, 13, 13, 13, 8, 17,..."
197205,I doubt if many Scottish historians would take...,"[I, doubt, if, many, Scottish, historians, wou...","[I, doubt, if, many, scottish, historian, woul...","[PRON, VERB, SCONJ, ADJ, ADJ, NOUN, AUX, VERB,...","[PRP, VBP, IN, JJ, JJ, NNS, MD, VB, DT, NN, RB...","[(doubt, nsubj), (root, root), (take, mark), (...","[Case=Nom|Number=Sing|Person=1|PronType=Prs, M...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[2, 0, 8, 6, 6, 8, 8, 2, 10, 8, 8, 2]"


In [21]:
bnc_level_predictions = []
for i, row in bnc_corpus.iterrows():
    bnc_level_predictions.append(predict_single(row['Sentence']))

bnc_corpus['Level'] = bnc_level_predictions



In [22]:
bnc_corpus

Unnamed: 0,Sentence,Tokens,Lemma,Upos,Xpos,Dependency,Features,id,Head,Level
0,The interaction of long chain molecules with l...,"[The, interaction, of, long, chain, molecules,...","[the, interaction, of, long, chain, molecule, ...","[DET, NOUN, ADP, ADJ, NOUN, NOUN, ADP, NOUN, A...","[DT, NN, IN, JJ, NN, NNS, IN, NNS, VBZ, IN, JJ...","[(interaction, det), (interest, nsubj), (molec...","[Definite=Def|PronType=Art, Number=Sing, _, De...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 12, 6, 6, 6, 2, 8, 6, 12, 12, 12, 0, 19, 1...",B
1,When an amorphous polymer is mixed with a suit...,"[When, an, amorphous, polymer, is, mixed, with...","[when, a, amorphous, polymer, be, mix, with, a...","[SCONJ, DET, ADJ, NOUN, AUX, VERB, ADP, DET, A...","[WRB, DT, JJ, NN, VBZ, VBN, IN, DT, JJ, NN, ,,...","[(mixed, mark), (polymer, det), (polymer, amod...","[PronType=Int, Definite=Ind|PronType=Art, Degr...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 4, 4, 6, 6, 13, 10, 10, 10, 6, 13, 13, 0, ...",B
2,"In a ' poor ' solvent, the interactions are fe...","[In, a, ', poor, ', solvent, ,, the, interacti...","[in, a, ', poor, ', solvent, ,, the, interacti...","[ADP, DET, PUNCT, ADJ, PUNCT, NOUN, PUNCT, DET...","[IN, DT, ``, JJ, '', NN, ,, DT, NNS, VBP, JJR,...","[(solvent, case), (solvent, det), (solvent, pu...","[_, Definite=Ind|PronType=Art, _, Degree=Pos, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[6, 6, 6, 6, 6, 11, 11, 9, 11, 11, 0, 18, 14, ...",B
3,The fundamental thermodynamic equation used to...,"[The, fundamental, thermodynamic, equation, us...","[the, fundamental, thermodynamic, equation, us...","[DET, ADJ, ADJ, NOUN, VERB, PART, VERB, DET, N...","[DT, JJ, JJ, NN, VBN, TO, VB, DT, NNS, VBZ, DT...","[(equation, det), (equation, amod), (equation,...","[Definite=Def|PronType=Art, Degree=Pos, Degree...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 10, 4, 7, 5, 9, 7, 0, 16, 16, 16, 15...",C
4,This is valid only for components of comparabl...,"[This, is, valid, only, for, components, of, c...","[this, be, valid, only, for, component, of, co...","[PRON, AUX, ADJ, ADV, ADP, NOUN, ADP, ADJ, NOU...","[DT, VBZ, JJ, RB, IN, NNS, IN, JJ, NN, ,, CC, ...","[(valid, nsubj), (valid, cop), (root, root), (...","[Number=Sing|PronType=Dem, Mood=Ind|Number=Sin...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[3, 3, 0, 6, 6, 3, 9, 9, 6, 24, 24, 24, 15, 15...",B
...,...,...,...,...,...,...,...,...,...,...
197201,Nonetheless I must stand my ground and restate...,"[Nonetheless, I, must, stand, my, ground, and,...","[nonetheless, I, must, stand, my, ground, and,...","[ADV, PRON, AUX, VERB, PRON, NOUN, CCONJ, VERB...","[RB, PRP, MD, VB, PRP$, NN, CC, VB, IN, RB, RB...","[(stand, advmod), (stand, nsubj), (stand, aux)...","[_, Case=Nom|Number=Sing|Person=1|PronType=Prs...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[4, 4, 4, 0, 6, 4, 8, 4, 14, 14, 14, 14, 14, 8...",B
197202,How much longer will everyone ignore this phen...,"[How, much, longer, will, everyone, ignore, th...","[how, much, long, will, everyone, ignore, this...","[ADV, ADV, ADV, AUX, PRON, VERB, DET, NOUN, PU...","[WRB, RB, RBR, MD, NN, VB, DT, NN, .]","[(much, advmod), (longer, advmod), (ignore, ad...","[PronType=Int, Degree=Pos, Degree=Cmp, VerbFor...","[1, 2, 3, 4, 5, 6, 7, 8, 9]","[2, 3, 6, 6, 6, 0, 8, 6, 6]",B
197204,"If so, it shows the town suffering more than i...","[If, so, ,, it, shows, the, town, suffering, m...","[if, so, ,, it, show, the, town, suffer, more,...","[SCONJ, ADV, PUNCT, PRON, VERB, DET, NOUN, VER...","[IN, RB, ,, PRP, VBZ, DT, NN, VBG, JJR, IN, PR...","[(so, mark), (shows, advcl), (shows, punct), (...","[_, _, _, Case=Nom|Gender=Neut|Number=Sing|Per...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[2, 5, 5, 5, 0, 7, 5, 7, 8, 13, 13, 13, 8, 17,...",B
197205,I doubt if many Scottish historians would take...,"[I, doubt, if, many, Scottish, historians, wou...","[I, doubt, if, many, scottish, historian, woul...","[PRON, VERB, SCONJ, ADJ, ADJ, NOUN, AUX, VERB,...","[PRP, VBP, IN, JJ, JJ, NNS, MD, VB, DT, NN, RB...","[(doubt, nsubj), (root, root), (take, mark), (...","[Case=Nom|Number=Sing|Person=1|PronType=Prs, M...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[2, 0, 8, 6, 6, 8, 8, 2, 10, 8, 8, 2]",C


In [23]:
bnc_corpus.reset_index(drop=True, inplace=True)

In [24]:
bnc_corpus.to_pickle('./data/bnc_corpus_complete.pkl')  