In [2]:
import prepare_data
import pandas as pd
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
import pyphen
# Pyphen instance configured with lang='en'. It is not referenced in the cells
# visible here -- presumably used later for syllable-based features (TODO confirm).
dic = pyphen.Pyphen(lang='en')

# Load Data

In [7]:
# File holding the cleaned article texts; read by the first load_df call of the notebook.
british_council_path = "data/british_council/cleaned_articles.csv"
# File the tokenised / POS-tagged articles are saved to and later reloaded from for lemmatizing.
tok_pos_path = "data/british_council/tok_pos_articles.csv"

In [23]:
# Column names used throughout the notebook. Each family is declared as a list
# (one entry per reading level) and the per-level names are unpacked from it,
# so the list and the individual names cannot drift apart.

# Text of each level.
all_level_col = ["level1", "level2", "level3"]
level1_col, level2_col, level3_col = all_level_col

# POS tags of each level.
all_level_pos_col = ["level1_pos", "level2_pos", "level3_pos"]
level1_pos_col, level2_pos_col, level3_pos_col = all_level_pos_col

# Tokens of each level.
all_level_tok_col = ["level1_tok", "level2_tok", "level3_tok"]
level1_tok_col, level2_tok_col, level3_tok_col = all_level_tok_col

# Lemmas of each level.
all_level_lem_col = ["level1_lem", "level2_lem", "level3_lem"]
level1_lem_col, level2_lem_col, level3_lem_col = all_level_lem_col

In [59]:
# Load the cleaned articles and expose the three text columns under the
# level* names used by the rest of the notebook. The rename is chained rather
# than done with inplace=True, so the cell is a single expression that builds
# `df` from scratch every time it runs. `index=str` converts the index labels
# to strings, exactly as before.
df = prepare_data.load_df(
    british_council_path, ["cleaned1", "cleaned2", "cleaned3"]
).rename(
    index=str,
    columns={
        "cleaned1": level1_col,
        "cleaned2": level2_col,
        "cleaned3": level3_col,
    },
)
df.head()

Unnamed: 0,article_name,level1,level2,level3
0,walk-forest-level,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...
1,amazing-adventurers-level,[Do you ever dream about climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...
2,animals-city-level,"[One night in December 2011, a bear came into ...","[Recently, there have been many reports in new...","[Small animals like birds, squirrels, mice and..."
3,bully-level,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...
4,cheat-level,[Mo was waiting outside her class. She was fee...,[Mo was waiting in the corridor outside her cl...,[Mo was waiting in the corridor with her class...


In [61]:
# Join the text pieces of every article with single spaces so that each
# article ends up with exactly one string per level (three texts per article).
concat_df = df.assign(
    **{level: df[level].map(" ".join) for level in all_level_col}
)

# Tokenize and POS Tagging

In [32]:
def tok_pos_tagging(text):
    """Tokenise ``text`` sentence by sentence and POS-tag every token.

    Parameters
    ----------
    text : str
        Raw text to split into sentences with ``nltk.sent_tokenize`` and into
        tokens with ``nltk.word_tokenize``.

    Returns
    -------
    list
        A two-element list ``[tokens, tags]``. ``tokens`` is a tuple with one
        tuple of token strings per sentence; ``tags`` has the same nesting and
        holds the tag ``nltk.pos_tag`` assigned to each token. When no
        sentence with at least one token is found (e.g. empty text) the result
        is ``[(), ()]`` so callers can always unpack two elements.
    """
    # Sentences without tokens are skipped: transposing one of them yields an
    # empty sequence, which would make the final zip drop every sentence.
    tok_text = [
        tokens
        for tokens in (nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
        if tokens
    ]
    if not tok_text:
        return [(), ()]

    # Per sentence: [(token, tag), ...] -> (tokens, tags).
    per_sentence = [tuple(zip(*nltk.pos_tag(tokens))) for tokens in tok_text]

    # Across sentences: [(tokens, tags), ...] -> [all_tokens, all_tags].
    return list(zip(*per_sentence))

In [35]:
def serie_tok_pos_tagging(series):
    """Apply ``tok_pos_tagging`` to every text in ``series``.

    Returns a list with one ``tok_pos_tagging`` result per element, in
    iteration order.
    """
    return list(map(tok_pos_tagging, series))

In [64]:
# One two-column frame (tokens, POS tags) per level, placed side by side.
# The column names come from the declared *_tok / *_pos lists, which hold the
# same "<level>_tok" / "<level>_pos" strings.
tok_pos_df = pd.concat(
    [
        pd.DataFrame(
            data=serie_tok_pos_tagging(concat_df[text_col]),
            columns=[tok_col, pos_col],
        )
        for text_col, tok_col, pos_col in zip(
            all_level_col, all_level_tok_col, all_level_pos_col
        )
    ],
    axis=1,
)


In [68]:
# Attach the article identifiers positionally: `.values` drops the index of
# `df` (string labels after the rename) so nothing is aligned against the
# fresh integer index of `tok_pos_df`.
tok_pos_df = tok_pos_df.assign(article_name=df["article_name"].values)
tok_pos_df.head()

Unnamed: 0,level1_tok,level1_pos,level2_tok,level2_pos,level3_tok,level3_pos,article_name
0,"((Going, through, the, forest, is, my, favouri...","((VBG, IN, DT, NN, VBZ, PRP$, JJ, NN, IN, DT, ...","((Going, through, the, forest, is, my, favouri...","((VBG, IN, DT, NN, VBZ, PRP$, JJ, NN, IN, DT, ...","((Going, through, the, forest, is, my, favouri...","((VBG, IN, DT, NN, VBZ, PRP$, JJ, NN, IN, DT, ...",walk-forest-level
1,"((Do, you, ever, dream, about, climbing, Mount...","((VB, PRP, RB, VBP, IN, VBG, NNP, NNP, CC, VBG...","((Have, you, ever, dreamt, of, climbing, Mount...","((VBP, PRP, RB, VB, IN, VBG, NNP, NNP, CC, VBG...","((Have, you, ever, dreamt, of, climbing, Mount...","((VBP, PRP, RB, VB, IN, VBG, NNP, NNP, CC, VBG...",amazing-adventurers-level
2,"((One, night, in, December, 2011, ,, a, bear, ...","((CD, NN, IN, NNP, CD, ,, DT, NN, VBD, IN, DT,...","((Recently, ,, there, have, been, many, report...","((RB, ,, EX, VBP, VBN, JJ, NNS, IN, NNS, CC, I...","((Small, animals, like, birds, ,, squirrels, ,...","((JJ, NNS, IN, NNS, ,, NNS, ,, NN, CC, NNS, VB...",animals-city-level
3,"((Kay, got, another, message, as, she, was, le...","((NNP, VBD, DT, NN, IN, PRP, VBD, VBG, IN, NN,...","((Kay, got, another, message, as, she, was, le...","((NNP, VBD, DT, NN, IN, PRP, VBD, VBG, IN, NN,...","((Kay, got, another, message, as, she, was, le...","((NNP, VBD, DT, NN, IN, PRP, VBD, VBG, IN, NN,...",bully-level
4,"((Mo, was, waiting, outside, her, class, .), (...","((NNP, VBD, VBG, IN, PRP$, NN, .), (PRP, VBD, ...","((Mo, was, waiting, in, the, corridor, outside...","((NNP, VBD, VBG, IN, DT, NN, IN, PRP$, NN, .),...","((Mo, was, waiting, in, the, corridor, with, h...","((NNP, VBD, VBG, IN, DT, NN, IN, PRP$, NN, IN,...",cheat-level


In [69]:
# Persist the tokenised / POS-tagged articles. `tok_pos_path` is the constant
# defined with the other paths in the "Load Data" section; it is deliberately
# not re-declared here so the save and the later reload cannot point at
# different files.
prepare_data.save_df(tok_pos_path, tok_pos_df)

## Lemmatizing

In [3]:
 def get_wordnet_pos(treebank_tag):
     """Translate a Treebank POS tag into the matching WordNet POS constant.

     The first letter of the tag decides the result: ``J`` -> adjective,
     ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
     to noun.
     """
     prefix_to_pos = (
         ('J', wordnet.ADJ),
         ('V', wordnet.VERB),
         ('N', wordnet.NOUN),
         ('R', wordnet.ADV),
     )
     for prefix, pos in prefix_to_pos:
         if treebank_tag.startswith(prefix):
             return pos
     # Unrecognised tags are treated as nouns.
     return wordnet.NOUN
    
# Single WordNetLemmatizer instance, reused for every token by the lemmatizing loop further down.
lemmatizer = WordNetLemmatizer()

In [24]:
# Reload the saved token / POS-tag frame, handing load_df the POS and token
# column names (presumably the columns it has to parse back into nested
# sequences -- confirm against prepare_data.load_df).
tok_pos_df = prepare_data.load_df(
    tok_pos_path,
    [*all_level_pos_col, *all_level_tok_col],
)
# Work on a copy so the lemma columns are not added to `tok_pos_df` itself.
df = tok_pos_df.copy(deep=True)

In [25]:
# Left over from an earlier approach: nothing in this cell appends to it.
lemmatized_text_list = []

# For every level, lemmatize each token using the WordNet POS derived from its
# tag and store the result (one list of lemmas per sentence) in the matching
# *_lem column.
for pos_col, tok_col, lem_col in zip(all_level_pos_col, all_level_tok_col, all_level_lem_col):
    lemmatized_text = df.apply(
        lambda row: [
            [
                lemmatizer.lemmatize(token, get_wordnet_pos(tag))
                for token, tag in zip(sentence, sentence_tags)
            ]
            for sentence, sentence_tags in zip(row[tok_col], row[pos_col])
        ],
        axis=1,
    )
    df[lem_col] = lemmatized_text

In [27]:
# Destination file for the frame that now carries the token, POS-tag and lemma columns.
pos_tok_lem_path = "data/british_council/tok_pos_lem_articles.csv"
# Written through the project's own helper; the on-disk format is whatever prepare_data.save_df produces.
prepare_data.save_df(pos_tok_lem_path,df)

# Feature Creation

## Word and Sentence Ratio

In [70]:
def count_sent_word_length(text):
    """Return the mean word length and mean sentence length of a tokenised text.

    Parameters
    ----------
    text : iterable of sequences of str
        One sequence of tokens per sentence.

    Returns
    -------
    tuple
        ``(mean_word_len, mean_sent_len)``: the mean number of characters per
        token over all sentences, and the mean number of tokens per sentence.
        A mean over nothing (no sentences, or no tokens at all) is NaN.
    """
    sent_len = []
    word_len = []
    # Single pass, so `text` only needs to be iterable once.
    for words in text:
        sent_len.append(len(words))
        word_len.extend(len(word) for word in words)

    def _mean_or_nan(values):
        # np.mean([]) also gives NaN but emits a RuntimeWarning; return the
        # NaN explicitly instead.
        return np.mean(values) if values else np.float64(np.nan)

    return _mean_or_nan(word_len), _mean_or_nan(sent_len)

In [81]:
# (mean word length, mean sentence length) of every article's level-1 tokens.
# NOTE(review): a one-column frame is selected, so the transpose produces a
# single tuple holding one pair per article rather than two separate
# sequences -- confirm this shape is what is wanted.
list(
    zip(
        *tok_pos_df[[level1_tok_col]]
        .applymap(count_sent_word_length)
        .values
    )
)

[((2.8545627376425857, 8.766666666666667),
  (4.081466395112017, 16.366666666666667),
  (3.903765690376569, 13.277777777777779),
  (3.4817351598173514, 8.342857142857143),
  (3.3550802139037432, 12.98611111111111),
  (3.722910216718266, 15.75609756097561),
  (4.110311750599521, 13.03125),
  (3.3088630259623995, 10.844660194174757),
  (3.5906148867313914, 13.733333333333333),
  (3.831932773109244, 14.875),
  (3.8601769911504427, 13.13953488372093),
  (3.6666666666666665, 18.085714285714285),
  (3.409681227863046, 11.929577464788732),
  (3.2223837209302326, 10.920634920634921),
  (3.3555878084179973, 11.677966101694915),
  (3.9606126914660833, 14.741935483870968),
  (3.5828677839851024, 18.517241379310345),
  (3.233108108108108, 16.0),
  (3.1582867783985105, 12.067415730337078),
  (3.3531468531468533, 10.337349397590362),
  (3.832129963898917, 16.294117647058822),
  (3.6792035398230087, 14.580645161290322))]