In [1]:
import prepare_data
import pandas as pd
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter
import pyphen
# pyphen hyphenation dictionary for English; not referenced by any cell visible in this part of the notebook.
dic = pyphen.Pyphen(lang='en')

# Load Data

In [2]:
# Relative path of the cleaned British Council articles CSV that is loaded in the next cell.
british_council_path = "data/british_council/cleaned_articles.csv"

In [4]:
# Load the articles. The second argument names the three text columns; the
# preview below shows each of them holding a list-like value (presumably
# load_df parses these columns into lists — confirm in prepare_data).
df = prepare_data.load_df(british_council_path,["cleaned1","cleaned2","cleaned3"])
df.head()

Unnamed: 0,article_name,cleaned1,cleaned2,cleaned3
0,walk-forest-level,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...
1,amazing-adventurers-level,[Do you ever dream about climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...
2,animals-city-level,"[One night in December 2011, a bear came into ...","[Recently, there have been many reports in new...","[Small animals like birds, squirrels, mice and..."
3,bully-level,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...
4,cheat-level,[Mo was waiting outside her class. She was fee...,[Mo was waiting in the corridor outside her cl...,[Mo was waiting in the corridor with her class...


In [8]:
# Join each article's list of text pieces into a single string, so that every
# article ends up with three texts (one per cleaned column).
# Series.map is applied per column instead of DataFrame.applymap, which newer
# pandas releases deprecate; df.assign returns a new frame and leaves df intact.
concat_df = df.assign(**{
    col: df[col].map(" ".join)
    for col in ["cleaned1", "cleaned2", "cleaned3"]
})

# Tokenize and POS Tagging

In [32]:
def tok_pos_tagging(text):
    """Split ``text`` into sentences, tokenize each one and POS-tag its tokens.

    Args:
        text: The raw text as a single string.

    Returns:
        A two-element list ``[tokens, tags]``. ``tokens`` is a tuple with one
        tuple of word tokens per sentence; ``tags`` has the same shape and
        holds the matching POS tags. Empty text yields ``[(), ()]`` so callers
        can always unpack the result into exactly two values.
    """
    sentence_tokens = []
    sentence_tags = []

    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        # zip(*[]) yields nothing to unpack, so a token-less sentence is
        # recorded as a pair of empty tuples to keep both sides aligned.
        tokens, tags = zip(*tagged) if tagged else ((), ())
        sentence_tokens.append(tokens)
        sentence_tags.append(tags)

    return [tuple(sentence_tokens), tuple(sentence_tags)]

In [35]:
def serie_tok_pos_tagging(series):
    """Run ``tok_pos_tagging`` on every text of ``series``.

    Args:
        series: An iterable of text strings (e.g. one DataFrame column).

    Returns:
        A list with one ``tok_pos_tagging`` result per text, in input order.
    """
    return list(map(tok_pos_tagging, series))

In [51]:
# Build one (tokens, POS tags) column pair per level and place the pairs side
# by side, giving one row per article.
tok_pos_df = pd.concat(
    [
        pd.DataFrame(
            data=serie_tok_pos_tagging(concat_df[level]),
            columns=[level + "_tok", level + "_pos"],
        )
        for level in ["cleaned1", "cleaned2", "cleaned3"]
    ],
    axis=1,
)
# The rows above were produced in df's row order on a fresh 0..n-1 index, so
# the names are attached by position. Assigning the Series itself would align
# on index labels and could misplace names if df's index is not the default.
tok_pos_df["article_name"] = df["article_name"].values
tok_pos_df.head()

Unnamed: 0,cleaned1_tok,cleaned1_pos,cleaned2_tok,cleaned2_pos,cleaned3_tok,cleaned3_pos,article_name
0,"((Going, through, the, forest, is, my, favouri...","((VBG, IN, DT, NN, VBZ, PRP$, JJ, NN, IN, DT, ...","((Going, through, the, forest, is, my, favouri...","((VBG, IN, DT, NN, VBZ, PRP$, JJ, NN, IN, DT, ...","((Going, through, the, forest, is, my, favouri...","((VBG, IN, DT, NN, VBZ, PRP$, JJ, NN, IN, DT, ...",walk-forest-level
1,"((Do, you, ever, dream, about, climbing, Mount...","((VB, PRP, RB, VBP, IN, VBG, NNP, NNP, CC, VBG...","((Have, you, ever, dreamt, of, climbing, Mount...","((VBP, PRP, RB, VB, IN, VBG, NNP, NNP, CC, VBG...","((Have, you, ever, dreamt, of, climbing, Mount...","((VBP, PRP, RB, VB, IN, VBG, NNP, NNP, CC, VBG...",amazing-adventurers-level
2,"((One, night, in, December, 2011, ,, a, bear, ...","((CD, NN, IN, NNP, CD, ,, DT, NN, VBD, IN, DT,...","((Recently, ,, there, have, been, many, report...","((RB, ,, EX, VBP, VBN, JJ, NNS, IN, NNS, CC, I...","((Small, animals, like, birds, ,, squirrels, ,...","((JJ, NNS, IN, NNS, ,, NNS, ,, NN, CC, NNS, VB...",animals-city-level
3,"((Kay, got, another, message, as, she, was, le...","((NNP, VBD, DT, NN, IN, PRP, VBD, VBG, IN, NN,...","((Kay, got, another, message, as, she, was, le...","((NNP, VBD, DT, NN, IN, PRP, VBD, VBG, IN, NN,...","((Kay, got, another, message, as, she, was, le...","((NNP, VBD, DT, NN, IN, PRP, VBD, VBG, IN, NN,...",bully-level
4,"((Mo, was, waiting, outside, her, class, .), (...","((NNP, VBD, VBG, IN, PRP$, NN, .), (PRP, VBD, ...","((Mo, was, waiting, in, the, corridor, outside...","((NNP, VBD, VBG, IN, DT, NN, IN, PRP$, NN, .),...","((Mo, was, waiting, in, the, corridor, with, h...","((NNP, VBD, VBG, IN, DT, NN, IN, PRP$, NN, IN,...",cheat-level


In [None]:
# NOTE(review): unfinished cell (it has no execution count). The path literal
# looks truncated, and save_df() is called with no arguments, so pos_tok_path
# is never used. TODO: complete the path and pass save_df what it expects.
pos_tok_path = "data/british_council/to"
prepare_data.save_df()