![](images/EscUpmPolit_p.gif "UPM")

In order to use nltk, we should first download the lexical resources we are going to use. We can update them later. To do this, you need to:
* install nltk. Execute 'pip install nltk'
* import nltk
* Run *nltk.download()* (the first time we use it). A window will appear. You should select just the corpus 'book' and press download.
If you inspect the window, you can get an overview of available lexical resources (corpora, lexicons and grammars). For example, you can find some relevant sentiment lexicons in corpora (SentiWordNet, Sentence Polarity Dataset, Vader, Opinion Lexicon or VADER Sentiment Lexicon). Don't forget to close the window once the data has been downloaded.

In [24]:
# pip install openpyxl
# pip install nltk
# pip install pandas
# pip install "pandas[excel]"
# pip install numpy

In [25]:
import nltk
#nltk.download('punkt')
#nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('universal_tagset')
#nltk.download()


In [26]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
import os

In [27]:
def read_text_from_file(file_path):
    """Read a UTF-8 text file and return its contents with its base name.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(text, filename)`` tuple: the full file contents and the file's
        base name with the directory and the last extension removed.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    # basename drops the directory part; splitext then drops the last extension.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    return contents, stem


# lemaArray => [(id, word, tag_universal, tag_upenn)]

def merge_postagged(lemaArray1, lemaArray2):
    """Combine two parallel POS-tagged sequences into indexed 4-tuples.

    Args:
        lemaArray1: Sequence of ``(word, tag)`` pairs; supplies the word and
            the first tag of every result row.
        lemaArray2: Sequence of ``(word, tag)`` pairs aligned position by
            position with ``lemaArray1``; only its tag is used.

    Returns:
        A list of ``(position, word, tag_from_array1, tag_from_array2)``
        tuples, one per element of ``lemaArray1``. Extra trailing elements
        of ``lemaArray2`` are ignored.

    Raises:
        IndexError: If ``lemaArray2`` is a list with fewer elements than
            ``lemaArray1``.
    """
    return [
        (position, first[0], first[1], lemaArray2[position][1])
        for position, first in enumerate(lemaArray1)
    ]

def generateDfLemas(lemaArray):
    """Build a DataFrame from lemma rows.

    Args:
        lemaArray: Rows of four items each, in the order
            ``(index, word, universal_tag, upenn_tag)``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Index``, ``Word``,
        ``POS_Universal`` and ``POS_UPenn``, one row per input item.
    """
    return pd.DataFrame(
        lemaArray,
        columns=['Index', 'Word', 'POS_Universal', 'POS_UPenn'],
    )
    
def merge_words_and_count(dfLemas):
    """Count how often each (word, universal tag, UPenn tag) combination occurs.

    Args:
        dfLemas: DataFrame with at least the columns ``Word``,
            ``POS_Universal`` and ``POS_UPenn``.

    Returns:
        A DataFrame with one row per distinct combination and a ``Count``
        column, sorted by ``Count`` in descending order.
    """
    group_keys = ['Word', 'POS_Universal', 'POS_UPenn']
    occurrences = dfLemas.groupby(group_keys).size()
    counted = occurrences.reset_index(name='Count')
    return counted.sort_values(by='Count', ascending=False)


def generate_text_from_lemma_array(lemaArray):
    """Join the words of a lemma array into one space-separated string.

    Args:
        lemaArray: Iterable of 4-item rows ``(index, word, universal_tag,
            upenn_tag)``; only the word (second item) is used.

    Returns:
        The words joined by single spaces, or an empty string when
        ``lemaArray`` is empty.
    """
    # Join directly: wrapping the result in a one-element list only to index
    # it back out with [0] added nothing.
    return " ".join(word for _, word, _, _ in lemaArray)

def save_data_text(folder, name, text):
    """Write ``text`` to ``<folder>/<name>.txt`` as UTF-8.

    Args:
        folder: Output directory; created (with any missing parents) if it
            does not exist yet.
        name: File name without extension; ``.txt`` is appended.
        text: String to write. An existing file is overwritten.
    """
    # exist_ok=True makes the creation idempotent and avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(folder, exist_ok=True)
    file_path = os.path.join(folder, name + '.txt')
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

def save_data_frame(folder, name, df):
    """Write ``df`` to ``<folder>/<name>.xlsx`` without the index column.

    Args:
        folder: Output directory; created (with any missing parents) if it
            does not exist yet.
        name: File name without extension; ``.xlsx`` is appended.
        df: Object exposing ``to_excel(path, index=...)``, such as a pandas
            DataFrame.
    """
    # exist_ok=True makes the creation idempotent and avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(folder, exist_ok=True)

    file_path = os.path.join(folder, name + '.xlsx')
    df.to_excel(file_path, index=False)

In [28]:

# Load the essay to analyse. `filename` (the base name without extension) is
# reused by the later cells to name the output folder and the saved files.
text, filename = read_text_from_file('./94.txt')

# Tokenization

In [29]:
# Split the raw text into word and punctuation tokens and show the full list.
words = word_tokenize(text)
print(words)

['Some', 'people', 'prefer', 'to', 'live', 'in', 'places', 'that', 'have', 'the', 'same', 'weather', 'or', 'climate', 'all', 'year', 'long', '.', 'Others', 'like', 'to', 'live', 'in', 'areas', 'where', 'the', 'weather', 'changes', 'several', 'times', 'a', 'year', '.', 'Which', 'do', 'you', 'prefer', '?', 'Use', 'specific', 'reasons', 'and', 'examples', 'to', 'support', 'your', 'choice', '.', 'I', 'was', 'born', 'in', 'a', 'part', 'of', 'the', 'world', 'where', 'the', 'climate', 'has', 'four', 'seasons', 'and', 'I', 'learnt', 'how', 'much', 'beauty', 'each', 'of', 'these', 'periods', 'can', 'bring', 'to', 'the', 'environments', 'and', 'our', 'lives', '.', 'I', 'prefer', 'to', 'live', 'in', 'an', 'area', 'where', 'the', 'weather', 'changes', 'several', 'times', 'a', 'year', '.', 'Although', 'sometimes', 'I', 'experience', 'difficulties', 'in', 'the', 'cold', 'winters', 'because', 'of', 'my', 'bad', 'tolerance', 'to', 'low', 'temperatures', ',', 'I', 'can', 'not', 'imagine', 'the', 'Chris

# Stemming and Lemmatization

In [30]:
# Tokenize once and tag the same token list with both tagsets; the original
# cell ran word_tokenize(text) twice on identical input.
text_tokens = word_tokenize(text)
text_postagged_universal = pos_tag(text_tokens, tagset='universal')
text_postagged_upenn = pos_tag(text_tokens)
# Rows have the form (position, word, universal_tag, upenn_tag).
text_postagged = merge_postagged(text_postagged_universal, text_postagged_upenn)

text_postagged

[(0, 'Some', 'DET', 'DT'),
 (1, 'people', 'NOUN', 'NNS'),
 (2, 'prefer', 'VERB', 'VBP'),
 (3, 'to', 'PRT', 'TO'),
 (4, 'live', 'VERB', 'VB'),
 (5, 'in', 'ADP', 'IN'),
 (6, 'places', 'NOUN', 'NNS'),
 (7, 'that', 'DET', 'WDT'),
 (8, 'have', 'VERB', 'VBP'),
 (9, 'the', 'DET', 'DT'),
 (10, 'same', 'ADJ', 'JJ'),
 (11, 'weather', 'NOUN', 'NN'),
 (12, 'or', 'CONJ', 'CC'),
 (13, 'climate', 'NOUN', 'NN'),
 (14, 'all', 'DET', 'DT'),
 (15, 'year', 'NOUN', 'NN'),
 (16, 'long', 'ADV', 'RB'),
 (17, '.', '.', '.'),
 (18, 'Others', 'NOUN', 'NNS'),
 (19, 'like', 'ADP', 'IN'),
 (20, 'to', 'PRT', 'TO'),
 (21, 'live', 'VERB', 'VB'),
 (22, 'in', 'ADP', 'IN'),
 (23, 'areas', 'NOUN', 'NNS'),
 (24, 'where', 'ADV', 'WRB'),
 (25, 'the', 'DET', 'DT'),
 (26, 'weather', 'NOUN', 'NN'),
 (27, 'changes', 'VERB', 'VBZ'),
 (28, 'several', 'ADJ', 'JJ'),
 (29, 'times', 'NOUN', 'NNS'),
 (30, 'a', 'DET', 'DT'),
 (31, 'year', 'NOUN', 'NN'),
 (32, '.', '.', '.'),
 (33, 'Which', 'NOUN', 'NNP'),
 (34, 'do', 'VERB', 'VBP'),
 (3

In [31]:
# Lemmatization, lower-cased and without punctuation.
# Maps universal POS tags to the `pos` codes passed to the WordNet lemmatizer.
# The '.' (punctuation) tag has no entry, so punctuation tokens are dropped by
# the membership filter below.
pos_mapping = {'NOUN': 'n', 'ADJ': 'a', 'VERB': 'v', 'ADV': 'r', 'ADP': 'n', 'CONJ': 'n', 
               'PRON': 'n', 'NUM': 'n', 'DET':'n', 'PRT':'n', 'X': 'n', 'ADJ_SAT': 's' }

wordnet = WordNetLemmatizer()
# Lower-case BEFORE lemmatizing. The original lemmatized the raw token and
# lower-cased afterwards, so a capitalised form (e.g. a sentence-initial
# "Others") was looked up as-is and came back unlemmatized ("others").
# NOTE(review): relies on the WordNet lookup being case-sensitive against
# lower-case entries - confirm against the NLTK WordNetLemmatizer docs.
# The trailing .lower() is kept so the output is lower-case in every case.
lemmas_without_punctuation_lower = [
    (idx, wordnet.lemmatize(word.lower(), pos=pos_mapping[tag_universal]).lower(), tag_universal, tag_upenn)
    for (idx, word, tag_universal, tag_upenn) in text_postagged
    if tag_universal in pos_mapping
]
text_lemmas_without_punctuation_lower  = generate_text_from_lemma_array(lemmas_without_punctuation_lower)

# print(lemmas_without_punctuation_lower)
# print(text_lemmas_without_punctuation_lower)

# One row per token, then one row per distinct (word, tags) with its count.
df_lemmas_without_punctuation_lower = generateDfLemas(lemmas_without_punctuation_lower)
df_lemmas_without_punctuation_lower_unique_count = merge_words_and_count(df_lemmas_without_punctuation_lower)


# Persist the step-1 text and tables in a folder named after the input file.
save_data_text('./'+filename, filename+'_1_'+'text_lemmas_without_punctuation_lower', text_lemmas_without_punctuation_lower)
save_data_frame('./'+filename, filename+'_1_'+'df_lemmas_without_punctuation_lower', df_lemmas_without_punctuation_lower)
save_data_frame('./'+filename, filename+'_1_'+'df_lemmas_without_punctuation_lower_unique_count', df_lemmas_without_punctuation_lower_unique_count)

df_lemmas_without_punctuation_lower_unique_count


Unnamed: 0,Word,POS_Universal,POS_UPenn,Count
149,the,DET,DT,37
80,in,ADP,IN,14
77,i,PRON,PRP,13
10,and,CONJ,CC,11
1,a,DET,DT,10
...,...,...,...,...
72,have,VERB,VBP,1
74,holiday,NOUN,NN,1
75,how,ADV,WRB,1
76,hue,NOUN,NNS,1


In [32]:
# - without stop words
stoplist = stopwords.words('english')
#print(stoplist)
# Set copy for constant-time membership tests; `stoplist` itself stays a list.
# The original scanned the whole list once per lemma.
stopword_set = set(stoplist)


# lemma[1] is the lower-cased lemma of each (index, word, universal, upenn) row.
lemmas_without_stop_words = [lemma for lemma in lemmas_without_punctuation_lower if lemma[1] not in stopword_set]
text_lemmas_without_stop_words = generate_text_from_lemma_array(lemmas_without_stop_words)

# print(lemmas_without_stop_words)
# print(text_lemmas_without_stop_words)

# One row per token, then one row per distinct (word, tags) with its count.
df_lemmas_without_stop_words = generateDfLemas(lemmas_without_stop_words)
df_lemmas_without_stop_words_unique_count = merge_words_and_count(df_lemmas_without_stop_words)


# Persist the step-2 text and tables in a folder named after the input file.
save_data_text('./'+filename, filename+'_2_'+'text_lemmas_without_stop_words', text_lemmas_without_stop_words)
save_data_frame('./'+filename, filename+'_2_'+'df_lemmas_without_stop_words', df_lemmas_without_stop_words)
save_data_frame('./'+filename, filename+'_2_'+'df_lemmas_without_stop_words_unique_count', df_lemmas_without_stop_words_unique_count)

df_lemmas_without_stop_words_unique_count


Unnamed: 0,Word,POS_Universal,POS_UPenn,Count
129,year,NOUN,NN,6
23,climate,NOUN,NN,5
63,live,VERB,VB,5
86,place,NOUN,NNS,4
123,winter,NOUN,NN,3
...,...,...,...,...
43,first,ADJ,JJ,1
42,favorite,ADJ,JJ,1
40,fairy,ADJ,JJ,1
39,experience,VERB,VBP,1


In [33]:
# - only nouns, adjectives, verbs, and adverbs (plus tokens tagged 'X')

# Only the keys are used in this cell, as the set of universal tags to keep.
pos_mapping = {'NOUN': 'n', 'X': 'n', 'ADJ': 'a', 'VERB': 'v', 'ADV': 'r', 'ADJ_SAT': 's' }

# lemma[2] is the universal POS tag of each (index, word, universal, upenn) row.
lemmas_only_noun_verb_adj_adv = [lemma for lemma in lemmas_without_stop_words if lemma[2] in pos_mapping]
# Build the text from the POS-filtered lemmas. The original passed
# lemmas_without_stop_words here, so the saved "_3_" text was just a copy of
# the "_2_" text and did not match the "_3_" tables.
text_lemmas_only_noun_verb_adj_adv = generate_text_from_lemma_array(lemmas_only_noun_verb_adj_adv)

# print(lemmas_only_noun_verb_adj_adv)
# print(text_lemmas_only_noun_verb_adj_adv)


# One row per token, then one row per distinct (word, tags) with its count.
df_lemmas_only_noun_verb_adj_adv = generateDfLemas(lemmas_only_noun_verb_adj_adv)
df_lemmas_only_noun_verb_adj_adv_unique_count = merge_words_and_count(df_lemmas_only_noun_verb_adj_adv)


# Persist the step-3 text and tables in a folder named after the input file.
save_data_text('./'+filename, filename+'_3_'+'text_lemmas_only_noun_verb_adj_adv', text_lemmas_only_noun_verb_adj_adv)
save_data_frame('./'+filename, filename+'_3_'+'df_lemmas_only_noun_verb_adj_adv', df_lemmas_only_noun_verb_adj_adv)
save_data_frame('./'+filename, filename+'_3_'+'df_lemmas_only_noun_verb_adj_adv_unique_count', df_lemmas_only_noun_verb_adj_adv_unique_count)

df_lemmas_only_noun_verb_adj_adv_unique_count

Unnamed: 0,Word,POS_Universal,POS_UPenn,Count
124,year,NOUN,NN,6
59,live,VERB,VB,5
22,climate,NOUN,NN,5
82,place,NOUN,NNS,4
9,beauty,NOUN,NN,3
...,...,...,...,...
42,flavor,NOUN,NN,1
41,first,ADJ,JJ,1
40,favorite,ADJ,JJ,1
38,fairy,ADJ,JJ,1


In [34]:
# Save the unmodified input text in the same output folder as the derived files.
save_data_text('./'+filename, filename+'_0_original', text)
