In [52]:
#importing required libraries
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer
import numpy as np
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [2]:
#loading dataset
from google.colab import drive
drive.mount('/content/drive/')
df = pd.read_csv('/content/drive/My Drive/MLTechnologies_Course/11-0.txt', delimiter='\t')
df.head()

Mounted at /content/drive/


Unnamed: 0,"The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll"
0,This eBook is for the use of anyone anywhere i...
1,most other parts of the world at no cost and w...
2,"whatsoever. You may copy it, give it away or r..."
3,of the Project Gutenberg License included with...
4,www.gutenberg.org. If you are not located in t...


In [3]:
#reduce unrelated beginning
df_reduce_beginning=df.drop(labels=range(0,32),axis=0)
df_reduce_beginning.head()

Unnamed: 0,"The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll"
32,CHAPTER I.
33,Down the Rabbit-Hole
34,Alice was beginning to get very tired of sitti...
35,"bank, and of having nothing to do: once or twi..."
36,"the book her sister was reading, but it had no..."


In [4]:
#reduce unrelated ending
df_reduce_ending=df_reduce_beginning.drop(labels=range(2509,df_reduce_beginning.shape[0]+32), axis=0)
df_reduce_ending.tail()

Unnamed: 0,"The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll"
2504,perhaps even with the dream of Wonderland of l...
2505,"would feel with all their simple sorrows, and ..."
2506,"their simple joys, remembering her own child-l..."
2507,days.
2508,THE END


In [5]:
#reindexing obtained dataset
data=df_reduce_ending.reset_index(drop=True)
data.head()

Unnamed: 0,"The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll"
0,CHAPTER I.
1,Down the Rabbit-Hole
2,Alice was beginning to get very tired of sitti...
3,"bank, and of having nothing to do: once or twi..."
4,"the book her sister was reading, but it had no..."


In [114]:
#preprocessing of the text
def clean_text(text):
  lemmatizer = WordNetLemmatizer()
  stop_words = stopwords.words("english")
  text = re.sub(r"[^\w\s]", "", text, re.UNICODE)
  text = text.lower()
  lemmatized_text=[]
  for token in text.split(" "):
    if (token==''):
      continue
    speech_part=nltk.tag.pos_tag([token], tagset='universal', lang='eng')[0][1]
    if (speech_part=='VERB'):
      lemmatized_text.append(lemmatizer.lemmatize(token,'v'))
    elif (speech_part=='NOUN'):
      lemmatized_text.append(lemmatizer.lemmatize(token,'n'))
    elif (speech_part=='ADJ'):
      lemmatized_text.append(lemmatizer.lemmatize(token,'a'))
    elif (speech_part=='ADV'):
      lemmatized_text.append(lemmatizer.lemmatize(token,'r'))
    else:
      lemmatized_text.append(lemmatizer.lemmatize(token))
  text = [word for word in lemmatized_text if not word in stop_words]
  text = " ".join(text)
  return text


data["processed_text"] = data["The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll"].apply(lambda x: clean_text(x))

In [115]:
data

Unnamed: 0,"The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll",processed_text
0,CHAPTER I.,chapter
1,Down the Rabbit-Hole,rabbithole
2,Alice was beginning to get very tired of sitti...,alice begin get tire sit sister
3,"bank, and of having nothing to do: once or twi...",bank nothing twice peeped
4,"the book her sister was reading, but it had no...",book sister reading picture
...,...,...
2472,perhaps even with the dream of Wonderland of l...,perhaps even dream wonderland long ago
2473,"would feel with all their simple sorrows, and ...",would feel simple sorrow find pleasure
2474,"their simple joys, remembering her own child-l...",simple joy remember childlife happy summer
2475,days.,day


In [69]:
#show indexes of each chapter in order to split the whole text
data.loc[data["processed_text"].str.startswith('chapt')]

Unnamed: 0,"The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll",processed_text
0,CHAPTER I.,chapter
183,CHAPTER II.,chapter ii
356,CHAPTER III.,chapter iii
513,CHAPTER IV.,chapter iv
730,CHAPTER V.,chapter v
944,CHAPTER VI.,chapter vi
1180,CHAPTER VII.,chapter vii
1413,CHAPTER VIII.,chapter viii
1645,CHAPTER IX.,chapter ix
1870,CHAPTER X.,chapter x


In [116]:
#splitting text into chapters
chapter_one=data[0:183]
chapter_two=data[183:356]
chapter_three=data[356:513]
chapter_four=data[513:730]
chapter_five=data[730:944]
chapter_six=data[944:1180]
chapter_seven=data[1180:1413]
chapter_eight=data[1413:1645]
chapter_nine=data[1645:1870]
chapter_ten=data[1870:2081]
chapter_eleven=data[2081:2263]
chapter_twelve=data[2263:len(data)]
chapters=[chapter_one,chapter_two,chapter_three,chapter_four,chapter_five,chapter_six,chapter_seven,chapter_eight,chapter_nine,chapter_ten,chapter_eleven,chapter_twelve]

In [71]:
#function which finds all words in given piece of text(chapter) and returns word array
def get_words(chapter):
  words=[]
  for string in chapter['processed_text'].tolist():
    a=re.findall(r'\w+', string)
    if len(a)!=0:
      for word in a:
        words.append(word)
  return words

In [72]:
#function which defines how many times this word occured in a given chapter
def word_frequency_in_chapter(word,chapter):
  n_word=0
  for i,string in enumerate(chapter['processed_text'].tolist()):
    n_word+=len(re.findall(f'{word}', string))
  return n_word

In [73]:
#calculation of term frequency given a word and a chapter
def calculate_tf(word,chapter):
  total_number=len(get_words(chapter))
  return word_frequency_in_chapter(word,chapter)/total_number


In [74]:
#function which defines the presense of the word in a given chapter
def is_in_chapter(word,chapter):
  if (word_frequency_in_chapter(word,chapter)==0):
    return 0
  else:
    return 1

In [75]:
#calculation of inverse document frequency of a given word
def calculate_idf(word):
  D=len(chapters)
  d=0
  for chapter in chapters:
    d+=is_in_chapter(word,chapter)
  return np.log10(D/d)

In [76]:
#calculation of tf_idf metric given a word and a chapter
def calculate_tfidf(word,chapter):
  return calculate_tf(word,chapter)*calculate_idf(word)

In [118]:
#obtaining table of tf_idfs for each word in each chapter
unique_words=list(set(get_words(data)))
chap=[]
table=pd.DataFrame(columns=unique_words)
for chapter in chapters:
  for word in unique_words:
    chap=np.append(chap,calculate_tfidf(word,chapter))
  row=pd.DataFrame(chap.reshape(1,len(chap)),columns=unique_words)
  table=pd.concat([table,row], ignore_index=True)
  chap=[]

In [119]:
table

Unnamed: 0,piecrust,faint,_very_,advance,clap,earth,advise,despair,pool,brushing,butterfly,wild,afterwards,eleventh,youll,shinglewill,havent,passion,stretch,court,drip,hop,neatly,somewhere,hjckrrh,could,conquest,scold,general,effect,caught,ringlet,ashamed,zealand,scramble,consider,flowerbed,taught,hurry,brother,...,song,alice,concert,miss,hide,chatte,authority,fountain,claw,sweettempered,occur,consent,head,fading,poky,seat,set,beor,pas,hippopotamus,glad,norman,hell,rightly,forgot,waynever,wait,nine,longer,onion,titter,stick,bed,crash,lovely,salmon,red,alices,altogether,toss
0,0.0,0.0,0.000714,0.0,0.0,0.000968,0.001095,0.0,0.0,0.0,0.0,0.000611,0.000789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000475,0.0,0.001578,0.0,0.0,0.0,0.001095,0.000357,0.0,0.0,0.0,0.0,0.001095,0.0,0.000475,0.0,0.000789,0.0,0.0,...,0.0,0.0,0.0,0.000484,0.0,0.0,0.0,0.000789,0.0,0.0,0.000789,0.0,0.0,0.0,0.0,0.0,8e-05,0.0,0.001527,0.0,0.000179,0.0,0.000611,0.0,0.000611,0.0,0.000161,0.0,0.000611,0.0,0.0,0.000789,0.000386,0.0,0.001578,0.0,0.000237,0.000305,0.000484,0.0
1,0.0,0.0,0.000175,0.0,0.0,0.000473,0.0,0.0,0.004774,0.0,0.0,0.0,0.0,0.0,0.000473,0.0,0.0,0.001542,0.0,0.0,0.0,0.000232,0.000771,0.0,0.0,0.0,0.0,0.0,0.000175,0.0,0.0,0.002139,0.000771,0.0,0.0,0.0,0.0,0.0,0.0,0.00107,...,0.0,0.0,0.0,0.0,0.0,0.00107,0.0,0.0,0.000771,0.0,0.0,0.0,0.0,0.0,0.00107,0.0,7.8e-05,0.0,0.000597,0.00107,0.000175,0.0,0.0,0.0,0.000597,0.0,7.8e-05,0.000946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000232,0.000597,0.000473,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001297,0.000844,0.0,0.0,0.0,0.0,0.001297,0.0,0.000212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001297,0.0,0.000935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001297,0.000724,0.0,0.0,0.0,9.5e-05,0.0,0.0,0.0,0.001297,0.0,0.000457,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000486,0.0,0.0,0.0,0.0,0.0,0.0,0.000486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000973,0.0,0.0,0.000567,0.000629,0.0,0.0,0.0,0.0,0.0,0.000142,0.001257,0.0,0.0,0.0,0.0,0.000872,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000385,0.0,0.0,0.0,0.0,0.0,0.0,0.000629,0.0,0.0,0.0,0.0,0.0,0.000192,0.0,0.0,0.0,0.0,0.0,0.000973,0.0,0.000243,0.000872,0.00032,0.0,0.000486,0.0,0.0,0.002514,0.0,0.000973,0.000629,0.0,0.000189,0.000243,0.000385,0.0
4,0.0,0.0,0.000334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001022,0.0,0.0,0.0,0.001355,0.0,0.000855,0.0,0.00057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00015,0.000452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.000429,0.0,0.0,0.000775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000388,0.0,0.000245,0.0,0.0,0.0,0.0,0.00019,0.0,0.000632,0.0,0.0,0.0,0.0,0.0,0.0,0.001264,0.0,0.0,0.0,0.0,0.00019,0.0,0.0,0.0,0.0,...,0.000489,0.0,0.0,0.000388,0.0,0.0,0.0,0.0,0.000632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000129,0.0,0.0,0.0,0.000143,0.0,0.0,0.0,0.000245,0.0,6.4e-05,0.0,0.0,0.0,0.0,0.0,0.000309,0.000489,0.0,0.0,0.00019,0.0,0.000775,0.000388
6,0.0,0.0,0.000156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000424,0.0,0.000267,0.0,0.0,0.0,0.0,0.000208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000416,0.000958,0.0,0.0,0.0,...,0.000535,0.0,0.000691,0.0,0.0,0.0,0.0,0.000691,0.0,0.0,0.0,0.000958,0.0,0.0,0.0,0.0,0.000211,0.0,0.000802,0.0,0.000156,0.0,0.0,0.0,0.0,0.0,0.0,0.000424,0.0,0.0,0.0,0.0,0.000338,0.0,0.0,0.0,0.0,0.000267,0.0,0.000424
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001017,0.0,0.0,0.0,0.0,0.0,0.001314,0.0,0.001017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000297,0.0,0.000657,0.0,0.0,0.0,0.0,0.000198,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000911,0.0,0.0,0.000468,0.0,0.001017,0.0,0.000149,0.0,0.0,0.0,0.0,0.0,0.000201,0.0,0.0,0.000911,0.0,0.0,0.0,0.0,0.0,0.0,0.000395,0.0,0.0,0.000403
8,0.0,0.000534,0.000156,0.0,0.0,0.000423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000957,0.0,0.0,0.0,0.0,0.000534,0.0,0.0,0.000208,0.0,0.0,0.000957,0.0,0.0,0.0,0.000156,0.0,0.0,0.0,0.00069,0.0,0.0,0.0,0.0,0.00207,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000957,0.0,0.0,0.000468,0.0,0.0,0.0,0.000267,0.0,0.00014,0.000423,0.000534,0.0,0.0,0.0,0.000337,0.0,0.0,0.0,0.000415,0.001334,0.0,0.0
9,0.001054,0.000588,0.000172,0.003162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000588,0.0,0.0,0.0,0.001054,0.000294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000172,0.0,0.0,0.0,0.0,0.0,0.0,0.000229,0.0,0.0,0.0,0.0,...,0.00294,0.0,0.0,0.0,0.0,0.0,0.00076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000155,0.0,0.000882,0.0,0.000344,0.0,0.0,0.0,0.000294,0.0,0.000232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001054,0.0,0.0,0.0,0.0


In [120]:
#function which outputs top ten important words for each chapter
def getWordMetricPairs():
  matrix=[]
  for chapter_index in range(len(chapters)):
    word_tfidf_pairs=[]
    for i in range(len(table.columns)):
      word_tfidf_pairs.append((table.columns[i],table.iloc[chapter_index][i]))
    word_tfidf_pairs.sort(reverse=True,key = lambda x: (x[1]))
    matrix.append(word_tfidf_pairs[:10])
  return matrix

In [121]:
#showing results for each chapter
important_words_matrix=getWordMetricPairs()
for i,important_words_row in enumerate(important_words_matrix):
  print(f"Chapter {i+1} : ",important_words_row)

Chapter 1 :  [('key', 0.004274259573322248), ('bat', 0.003663651062847641), ('poison', 0.0032835129190090006), ('candle', 0.0032835129190090006), ('bottle', 0.0031568002043961202), ('fall', 0.0026992684502852356), ('marked', 0.0024424340418984277), ('dinah', 0.002419478979308633), ('door', 0.0023740690267075857), ('dark', 0.00236760015329709)]
Chapter 2 :  [('swam', 0.005347776244041749), ('pool', 0.00477351826622765), ('mouse', 0.004639904975884399), ('mabel', 0.004278220995233399), ('dog', 0.0030848414286764863), ('glove', 0.0030848414286764863), ('kid', 0.0023136310715073647), ('capital', 0.0023136310715073647), ('fan', 0.0022609191776706006), ('tear', 0.0022609191776706006)]
Chapter 3 :  [('dodo', 0.011223335342071783), ('prize', 0.007782557062843449), ('mouse', 0.005908350152284528), ('lory', 0.005611667671035892), ('race', 0.005611667671035892), ('thimble', 0.005188371375228966), ('caucusrace', 0.0038912785314217245), ('cause', 0.0037411117806905947), ('dry', 0.003440778279228335

Naming:

*   Chapter 1: Burning bottle candle among the darkness
*   Chapter 2: Mabel the kid and her swimming pool
*   Chapter 3: How Lory and Dinah took part in mouse race 
*   Chapter 4: Bill's puppy behind the window
*   Chapter 5: Animal party with hookah 
*   Chapter 6: Earldom of the sounds 
*   Chapter 7: Animal marching with hatter
*   Chapter 8: Meeting queen of gardens
*   Chapter 9: Mocking turtle and moral gryphon
*   Chapter 10: Dinner party with turtle and gryphon
*   Chapter 11: The case of hatter's theft: Part one
*   Chapter 12: The case of hatter's theft: Part two




In [122]:
#obtaining an array of tuples (word, part of speech) for each word in the text
text_words=get_words(data)
text_words=nltk.tag.pos_tag(text_words, tagset='universal', lang='eng')
print(text_words[:5])

[('chapter', 'NOUN'), ('rabbithole', 'NOUN'), ('alice', 'NOUN'), ('begin', 'VERB'), ('get', 'VERB')]


In [82]:
#function which finds all verbs in a given piece of text and adds them to a given dictionary
def find_all_verbs(d,text):
  verbs=[]
  for i in range(len(text)):
    if text[i][1]=='VERB':
      verbs.append(text[i][0])
  for verb in verbs:
    if (verb not in d.keys()):
      d[verb]=0
    d[verb]+=1
  return d

In [123]:
#obtaining dictionary of verbs with the corresponding frequencies
window_size=5
alice_verbs={}
for i in range(len(text_words)):
  if text_words[i][0]=='alice':
    if (i>4 & i<len(text_words)-6):
      alice_verbs=find_all_verbs(alice_verbs,text_words[i-window_size:i+window_size])
    if (i<=4):
      alice_verbs=find_all_verbs(alice_verbs,text_words[:i+window_size])
    if (i>=len(text_words)-6):
      alice_verbs=find_all_verbs(alice_verbs,text_words[i-window_size:])

In [124]:
#representation of top-10 the most frequent verbs connected to Alice
verbs=[]
for verb,count in alice_verbs.items():
  verbs.append((verb,count))
verbs.sort(reverse=True,key = lambda x: (x[1]))
print(verbs[:10])

[('say', 225), ('go', 47), ('thought', 32), ('know', 26), ('could', 23), ('come', 23), ('begin', 21), ('get', 21), ('see', 21), ('think', 19)]
