In [2]:
import csv
import numpy as np
from sklearn.pipeline import Pipeline
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob, Word, Blobber
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
#stopwords

stopword=[]
with open('english-stop-words-large.txt','r') as file:
    files=file.read()
file.close()
stopword=TextBlob(files)

In [4]:
#book content handling (without stopwords)

tknzr = TweetTokenizer()
book_list=['puck-of-pooks-hill.txt','man-who-would-be-king.txt','kim.txt','just-so-stories.txt',\
           'jungle-book.txt','ginger-pickles.txt','jeremy-fisher.txt','squirrel-nutkin.txt',\
           'benjamin-bunny.txt','peter-rabbit.txt']
bk_list={}
for i in book_list:
    b_k=[]
    with open(i) as book:
        book=csv.reader(book)
        bk=list(book)
        for j in bk:
            for k in j:
                word=tknzr.tokenize(k.strip())
                for l in word:
                    if l not in stopword:
                        b_k.append(l)
    bk_list[i]=b_k

In [5]:
#p1-a
ploarity_dict={}
for i in book_list:
    with open(i,'r') as file:
        files=file.read()
    file.close()
    blob=TextBlob(files)
    ploarity_dict[i]=round(blob.sentiment.polarity,4)
ploarity_dict

{'puck-of-pooks-hill.txt': 0.09,
 'man-who-would-be-king.txt': 0.0688,
 'kim.txt': 0.0816,
 'just-so-stories.txt': 0.1552,
 'jungle-book.txt': 0.0404,
 'ginger-pickles.txt': 0.0857,
 'jeremy-fisher.txt': 0.0901,
 'squirrel-nutkin.txt': 0.0862,
 'benjamin-bunny.txt': 0.0864,
 'peter-rabbit.txt': 0.0889}

In [6]:
#p1-b
subjectivity_dict={}
for i in book_list:
    with open(i,'r') as file:
        files=file.read()
    file.close()
    blob=TextBlob(files)
    subjectivity_dict[i]=round(blob.sentiment.subjectivity,4)
subjectivity_dict

{'puck-of-pooks-hill.txt': 0.4584,
 'man-who-would-be-king.txt': 0.4284,
 'kim.txt': 0.4719,
 'just-so-stories.txt': 0.4898,
 'jungle-book.txt': 0.4516,
 'ginger-pickles.txt': 0.4232,
 'jeremy-fisher.txt': 0.4147,
 'squirrel-nutkin.txt': 0.3723,
 'benjamin-bunny.txt': 0.3875,
 'peter-rabbit.txt': 0.4015}

In [7]:
#p1-c word count 
word_ct_dict={}
for i in bk_list.keys():
    word_ct_dict[i]=len(bk_list[i])

word_ct_dict

{'puck-of-pooks-hill.txt': 28394,
 'man-who-would-be-king.txt': 8067,
 'kim.txt': 50776,
 'just-so-stories.txt': 14903,
 'jungle-book.txt': 24682,
 'ginger-pickles.txt': 2470,
 'jeremy-fisher.txt': 2229,
 'squirrel-nutkin.txt': 2542,
 'benjamin-bunny.txt': 2402,
 'peter-rabbit.txt': 2314}

In [8]:
#p1-def most freq words & normalised

freq_dict={}
normalised_dict={}
tf_dict={}

for i in bk_list.keys():
    TOP_MOST =1
    words = {}
    with open(i,'r') as file:
        files=file.read()
    file.close()
    blob=TextBlob(files)
    for w in blob.word_counts:
        if (w not in stopword and (w!= '‘') and (w != '”')):
            words[w] = blob.word_counts[w]
    
    sorted_words = sorted( words, key=words.__getitem__, reverse=True )
    for ( j, w ) in zip( range( TOP_MOST ), sorted_words ):
        freq_dict[i]=w
        normalised_dict[i]=round(blob.word_counts[w]/word_ct_dict[i],5)
        tf_dict[i]=round(blob.word_counts[w]/len(sorted_words),5)
    
freq_dict

{'puck-of-pooks-hill.txt': 'men',
 'man-who-would-be-king.txt': 'dravot',
 'kim.txt': 'kim',
 'just-so-stories.txt': 'wild',
 'jungle-book.txt': 'mowgli',
 'ginger-pickles.txt': 'project',
 'jeremy-fisher.txt': 'project',
 'squirrel-nutkin.txt': 'project',
 'benjamin-bunny.txt': 'project',
 'peter-rabbit.txt': 'project'}

In [9]:
normalised_dict

{'puck-of-pooks-hill.txt': 0.00606,
 'man-who-would-be-king.txt': 0.01103,
 'kim.txt': 0.01743,
 'just-so-stories.txt': 0.00698,
 'jungle-book.txt': 0.00891,
 'ginger-pickles.txt': 0.03482,
 'jeremy-fisher.txt': 0.03858,
 'squirrel-nutkin.txt': 0.03423,
 'benjamin-bunny.txt': 0.03622,
 'peter-rabbit.txt': 0.0376}

In [10]:
tf_dict

{'puck-of-pooks-hill.txt': 0.02904,
 'man-who-would-be-king.txt': 0.03381,
 'kim.txt': 0.08931,
 'just-so-stories.txt': 0.03408,
 'jungle-book.txt': 0.04647,
 'ginger-pickles.txt': 0.10424,
 'jeremy-fisher.txt': 0.1114,
 'squirrel-nutkin.txt': 0.10284,
 'benjamin-bunny.txt': 0.11111,
 'peter-rabbit.txt': 0.11027}

In [11]:
#p1-g
#corpus
corpus=[]
for i in book_list:
    with open(i,'r') as file:
        files=file.read()
        corpus.append(files)
    file.close()

#vocabulary
vocabulary=[]
for i in freq_dict:
    vocabulary.append(freq_dict[i])
vocabulary=list(set(vocabulary))
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(corpus)
idf=list(pipe['tfid'].idf_)
voc=pipe['count'].vocabulary

#idf_dictionary
sum_idf={}
for i in range(len(voc)):
    sum_idf[voc[i]]=idf[i]

#idf with files
idf_dict={}
for i in freq_dict:
    idf_dict[i]=round(sum_idf.get(freq_dict[i]),5)
idf_dict

{'puck-of-pooks-hill.txt': 1.31845,
 'man-who-would-be-king.txt': 2.70475,
 'kim.txt': 2.70475,
 'just-so-stories.txt': 1.45199,
 'jungle-book.txt': 2.70475,
 'ginger-pickles.txt': 1.0,
 'jeremy-fisher.txt': 1.0,
 'squirrel-nutkin.txt': 1.0,
 'benjamin-bunny.txt': 1.0,
 'peter-rabbit.txt': 1.0}

In [12]:
#p1-h
tf_idf_dict={}
for i in tf_dict:
    tf_idf=tf_dict[i]*idf_dict[i]
    tf_idf_dict[i]=round(tf_idf,5)
tf_idf_dict

{'puck-of-pooks-hill.txt': 0.03829,
 'man-who-would-be-king.txt': 0.09145,
 'kim.txt': 0.24156,
 'just-so-stories.txt': 0.04948,
 'jungle-book.txt': 0.12569,
 'ginger-pickles.txt': 0.10424,
 'jeremy-fisher.txt': 0.1114,
 'squirrel-nutkin.txt': 0.10284,
 'benjamin-bunny.txt': 0.11111,
 'peter-rabbit.txt': 0.11027}

In [13]:
#summarise
import csv
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob, Word, Blobber
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

#Stopwords
stopword=[]
with open('english-stop-words-large.txt','r') as file:
    files=file.read()
file.close()
stopword=TextBlob(files)

#Book content handling (without stopwords)
tknzr = TweetTokenizer()
book_list=['puck-of-pooks-hill.txt','man-who-would-be-king.txt','kim.txt','just-so-stories.txt',\
           'jungle-book.txt','ginger-pickles.txt','jeremy-fisher.txt','squirrel-nutkin.txt',\
           'benjamin-bunny.txt','peter-rabbit.txt']
bk_list={}
for i in book_list:
    b_k=[]
    with open(i) as book:
        book=csv.reader(book)
        bk=list(book)
        for j in bk:
            for k in j:
                word=tknzr.tokenize(k.strip())
                for l in word:
                    if l not in stopword:
                        b_k.append(l)
    bk_list[i]=b_k


#Computinging statistics for each book
ploarity_dict={}
subjectivity_dict={}
word_ct_dict={}
freq_dict={}
normalised_dict={}
tf_dict={}

for i in book_list:
    with open(i,'r') as file:
        files=file.read()
    file.close()
    blob=TextBlob(files)
    #(a) ploar
    ploarity_dict[i]=round(blob.sentiment.polarity,5)
    #(b) subjectivity
    subjectivity_dict[i]=round(blob.sentiment.subjectivity,5)
    #(c) word count 
    word_ct_dict[i]=len(bk_list[i])
    #(d) most frequent term (word)
    #(e) normalised frequency of most frequent word
    #(f) term frequency
    TOP_MOST =1
    words = {}
    for w in blob.word_counts:
        if (w not in stopword and (w!= '‘') and (w != '”')):
            words[w] = blob.word_counts[w]
    sorted_words = sorted( words, key=words.__getitem__, reverse=True )
    for ( j, w ) in zip( range( TOP_MOST ), sorted_words ):
        freq_dict[i]=w
        normalised_dict[i]=round(blob.word_counts[w]/word_ct_dict[i],5)
        tf_dict[i]=round(blob.word_counts[w]/len(sorted_words),5)

#(g) inverse document frequency
#corpus
corpus=[]
for i in book_list:
    with open(i,'r') as file:
        files=file.read()
        corpus.append(files)
    file.close()

#vocabulary
vocabulary=[]
for i in freq_dict:
    vocabulary.append(freq_dict[i])
vocabulary=list(set(vocabulary))
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(corpus)
idf=list(pipe['tfid'].idf_)
voc=pipe['count'].vocabulary

#idf_dictionary
sum_idf={}
for i in range(len(voc)):
    sum_idf[voc[i]]=idf[i]

#idf with files
idf_dict={}
for i in freq_dict:
    idf_dict[i]=round(sum_idf.get(freq_dict[i]),5)   
    
#(h) TF-IDF
tf_idf_dict={}
for i in tf_dict:
    tf_idf=tf_dict[i]*idf_dict[i]
    tf_idf_dict[i]=round(tf_idf,5)
    
#Table
table_data={}
for i in book_list:
    data=[ploarity_dict[i],subjectivity_dict[i],word_ct_dict[i],freq_dict[i],normalised_dict[i],tf_dict[i],idf_dict[i],tf_idf_dict[i]]
    table_data[i]=data
index_name=['Polarity','Subjectivity','Word Count','Most Frequent Word','Normalised Frequency','TF','IDF','TF-IDF']
df=pd.DataFrame(table_data,index=index_name)

In [14]:
df

Unnamed: 0,puck-of-pooks-hill.txt,man-who-would-be-king.txt,kim.txt,just-so-stories.txt,jungle-book.txt,ginger-pickles.txt,jeremy-fisher.txt,squirrel-nutkin.txt,benjamin-bunny.txt,peter-rabbit.txt
Polarity,0.08995,0.06882,0.08163,0.15515,0.04044,0.08572,0.09007,0.08617,0.08636,0.08891
Subjectivity,0.45835,0.42837,0.47186,0.48981,0.45159,0.42321,0.41469,0.37232,0.38747,0.40154
Word Count,28394,8067,50776,14903,24682,2470,2229,2542,2402,2314
Most Frequent Word,men,dravot,kim,wild,mowgli,project,project,project,project,project
Normalised Frequency,0.00606,0.01103,0.01743,0.00698,0.00891,0.03482,0.03858,0.03423,0.03622,0.0376
TF,0.02904,0.03381,0.08931,0.03408,0.04647,0.10424,0.1114,0.10284,0.11111,0.11027
IDF,1.31845,2.70475,2.70475,1.45199,2.70475,1,1,1,1,1
TF-IDF,0.03829,0.09145,0.24156,0.04948,0.12569,0.10424,0.1114,0.10284,0.11111,0.11027


In [15]:
df.to_csv()

',puck-of-pooks-hill.txt,man-who-would-be-king.txt,kim.txt,just-so-stories.txt,jungle-book.txt,ginger-pickles.txt,jeremy-fisher.txt,squirrel-nutkin.txt,benjamin-bunny.txt,peter-rabbit.txt\nPolarity,0.08995,0.06882,0.08163,0.15515,0.04044,0.08572,0.09007,0.08617,0.08636,0.08891\nSubjectivity,0.45835,0.42837,0.47186,0.48981,0.45159,0.42321,0.41469,0.37232,0.38747,0.40154\nWord Count,28394,8067,50776,14903,24682,2470,2229,2542,2402,2314\nMost Frequent Word,men,dravot,kim,wild,mowgli,project,project,project,project,project\nNormalised Frequency,0.00606,0.01103,0.01743,0.00698,0.00891,0.03482,0.03858,0.03423,0.03622,0.0376\nTF,0.02904,0.03381,0.08931,0.03408,0.04647,0.10424,0.1114,0.10284,0.11111,0.11027\nIDF,1.31845,2.70475,2.70475,1.45199,2.70475,1.0,1.0,1.0,1.0,1.0\nTF-IDF,0.03829,0.09145,0.24156,0.04948,0.12569,0.10424,0.1114,0.10284,0.11111,0.11027\n'

In [38]:
pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(corpus)

In [37]:
#pipe=text_clf
X_list=[]
for i in index_name:
    X_list.append(df.loc[i])
X=np.array(X_list)

book_list=['puck-of-pooks-hill.txt','man-who-would-be-king.txt','kim.txt','just-so-stories.txt',\
           'jungle-book.txt','ginger-pickles.txt','jeremy-fisher.txt','squirrel-nutkin.txt',\
           'benjamin-bunny.txt','peter-rabbit.txt']
y_list=['Rudyard Kipling','Rudyard Kipling','Rudyard Kipling','Rudyard Kipling','Rudyard Kipling',\
        'Beatrix Potter','Beatrix Potter','Beatrix Potter','Beatrix Potter','Beatrix Potter']
y=np.array(y_list).reshape((10, 1))
pipe= pipe.fit(X, y)


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [20]:
df1=pd.DataFrame(table_data)
df2=df1.stack()

In [22]:
table_data

{'puck-of-pooks-hill.txt': [0.08995,
  0.45835,
  28394,
  'men',
  0.00606,
  0.02904,
  1.31845,
  0.03829],
 'man-who-would-be-king.txt': [0.06882,
  0.42837,
  8067,
  'dravot',
  0.01103,
  0.03381,
  2.70475,
  0.09145],
 'kim.txt': [0.08163,
  0.47186,
  50776,
  'kim',
  0.01743,
  0.08931,
  2.70475,
  0.24156],
 'just-so-stories.txt': [0.15515,
  0.48981,
  14903,
  'wild',
  0.00698,
  0.03408,
  1.45199,
  0.04948],
 'jungle-book.txt': [0.04044,
  0.45159,
  24682,
  'mowgli',
  0.00891,
  0.04647,
  2.70475,
  0.12569],
 'ginger-pickles.txt': [0.08572,
  0.42321,
  2470,
  'project',
  0.03482,
  0.10424,
  1.0,
  0.10424],
 'jeremy-fisher.txt': [0.09007,
  0.41469,
  2229,
  'project',
  0.03858,
  0.1114,
  1.0,
  0.1114],
 'squirrel-nutkin.txt': [0.08617,
  0.37232,
  2542,
  'project',
  0.03423,
  0.10284,
  1.0,
  0.10284],
 'benjamin-bunny.txt': [0.08636,
  0.38747,
  2402,
  'project',
  0.03622,
  0.11111,
  1.0,
  0.11111],
 'peter-rabbit.txt': [0.08891,
  0.4015

In [39]:
import cv2

ModuleNotFoundError: No module named 'cv2'