In [1]:
#Create test documents
first = 'this is a test test test document'
second = 'this this is is is another test test document'
third = 'this this this is is third test document'

In [2]:
#Get all documents in a list
documents = [first, second, third]

In [3]:
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['first', 'second', 'third'])

In [4]:
#Print the Count Vectorizer dataframe
df

Unnamed: 0,another,document,is,test,third,this
first,0,1,1,3,0,1
second,1,1,3,2,0,2
third,0,1,2,1,1,3


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the Document Term Matrix 
count_tfidf = TfidfVectorizer(stop_words='english')
count_tfidf = TfidfVectorizer()
sparse_matrixTF = count_tfidf.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrixTF = sparse_matrixTF.todense()
dfTF = pd.DataFrame(doc_term_matrixTF, 
                  columns=count_tfidf.get_feature_names(), 
                  index=['first', 'second', 'third'])

In [6]:
#Print the TF-IDF Vectorizer dataframe
dfTF

Unnamed: 0,another,document,is,test,third,this
first,0.0,0.288675,0.288675,0.866025,0.0,0.288675
second,0.370653,0.218914,0.656741,0.437827,0.0,0.437827
third,0.0,0.23658,0.473159,0.23658,0.400564,0.709739


In [7]:
#A function that prints the name of the document with the highest rank matching the query word
#The function also prints the entire dataframe pertaining the query word displaying its rank in descending order

def rankWord(word):
    
    #Create a dataframe
    dfNew = pd.DataFrame(dfTF[word])
        
    #Get the entire column matching the word
    valList = dfNew[word].tolist()
    
    #Sort the values in descending order
    sortList = sorted(valList, reverse=True)

    #Create a ranklist from 1 to n
    n = len(valList)
    rankList = list(range(1,(n+1)))

    #Create a dictionary
    rankDict = dict(zip(sortList, rankList))

    #Map the dictionary to the dataframe
    dfNew['Rank'] = dfNew[word].map(rankDict)

    #Get the index of the dataframe
    print('Document matched: ', dfNew.index[dfNew["Rank"] == 1].tolist())

    #Print the entire dataframe after sorting it by rank
    print(dfNew.sort_values('Rank'))
    
    #Delete dataframe
    del dfNew

In [8]:
#Test using query word this. Third document has the highest TF-IDF
rankWord('this')

Document matched:  ['third']
            this  Rank
third   0.709739     1
second  0.437827     2
first   0.288675     3


In [9]:
#Test using query word test. First document has the highest TF-IDF
rankWord('test')

Document matched:  ['first']
            test  Rank
first   0.866025     1
second  0.437827     2
third   0.236580     3


In [10]:
#Test using query word document. First document has the highest TF-IDF though count of the word 'document' is 1 in each
rankWord('document')

Document matched:  ['first']
        document  Rank
first   0.288675     1
third   0.236580     2
second  0.218914     3


In [11]:
#Test using query word is. Second document has the highest TF-IDF
rankWord('is')

Document matched:  ['second']
              is  Rank
second  0.656741     1
third   0.473159     2
first   0.288675     3


In [12]:
# Compute Cosine Similarity based on TFIDF Vectorizer

from sklearn.metrics.pairwise import cosine_similarity

docSim = pd.DataFrame(cosine_similarity(dfTF, dfTF), columns = ['first','second','third'])
docSim.index = ['first', 'second', 'third']
docSim

Unnamed: 0,first,second,third
first,1.0,0.758339,0.614652
second,0.758339,1.0,0.776857
third,0.614652,0.776857,1.0


In [13]:
#Having understood the ideas of Count Vectorizer, TF-IDF Vectorizer and Cosine Similarity through above codes,
#let us construct TF-IDF for the corpus Shakespeare play available in nltk. Also to construct a query vector
#consisting of terms from the vocabulary and find the ranks of the play with respect to the query

In [14]:
#Libraries 
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import shakespeare
import xml.etree.ElementTree as ET

In [15]:
#Documents in shakespeare
playsXML = shakespeare.fileids()

#Store the name of plays in a list called plays by truncating file extension
plays = []
for i in range(len(playsXML)):
    filenameext = playsXML[i].split('.')
    plays.append(filenameext[0])

print("Name of plays: ", plays)

#create a list to store the path of corpus
srcXML = []
for i in playsXML:
    pf = 'corpora/shakespeare/'
    f = pf + i
    srcXML.append(f)
#print(srcXML)

#Create a list called playContents to store the corpus
playContents = []  
for i in range(len(srcXML)):
    play = ET.parse(nltk.data.find(srcXML[i]))
    root = play.getroot()
    texts = []
    for child in root:
        text = list(child.itertext())
        texts.append(text)
    textdata = []
    for i in range(4,9):
        textdata.append(texts[i])
    playContents.append(textdata)
    
print("Total number of plays in Shakespeare Corpus: ", len(playContents))

Name of plays:  ['a_and_c', 'dream', 'hamlet', 'j_caesar', 'macbeth', 'merchant', 'othello', 'r_and_j']
Total number of plays in Shakespeare Corpus:  8


In [16]:
#Preprocessing the corpus
def preprocessing(List):
    
    cor = List
    
    #Flatten the list first
    flat = [item for sublist in cor for item in sublist]
    
    #Remove \n and \n\n from the flattened list
    flat1 = [s.strip() for s in flat]
    
    #Remove empty strings if any
    flat2 = [s for s in flat1 if s != '']
    
    #Remove punctuation marks present
    flat3 = [''.join(c for c in s if c not in string.punctuation) for s in flat2]
    
    #Convert list to string
    str1 = ''
    myStr = str1.join(flat3)
     
    #Split the sentences into words
    words = []
    for i in range(len(flat2)):
        word = flat2[i].split(' ')
        words.extend(word)
    
    #Remove punctuation marks present
    words2 = [''.join(c for c in s if c not in string.punctuation) for s in words]
    
    #Remove empty strings if any
    words3 = [s for s in words2 if s != '']
        
    return words3

In [17]:
#Preprocess all the eight plays of Shakespeare corpus and store it in list called preProcList
preProcList = []
for i in range(len(playContents)):
    preProcList.append(preprocessing(playContents[i]))

In [18]:
#Create a dataframe to store number of words and vocabulary in each of the play
dfWV = pd.DataFrame(columns= ['Words','Vocab'], 
                  index=['a_and_c', 'dream', 'hamlet','j_caesar','macbeth','merchant','othello','r_and_j'])

for i in range(len(preProcList)):
    dfWV.Words[i] = len(preProcList[i])
    dfWV.Vocab[i] = len(set(preProcList[i]))
    
dfWV

Unnamed: 0,Words,Vocab
a_and_c,26760,4588
dream,17024,3388
hamlet,31990,5307
j_caesar,20705,3331
macbeth,18067,3832
merchant,22061,3647
othello,27724,4309
r_and_j,21971,3815


In [19]:
#Name the respective corpus and store its contents
a_and_c = preProcList[0]
dream = preProcList[1]
hamlet = preProcList[2]
j_caesar = preProcList[3]
macbeth = preProcList[4]
merchant = preProcList[5]
othello = preProcList[6]
r_and_j = preProcList[7]

#Print first 100 words in a_and_c play
print(a_and_c[:100]) #Contains single word

['ACT', 'I', 'SCENE', 'I', 'Alexandria', 'A', 'room', 'in', 'CLEOPATRAs', 'palace', 'Enter', 'DEMETRIUS', 'and', 'PHILO', 'PHILO', 'Nay', 'but', 'this', 'dotage', 'of', 'our', 'generals', 'Oerflows', 'the', 'measure', 'those', 'his', 'goodly', 'eyes', 'That', 'oer', 'the', 'files', 'and', 'musters', 'of', 'the', 'war', 'Have', 'glowd', 'like', 'plated', 'Mars', 'now', 'bend', 'now', 'turn', 'The', 'office', 'and', 'devotion', 'of', 'their', 'view', 'Upon', 'a', 'tawny', 'front', 'his', 'captains', 'heart', 'Which', 'in', 'the', 'scuffles', 'of', 'great', 'fights', 'hath', 'burst', 'The', 'buckles', 'on', 'his', 'breast', 'reneges', 'all', 'temper', 'And', 'is', 'become', 'the', 'bellows', 'and', 'the', 'fan', 'To', 'cool', 'a', 'gipsys', 'lust', 'Flourish', 'Enter', 'ANTONY', 'CLEOPATRA', 'her', 'Ladies\nthe', 'Train', 'with', 'Eunuchs']


In [20]:
# Function to convert string of elements in a list to a single element 
def listToString(s):  
    # initialize an empty string 
    str1 = " " 
    # return string   
    return (str1.join(s))

In [21]:
#Convert into a string
acStr = listToString(a_and_c)
drStr = listToString(dream)
haStr = listToString(hamlet)
jcStr = listToString(j_caesar)
maStr = listToString(macbeth)
meStr = listToString(merchant)
otStr = listToString(othello)
rjStr = listToString(r_and_j)

#First 100 strings
acStr[:100]

'ACT I SCENE I Alexandria A room in CLEOPATRAs palace Enter DEMETRIUS and PHILO PHILO Nay but this do'

In [22]:
#Create a playlist containing all plays of Shakespeare in the desired format
documents = [acStr, drStr, haStr, jcStr, maStr, meStr, otStr, rjStr]

In [23]:
# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(documents)

# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense()
dfTFShksp = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['a_and_c', 'dream', 'hamlet','j_caesar','macbeth','merchant','othello','r_and_j'])

In [24]:
dfTFShksp

Unnamed: 0,abandon,abate,abatements,abbey,abed,abhor,abhorred,abhorring,abhors,abide,...,yourself,yourselves,youth,youthful,youths,youve,youwhy,zeal,zone,zounds
a_and_c,0,0,0,0,0,0,0,1,0,2,...,15,1,5,0,0,2,0,0,0,0
dream,0,1,0,0,0,0,0,0,0,2,...,3,3,7,0,0,0,0,0,0,0
hamlet,0,1,1,0,0,0,1,0,0,0,...,15,1,16,0,0,0,1,0,1,0
j_caesar,0,0,0,0,0,0,0,0,0,2,...,12,6,0,1,1,1,0,0,0,0
macbeth,0,0,0,0,1,0,1,0,0,2,...,2,3,1,0,1,0,0,0,0,0
merchant,0,1,0,0,0,0,0,0,0,2,...,4,0,8,1,0,0,0,1,0,0
othello,1,0,0,0,1,3,0,0,0,0,...,16,0,5,0,0,0,0,0,0,3
r_and_j,0,1,0,1,1,0,0,0,1,0,...,5,0,2,3,0,0,0,0,0,2


In [25]:
# Create the Document Term Matrix using TfidVectorizer
tfid_vectorizer = TfidfVectorizer(stop_words='english')
tfid_vectorizer = TfidfVectorizer()
sparse_matrixTF = tfid_vectorizer.fit_transform(documents)

doc_term_matrixTF = sparse_matrixTF.todense()
dfTFIDFShksp = pd.DataFrame(doc_term_matrixTF, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['a_and_c', 'dream', 'hamlet', 'j_caesar','macbeth','merchant','othello','r_and_j'])

In [26]:
dfTFIDFShksp

Unnamed: 0,abandon,abate,abatements,abbey,abed,abhor,abhorred,abhorring,abhors,abide,...,yourself,yourselves,youth,youthful,youths,youve,youwhy,zeal,zone,zounds
a_and_c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001132,0.0,0.00127,...,0.006778,0.000635,0.002526,0.0,0.0,0.001897,0.0,0.0,0.0,0.0
dream,0.0,0.001151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002037,...,0.002174,0.003056,0.00567,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hamlet,0.0,0.000559,0.000882,0.0,0.0,0.0,0.000739,0.0,0.0,0.0,...,0.005281,0.000495,0.006297,0.0,0.0,0.0,0.000882,0.0,0.000882,0.0
j_caesar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001577,...,0.006733,0.004731,0.0,0.001016,0.001177,0.001177,0.0,0.0,0.0,0.0
macbeth,0.0,0.0,0.0,0.0,0.001143,0.0,0.001324,0.0,0.0,0.001774,...,0.001262,0.002661,0.000705,0.0,0.001324,0.0,0.0,0.0,0.0,0.0
merchant,0.0,0.000859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001521,...,0.002164,0.0,0.004838,0.00098,0.0,0.0,0.0,0.001355,0.0,0.0
othello,0.000994,0.0,0.0,0.0,0.000719,0.002981,0.0,0.0,0.0,0.0,...,0.00635,0.0,0.002218,0.0,0.0,0.0,0.0,0.0,0.0,0.002499
r_and_j,0.0,0.000885,0.0,0.001395,0.001009,0.0,0.0,0.0,0.001395,0.0,...,0.002786,0.0,0.001246,0.003028,0.0,0.0,0.0,0.0,0.0,0.002339


In [27]:
#A function that prints the name of the document with the highest rank matching the query word
#The function also prints the entire dataframe pertaining the query word displaying its rank in descending order

def rankWordShksp(word):
    
    #Create a dataframe
    dfNew = pd.DataFrame(dfTFIDFShksp[word])
        
    #Get the entire column matching the word
    valList = dfNew[word].tolist()
    
    #Sort the values in descending order
    sortList = sorted(valList, reverse=True)

    #Create a ranklist from 1 to n
    n = len(valList)
    rankList = list(range(1,(n+1)))

    #Create a dictionary
    rankDict = dict(zip(sortList, rankList))

    #Map the dictionary to the dataframe
    dfNew['Rank'] = dfNew[word].map(rankDict)

    #Get the index of the dataframe
    print('Document matched: ', dfNew.index[dfNew["Rank"] == 1].tolist())

    #Print the entire dataframe after sorting it by rank
    print(dfNew.sort_values('Rank'))
    
    #Delete dataframe
    del dfNew

In [28]:
#Test words
rankWordShksp('caesar')

Document matched:  ['j_caesar']
            caesar  Rank
j_caesar  0.181374     1
a_and_c   0.161321     2
hamlet    0.000990     3
macbeth   0.000887     4
othello   0.000558     5
dream     0.000000     8
merchant  0.000000     8
r_and_j   0.000000     8


In [29]:
#Test words
rankWordShksp('shylock')

Document matched:  ['merchant']
          shylock  Rank
merchant  0.13953     1
a_and_c   0.00000     8
dream     0.00000     8
hamlet    0.00000     8
j_caesar  0.00000     8
macbeth   0.00000     8
othello   0.00000     8
r_and_j   0.00000     8


In [30]:
#Test words
rankWordShksp('claudius')

Document matched:  ['hamlet']
          claudius  Rank
hamlet    0.088663     1
j_caesar  0.012952     2
a_and_c   0.000000     8
dream     0.000000     8
macbeth   0.000000     8
merchant  0.000000     8
othello   0.000000     8
r_and_j   0.000000     8


In [31]:
#Test words
rankWordShksp('desdemona')

Document matched:  ['othello']
          desdemona  Rank
othello    0.222617     1
a_and_c    0.000000     8
dream      0.000000     8
hamlet     0.000000     8
j_caesar   0.000000     8
macbeth    0.000000     8
merchant   0.000000     8
r_and_j    0.000000     8


In [32]:
#Test words
rankWordShksp('duncan')

Document matched:  ['macbeth']
            duncan  Rank
macbeth   0.053722     1
a_and_c   0.000000     8
dream     0.000000     8
hamlet    0.000000     8
j_caesar  0.000000     8
merchant  0.000000     8
othello   0.000000     8
r_and_j   0.000000     8


In [33]:
#Test words
rankWordShksp('laurence')

Document matched:  ['r_and_j']
          laurence  Rank
r_and_j   0.075356     1
a_and_c   0.000000     8
dream     0.000000     8
hamlet    0.000000     8
j_caesar  0.000000     8
macbeth   0.000000     8
merchant  0.000000     8
othello   0.000000     8


In [34]:
#Test words
rankWordShksp('yourself')

Document matched:  ['a_and_c']
          yourself  Rank
a_and_c   0.006778     1
j_caesar  0.006733     2
othello   0.006350     3
hamlet    0.005281     4
r_and_j   0.002786     5
dream     0.002174     6
merchant  0.002164     7
macbeth   0.001262     8


In [35]:
# Compute Cosine Similarity based on TFIDF Vectorizer

from sklearn.metrics.pairwise import cosine_similarity

docSimTF = pd.DataFrame(cosine_similarity(dfTFIDFShksp, dfTFIDFShksp), columns = plays)
docSimTF.index = plays
docSimTF

Unnamed: 0,a_and_c,dream,hamlet,j_caesar,macbeth,merchant,othello,r_and_j
a_and_c,1.0,0.725026,0.699488,0.762043,0.687564,0.740421,0.632172,0.666015
dream,0.725026,1.0,0.759767,0.742366,0.741762,0.811359,0.692628,0.736766
hamlet,0.699488,0.759767,1.0,0.711692,0.717888,0.780523,0.666305,0.690708
j_caesar,0.762043,0.742366,0.711692,1.0,0.691789,0.760874,0.64603,0.675611
macbeth,0.687564,0.741762,0.717888,0.691789,1.0,0.754652,0.636178,0.673152
merchant,0.740421,0.811359,0.780523,0.760874,0.754652,1.0,0.709993,0.738573
othello,0.632172,0.692628,0.666305,0.64603,0.636178,0.709993,1.0,0.638066
r_and_j,0.666015,0.736766,0.690708,0.675611,0.673152,0.738573,0.638066,1.0


In [None]:
#Macbeth (0.687) is least similar to Anthony and Cleopatra
#Othello (0.692) is least similar to Midsummer's Night Dream
#Othello (0.666) is least similar to Hamlet
#
#Of all the plays, Othello is least similar to other plays of shakespeare