In [2]:
import pandas as pd


In [3]:
df = pd.read_csv('data.csv')

In [4]:
print(df)

               Words               Author
0     W1 W2 W3 W4 W5  Christopher Marlowe
1        W1 W1 W4 W3  Christopher Marlowe
2           W1 W2 W5  Christopher Marlowe
3     W5 W6 W1 W2 W3      William Stanley
4           W4 W5 W6      William Stanley
5           W4 W6 W3        Francis Bacon
6  W2 W2 W4 W3 W5 W5        Francis Bacon


In [5]:
# Term-Document Matrix (TDM) for ‘Christopher Marlowe’ class
from sklearn.feature_extraction.text import CountVectorizer

c_docs = [row['Words'] for index,row in df.iterrows() if row['Author'] == 'Christopher Marlowe']

vec_c = CountVectorizer()
X_C = vec_c.fit_transform(c_docs)
tdm_C = pd.DataFrame(X_C.toarray(), columns=vec_c.get_feature_names())

tdm_C

Unnamed: 0,w1,w2,w3,w4,w5
0,1,1,1,1,1
1,2,0,1,1,0
2,1,1,0,0,1


In [6]:
# Term-Document Matrix (TDM) for ‘William Stanley’ class
w_docs = [row['Words'] for index,row in df.iterrows() if row['Author'] == 'William Stanley']

vec_w = CountVectorizer()
X_w = vec_w.fit_transform(w_docs)
tdm_w = pd.DataFrame(X_w.toarray(), columns=vec_w.get_feature_names())

tdm_w


Unnamed: 0,w1,w2,w3,w4,w5,w6
0,1,1,1,0,1,1
1,0,0,0,1,1,1


In [7]:
# Term-Document Matrix (TDM) for ‘Francis Bacon’ class
f_docs = [row['Words'] for index,row in df.iterrows() if row['Author'] == 'Francis Bacon']

vec_f = CountVectorizer()
X_f = vec_f.fit_transform(f_docs)
tdm_f = pd.DataFrame(X_f.toarray(), columns=vec_f.get_feature_names())

tdm_f

Unnamed: 0,w2,w3,w4,w5,w6
0,0,1,1,0,1
1,2,1,1,2,0


In [8]:
# find frequency for each words in C
word_list_c = vec_c.get_feature_names();    
count_list_c = X_C.toarray().sum(axis=0) 
freq_c = dict(zip(word_list_c,count_list_c))
freq_c

{'w1': 4, 'w2': 2, 'w3': 2, 'w4': 2, 'w5': 2}

In [9]:
# find frequency for each words in W
word_list_w = vec_w.get_feature_names();    
count_list_w = X_w.toarray().sum(axis=0) 
freq_w = dict(zip(word_list_w,count_list_w))
freq_w

{'w1': 1, 'w2': 1, 'w3': 1, 'w4': 1, 'w5': 2, 'w6': 2}

In [10]:
# find frequency for each words in F
word_list_f = vec_f.get_feature_names();    
count_list_f = X_f.toarray().sum(axis=0) 
freq_f = dict(zip(word_list_f,count_list_f))
freq_f

{'w2': 2, 'w3': 2, 'w4': 2, 'w5': 2, 'w6': 1}

In [11]:
# find the probabilities of words in C class
prob_c = []
for word, count in zip(word_list_c, count_list_c):
    prob_c.append(count / len(word_list_c))
dict(zip(word_list_c, prob_c))

{'w1': 0.8, 'w2': 0.4, 'w3': 0.4, 'w4': 0.4, 'w5': 0.4}

In [12]:
# find the probabilities of words in W class
prob_w = []
for word, count in zip(word_list_w, count_list_w):
    prob_w.append(count / len(word_list_w))
dict(zip(word_list_w, prob_w))

{'w1': 0.16666666666666666,
 'w2': 0.16666666666666666,
 'w3': 0.16666666666666666,
 'w4': 0.16666666666666666,
 'w5': 0.3333333333333333,
 'w6': 0.3333333333333333}

In [13]:
# find the probabilities of words in F class
prob_f = []
for word, count in zip(word_list_f, count_list_f):
    prob_f.append(count / len(word_list_f))
dict(zip(word_list_f, prob_f))

{'w2': 0.4, 'w3': 0.4, 'w4': 0.4, 'w5': 0.4, 'w6': 0.2}

In [14]:
# find the total different words
from sklearn.feature_extraction.text import CountVectorizer

docs = [row['Words'] for index,row in df.iterrows()]

vec = CountVectorizer()
X = vec.fit_transform(docs)

total_features = len(vec.get_feature_names())
total_features

6

In [15]:
total_cnts_features_c = count_list_c.sum(axis=0)
total_cnts_features_w = count_list_w.sum(axis=0)
total_cnts_features_f = count_list_f.sum(axis=0)

In [16]:
from nltk.tokenize import word_tokenize
new_sentence = 'w1 w4 w6 w5 w3'
new_word_list = word_tokenize(new_sentence)

In [17]:
prob_s_with_ls = []

for word in new_word_list:
    if word in freq_c.keys():
        count = freq_c[word]
    else:
        count = 0
    prob_s_with_ls.append((count + 1)/(total_cnts_features_c + total_features))
dict_c = dict(zip(new_word_list,prob_s_with_ls))
dict_c

{'w1': 0.2777777777777778,
 'w4': 0.16666666666666666,
 'w6': 0.05555555555555555,
 'w5': 0.16666666666666666,
 'w3': 0.16666666666666666}

In [18]:
prob_s_with_lc = []
for word in new_word_list:
    if word in freq_w.keys():
        count = freq_w[word]
    else:
        count = 0
    prob_s_with_lc.append((count + 1)/(total_cnts_features_w + total_features))
dict_w=dict(zip(new_word_list,prob_s_with_lc))
dict_w

{'w1': 0.14285714285714285,
 'w4': 0.14285714285714285,
 'w6': 0.21428571428571427,
 'w5': 0.21428571428571427,
 'w3': 0.14285714285714285}

In [19]:
prob_s_with_lf = []
for word in new_word_list:
    if word in freq_f.keys():
        count = freq_f[word]
    else:
        count = 0
    prob_s_with_lf.append((count + 1)/(total_cnts_features_f + total_features))
dict_f=dict(zip(new_word_list,prob_s_with_lf))
dict_f

{'w1': 0.06666666666666667,
 'w4': 0.2,
 'w6': 0.13333333333333333,
 'w5': 0.2,
 'w3': 0.2}

In [28]:
test_prob_is_c = 1
for i in dict_c:
    test_prob_is_c = dict_c[i] * test_prob_is_c
test_prob_is_c *= 3/7

In [26]:
test_prob_is_w = 1
for i in dict_w:
    test_prob_is_w = dict_w[i] * test_prob_is_w
test_prob_is_w *= 2/7

In [29]:
test_prob_is_f = 1
for i in dict_f:
    test_prob_is_f = dict_f[i] * test_prob_is_f
test_prob_is_f *= 2/7

In [30]:
if(test_prob_is_c >= test_prob_is_w and test_prob_is_c >= test_prob_is_f): 
    print('the test document belongs to Christopher Marlowe') 
  
elif(test_prob_is_w >= test_prob_is_c and test_prob_is_w >= test_prob_is_f): 
    print('the test document belongs to William Stanley') 
else: 
    print('the test document belongs to Francis Bacon') 

the test document belongs to William Stanley
