In [306]:
import numpy as np
import math
import random
from copy import copy, deepcopy
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
from bs4 import BeautifulSoup as bs
from numpy.linalg import norm

In [307]:
inputs = []
with open('Data/topics.txt','r',encoding='utf-8') as file:
    for line in file:
        line = line[:-1]
        inputs.append(line)
    print(inputs)

['Coffee', 'Arduino', 'Anime', 'Astronomy', 'Biology', 'Chess', 'Cooking', 'Law', 'Space', 'Windows_Phone', 'Word_Working']


In [102]:
def Text_Preprocessing(input_type):
    input_file = "Data/Training/"+ input_type + ".xml"
    array = []
    with open(input_file,'r',encoding='utf-8') as file:
        content = file.read()
        soup = bs(content)
        for items in soup.findAll("row"):
            Document = dict()
            body = items['body']

            # removing tags and numbers using regex        
#             body = re.sub(r'<[^>]*>','', body)
            body = re.sub(r'[-+]?\d+', '', body)
            body = re.sub(r'[^\x00-\x7F]', ' ', body)

            #Lowercase the text
            body = body.lower()

            #Remove punctuations
            body = body.translate((str.maketrans('','',string.punctuation)))

            #Tokenize
            body = word_tokenize(body)

            #Remove stopwords
            stop_words = set(stopwords.words('english'))
            body = [word for word in body if not word in stop_words]

            #Lemmatize tokens
            lemmatizer = WordNetLemmatizer()
            body = [lemmatizer.lemmatize(word) for word in body]

            #Stemming tokens
            stemmer= PorterStemmer()
            body = [stemmer.stem(word) for word in body]
            
            if not body:
                continue
            
            Document[input_type] = body
            
            array.append(Document)
            #print(body, "\n\n\n")
        #print(array)
        return array

In [103]:
Training_set = []
Validation_set = []
Test_set = []
for input_file in inputs:
    array = Text_Preprocessing(input_file)
    Training_set += array[:500]
    Validation_set += array[500:500+200]
    Test_set += array[700:1200]

In [104]:
print(len(Training_set))
print(len(Validation_set))
print(len(Test_set))
# print(Training_set)

1500
600
1500


In [105]:
FeatureSpace = []
for document in Training_set:
    temp = list(document.values())
    for i in temp[0]:
        if i not in FeatureSpace:
            FeatureSpace.append(i)
print(len(FeatureSpace))
# print(FeatureSpace)

17306


In [106]:
Y_train = []
X_train_HD = []
X_train_ED = []

X_Validation_HD = []
X_Validation_ED = []
Y_Validation =[]

X_test_HD = []
X_test_ED = []
Y_test = []

for i in range(len(Training_set)):
    Y_train.append(list(Training_set[i].keys())[0])
    temp = list(Training_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_train_HD.append(HD)
    X_train_ED.append(ED)

for i in range(len(Test_set)):
    Y_test.append(list(Test_set[i].keys())[0])
    temp = list(Test_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_test_HD.append(HD)
    X_test_ED.append(ED)
    
    
for i in range(len(Validation_set)):
    Y_Validation.append(list(Validation_set[i].keys())[0])
    temp = list(Validation_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_Validation_HD.append(HD)
    X_Validation_ED.append(ED)

In [12]:
print(len(X_Validation_HD[0]))

17306


In [13]:
def getDistance(Dt,D1):
    count = 0
    for i in range(len(D1)):
        if D1[i] != Dt[i]:
            count += 1
    return count

In [51]:
def HammingDistance(Dt, X_train, Y_train, k = 3):
    Hamming_D = []
    for i in range(len(X_train)):
        Hamming_D.append(getDistance(Dt, X_train[i]))
        
#     print(Hamming_D)
    
    indices = sorted(range(len(Hamming_D)), key = lambda sub: Hamming_D[sub])[:k] 
    Y_output = []
    for i in indices:
        Y_output.append(Y_train[i])   
    output = max(set(Y_output), key = Y_output.count) 
    return output

In [262]:
def HammingDistance_Accuracy(X_Validation_HD, Y_Validation, X_train_HD, Y_train):
    count = 0
    for i in range(len(X_Validation_HD)):
        if Y_Validation[i] == HammingDistance(X_Validation_HD[i], X_train_HD, Y_train):
            count += 1
            print(count)
    print("Accuracy is: ", (count/len(Y_Validation) * 100))

# print(HammingDistance(X_test_HD[1300], X_train_HD, Y_train))
# print(Y_test[1300])
# HammingDistance_Accuracy(X_Validation_HD, Y_Validation, X_train_HD, Y_train)

In [265]:
def CalculateED(Dt,D):
    value = 0
    for i in range(len(D)):
        value += pow(Dt[i] - D[i], 2)
    value = math.sqrt(value)
    return value

In [266]:
def EuclideanDistance(Dt, X_train, Y_train, k):
    Euclidean_D = []
    for i in range(len(X_train)):
        Euclidean_D.append(CalculateED(Dt, X_train[i]))
    
    #find k minimum indices
    indices = sorted(range(len(Euclidean_D)), key = lambda sub: Euclidean_D[sub])[:k] 
    Y_output = []
    for i in indices:
        Y_output.append(Y_train[i])   
    
    #find higher frequency result
    output = max(set(Y_output), key = Y_output.count) 
    return output

In [None]:
def EuclideanDistance_Accuracy(X_Validation_ED, Y_Validation, X_train_ED, Y_train, k):
    count = 0
    for i in range(len(X_Validation_ED)):
        if Y_Validation[i] == EuclideanDistance(X_Validation_ED[i], X_train_ED, Y_train, k):
            count += 1
            print(count)
    print("Accuracy is: ", (count/len(Y_Validation) * 100))

# HammingDistance(X_Validation_HD[0], X_train_HD, Y_train)
# print(Y_Validation[0])
# EuclideanDistance_Accuracy(X_Validation_ED, Y_Validation, X_train_ED, Y_train, 5)

In [None]:
print(EuclideanDistance(X_test_ED[1400], X_train_ED.copy(), Y_train.copy(),5))
print(Y_test[1400])

In [275]:
number_of_documents = len(X_train_HD)
d_word = np.sum(X_train_HD.copy(), axis=0)
# print(sum(d_word))
# print(len(d_word))
# print(number_of_documents)
IDF = []
for i in range(len(d_word)):
    temp = math.log2(number_of_documents/d_word[i])
    if temp <= 0:
        IDF.append(0.0001)
    else:
        IDF.append(temp)

In [276]:
def setWeight(D, Dt):
    totalWords_D = sum(D)
    totalWords_Dt = sum(Dt)
    for i in range(len(D)):
#         TF_D = D[i]/totalWords_D
#         TF_Dt = Dt[i]/totalWords_Dt
        if D[i] != 0:
            D[i] = (D[i]/totalWords_D) * IDF[i]
        if Dt[i] != 0 :
            Dt[i] = (Dt[i]/totalWords_Dt) * IDF[i]
    return D,Dt

In [285]:
def CosineTheta(D , Dt):
    return np.dot(D,Dt)/(norm(D)*norm(Dt))
#     D, Dt = setWeight (D,Dt)
#     dotProduct = np.dot(D,Dt)
#     length_D = math.sqrt(np.dot(D, D))
#     length_Dt = math.sqrt(np.dot(Dt, Dt))


In [300]:
def CosineSimilarity(Dt, X_train, Y_train, k):
    Cosine_values = []
    output = []
    for D in X_train:
        Cosine_values.append(CosineTheta(D,Dt))

    for k in K:
        indices = (sorted(range(len(Cosine_values)), key = lambda sub: Cosine_values[sub])[-k:])
        Y_output = []
        for i in indices:
            Y_output.append(Y_train[i])

        output.append(max(set(Y_output), key = Y_output.count))
    
    return output

In [301]:
# print(CosineSimilarity(X_test_ED[1400].copy(), deepcopy(X_train_ED), deepcopy(Y_train), 5))
# print(Y_test[1400])
K= [1,3,5]

In [305]:
def CosineValidation_Test(X_Validation_ED, Y_Validation, X_train_ED, Y_train, K):
    count = [0,0,0]
    for i in range(len(X_Validation_ED)):
        result = CosineSimilarity(X_Validation_ED[i], X_train_ED, Y_train, K)
        for k in range(len(K)):
            if Y_Validation[i] == result[k]:
                count[k] += 1
        print(i, count)
    
    for k in range(len(K)):
        print("Accuracy is: ", (count[k]/len(Y_Validation) * 100))
#     return (count/len(Y_Validation) * 100)

CosineValidation_Test(deepcopy(X_Validation_ED), deepcopy(Y_Validation), deepcopy(X_train_ED), deepcopy(Y_train), K)

0 [1, 1, 1]
1 [2, 2, 1]
2 [3, 3, 2]
3 [4, 4, 3]
4 [5, 5, 4]
5 [6, 6, 5]
6 [7, 7, 6]
7 [8, 8, 7]
8 [9, 9, 8]
9 [10, 10, 9]
10 [11, 11, 10]
11 [11, 11, 10]
12 [12, 12, 11]
13 [13, 13, 12]
14 [14, 14, 13]
15 [15, 15, 14]
16 [16, 16, 15]
17 [17, 17, 16]
18 [18, 18, 17]
19 [19, 19, 18]
20 [20, 20, 19]
21 [21, 21, 20]
22 [22, 22, 21]
23 [23, 23, 22]
24 [24, 24, 23]
25 [25, 25, 24]
26 [26, 26, 25]
27 [27, 27, 26]
28 [28, 28, 27]
29 [29, 29, 28]
30 [30, 30, 29]
31 [31, 31, 30]
32 [32, 32, 31]
33 [33, 33, 32]
34 [34, 34, 33]
35 [35, 35, 34]
36 [36, 36, 35]
37 [37, 36, 35]
38 [38, 37, 36]
39 [39, 38, 37]
40 [40, 39, 38]
41 [41, 40, 39]
42 [42, 41, 40]
43 [43, 42, 41]
44 [44, 43, 42]
45 [45, 44, 43]
46 [46, 45, 44]
47 [47, 46, 45]
48 [48, 47, 46]
49 [49, 48, 47]
50 [50, 49, 48]
51 [51, 50, 49]
52 [52, 51, 50]
53 [53, 52, 51]
54 [54, 53, 52]
55 [55, 54, 53]
56 [56, 55, 54]
57 [57, 56, 55]
58 [58, 57, 56]
59 [59, 58, 57]
60 [60, 59, 58]
61 [61, 60, 59]
62 [62, 61, 60]
63 [63, 62, 61]
64 [64, 63, 62

432 [407, 409, 408]
433 [408, 410, 409]
434 [409, 411, 410]
435 [410, 412, 411]
436 [410, 412, 411]
437 [411, 413, 412]
438 [412, 414, 413]
439 [413, 415, 414]
440 [414, 416, 415]
441 [414, 416, 415]
442 [414, 417, 416]
443 [415, 418, 417]
444 [416, 419, 418]
445 [416, 419, 418]
446 [417, 420, 419]
447 [417, 421, 420]
448 [418, 422, 421]
449 [419, 423, 422]
450 [420, 424, 423]
451 [421, 425, 424]
452 [422, 426, 425]
453 [423, 427, 426]
454 [424, 428, 427]
455 [424, 428, 427]
456 [425, 429, 428]
457 [426, 430, 429]
458 [427, 431, 430]
459 [427, 432, 431]
460 [428, 433, 432]
461 [429, 433, 432]
462 [430, 434, 433]
463 [431, 435, 434]
464 [432, 436, 435]
465 [433, 436, 435]
466 [434, 437, 436]
467 [434, 437, 436]
468 [434, 437, 436]
469 [435, 438, 437]
470 [436, 439, 438]
471 [437, 439, 438]
472 [437, 439, 438]
473 [437, 439, 438]
474 [438, 440, 439]
475 [439, 441, 440]
476 [440, 442, 441]
477 [441, 443, 442]
478 [441, 444, 443]
479 [442, 445, 444]
480 [443, 446, 445]
481 [444, 447, 446]


In [252]:
def denominatorOfNB(Dt,alpha):
    summation = 0
    
    for i in range(len(TopicName)):
        summation += Prob_DT_CM(Dt,i, alpha)
    
    return summation

In [244]:
def Prob_Wj_Cm(word, m, alpha):
    
    frequencyOf_W_in_M = 0
    TotalWords_in_M = len(CM_list[m])
    
    for i in range(len(CM_list[m])):
        frequencyOf_W_in_M += CM_list[m][i].count(word)
        
    value = (frequencyOf_W_in_M + alpha)/ (TotalWords_in_M + alpha * len(FeatureSpace))
    
    if value == 0:
        print(m, "e jhamela ache ", value, "->", word)

    return value

In [245]:
def Prob_DT_CM(Dt, m, alpha):
    P_Dt_Cm = 1
    
    for i in range(len(Dt)):
        if(Dt[i] != 0):
            P_Dt_Cm *= Prob_Wj_Cm(FeatureSpace[i],m, alpha)
#             print(P_Dt_Cm)
    
#     print("probab of DT being in ", m, " is ", P_Dt_Cm)
    return P_Dt_Cm

In [249]:
def NaiveBayes(Dt, m, denom, alpha):
    value = (Prob_DT_CM(Dt,m,alpha) * (1/len(CM_list)))/ (denom + alpha*len(FeatureSpace))
    return value

In [259]:
# Probability = []
# denom = denominatorOfNB(X_Validation_ED[6])
# # for i in range(len(CM_list)):
# #     Probability.append(NaiveBayes(X_Validation_ED[6],i, denom))

# # # print(Probability)    
# # print(TopicName[Probability.index(max(Probability))][0])
# print(denom)

# print(len(X_Validation_HD[6]))
# # len(CM_list)

x = [2, 1.5, 0.2] 
print(x)

[2, 1.5, 0.2]


In [255]:
def NB_precision(X_Validation_HD, Y_Validation, alpha):
    count = 0
    for itr in range(len(X_Validation_HD)):
        Probability = []
        denom = denominatorOfNB(X_Validation_HD[itr], alpha)
#         if denom == 0:
#             continue
        for i in range(len(CM_list)):
            Probability.append(NaiveBayes(X_Validation_HD[itr],i, denom, alpha))
        
        prediction = TopicName[Probability.index(max(Probability))][0]
        if prediction == Y_Validation[itr]:
            count += 1
#             print(count)
    
    print("Accuracy is: ", (count/len(Y_Validation) * 100)) 

# NB_precision(X_Validation_HD, Y_Validation, 0.01)



In [261]:
for alpha in x:
    NB_precision(X_Validation_HD, Y_Validation, alpha)
    print(alpha)
    print("--------------------------")

Accuracy is:  91.66666666666666
2
--------------------------
Accuracy is:  91.83333333333333
1.5
--------------------------
Accuracy is:  95.66666666666667
0.2
--------------------------


In [137]:
TopicName = []
x=[d.keys() for d in Training_set]
for keys in x:
    if list(keys) not in TopicName:
        TopicName.append(list(keys))

CM_list = []

for j in range(len(TopicName)):
    CM_list.append([])
    for i in range(len(Training_set)):
        if list(Training_set[i].keys()) == TopicName[j]:
            y = list(Training_set[i].values())[0]
            CM_list[j].append(y)