In [178]:
import numpy as np
import math
from copy import copy, deepcopy
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
from bs4 import BeautifulSoup as bs

In [179]:
inputs = []
with open('Data/topics.txt','r',encoding='utf-8') as file:
    for line in file:
        line = line[:-1]
        inputs.append(line)
    print(inputs)

['Coffee', 'Arduino', 'Anime']


In [180]:
def Text_Preprocessing(input_type):
    input_file = "Data/Training/"+ input_type + ".xml"
    array = []
    with open(input_file,'r',encoding='utf-8') as file:
        content = file.read()
        soup = bs(content)
        for items in soup.findAll("row"):
            Document = dict()
            body = items['body']

            # removing tags and numbers using regex        
#             body = re.sub(r'<[^>]*>','', body)
            body = re.sub(r'[-+]?\d+', '', body)
            body = re.sub(r'[^\x00-\x7F]', ' ', body)

            #Lowercase the text
            body = body.lower()

            #Remove punctuations
            body = body.translate((str.maketrans('','',string.punctuation)))

            #Tokenize
            body = word_tokenize(body)

            #Remove stopwords
            stop_words = set(stopwords.words('english'))
            body = [word for word in body if not word in stop_words]

            #Lemmatize tokens
            lemmatizer = WordNetLemmatizer()
            body = [lemmatizer.lemmatize(word) for word in body]

            #Stemming tokens
            stemmer= PorterStemmer()
            body = [stemmer.stem(word) for word in body]
            
            if not body:
                continue
            
            Document[input_type] = body
            
            array.append(Document)
            #print(body, "\n\n\n")
        #print(array)
        return array

In [181]:
Training_set = []
Validation_set = []
Test_set = []
for input_file in inputs:
    array = Text_Preprocessing(input_file)
    Training_set += array[:500]
    Validation_set += array[500:500+200]
    Test_set += array[700:1200]

In [182]:
print(len(Training_set))
print(len(Validation_set))
print(len(Test_set))
# print(Training_set)

1500
600
1500


In [183]:
FeatureSpace = []
for document in Training_set:
    temp = list(document.values())
    for i in temp[0]:
        if i not in FeatureSpace:
            FeatureSpace.append(i)
print(len(FeatureSpace))
# print(FeatureSpace)

17306


In [204]:
Y_train = []
X_train_HD = []
X_train_ED = []

X_Validation_HD = []
X_Validation_ED = []
Y_Validation =[]

X_test_HD = []
X_test_ED = []
Y_test = []

for i in range(len(Training_set)):
    Y_train.append(list(Training_set[i].keys())[0])
    temp = list(Training_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_train_HD.append(HD)
    X_train_ED.append(ED)

for i in range(len(Test_set)):
    Y_test.append(list(Test_set[i].keys())[0])
    temp = list(Test_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_test_HD.append(HD)
    X_test_ED.append(ED)
    
    
for i in range(len(Validation_set)):
    Y_Validation.append(list(Validation_set[i].keys())[0])
    temp = list(Validation_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_Validation_HD.append(HD)
    X_Validation_ED.append(ED)

In [213]:
print(len(X_Validation_HD[0]))

17306


In [215]:
def getDistance(Dt,D1):
    count = 0
    for i in range(len(D1)):
        if D1[i] != Dt[i]:
            count += 1
    return count

In [250]:
def HammingDistance(Dt, X_train, Y_train, k = 3):
    Hamming_D = []
    for i in range(len(X_train)):
        Hamming_D.append(getDistance(Dt, X_train[i]))
        
#     print(Hamming_D)
    
    indices = sorted(range(len(Hamming_D)), key = lambda sub: Hamming_D[sub])[:k] 
    Y_output = []
    for i in indices:
        Y_output.append(Y_train[i])   
    output = max(set(Y_output), key = Y_output.count) 
    return output

In [252]:
def HammingDistance_Accuracy(X_Validation_HD, Y_Validation, X_train_HD, Y_train):
    count = 0
    for i in range(len(X_Validation_HD)):
        if Y_Validation[i] == HammingDistance(X_Validation_HD[i], X_train_HD, Y_train):
            count += 1
            print(count)
    print("Accuracy is: ", (count/len(Y_Validation) * 100))

# HammingDistance(X_Validation_HD[0], X_train_HD, Y_train)
# print(Y_Validation[0])
# HammingDistance_Accuracy(X_Validation_HD, Y_Validation, X_train_HD, Y_train)

In [189]:
def CalculateED(Dt,D):
    value = 0
    for i in range(len(D)):
        value += pow(Dt[i] - D[i], 2)
    value = math.sqrt(value)
    return value

In [190]:
def EuclideanDistance(Dt, X_train, Y_train, k):
    Euclidean_D = []
    for i in range(len(X_train)):
        Euclidean_D.append(CalculateED(Dt, X_train[i]))
    
    #find k minimum indices
    indices = sorted(range(len(Euclidean_D)), key = lambda sub: Euclidean_D[sub])[:k] 
    Y_output = []
    for i in indices:
        Y_output.append(Y_train[i])   
    
    #find higher frequency result
    output = max(set(Y_output), key = Y_output.count) 
    return output

In [None]:
def EuclideanDistance_Accuracy(X_Validation_ED, Y_Validation, X_train_ED, Y_train, k):
    count = 0
    for i in range(len(X_Validation_ED)):
        if Y_Validation[i] == EuclideanDistance(X_Validation_ED[i], X_train_ED, Y_train, k):
            count += 1
    print("Accuracy is: ", (count/len(Y_Validation) * 100))

# HammingDistance(X_Validation_HD[0], X_train_HD, Y_train)
# print(Y_Validation[0])
EuclideanDistance_Accuracy(X_Validation_ED, Y_Validation, X_train_ED, Y_train, 1)

In [202]:
EuclideanDistance(X_test_ED[1400], X_train_ED.copy(), Y_train.copy())
print(Y_test[1400])

Anime
Anime


In [282]:
number_of_documents = len(X_train_ED_COPY)
d_word = np.sum(X_train_HD.copy(), axis=0)
# print(sum(d_word))
# print(len(d_word))
# print(number_of_documents)
IDF = []
for i in range(len(d_word)):
    temp = math.log2(number_of_documents/d_word[i])
    if temp <= 0:
        IDF.append(0.0001)
    else:
        IDF.append(temp)

In [283]:
def setWeight(D, Dt):
    totalWords_D = sum(D)
    totalWords_Dt = sum(Dt)
    for i in range(len(D)):
        TF_D = D[i]/totalWords_D
        TF_Dt = Dt[i]/totalWords_Dt
        if D[i] != 0:
            D[i] = TF_D * IDF[i]
        if Dt[i] != 0 :
            Dt[i] = TF_Dt * IDF[i]
    return D,Dt

In [284]:
def CosineTheta(D , Dt):
    D, Dt = setWeight (D,Dt)
    dotProduct = np.dot(D,Dt)
    length_D = math.sqrt(np.dot(D, D))
    length_Dt = math.sqrt(np.dot(Dt, Dt))
    return dotProduct/(length_D*length_Dt)

In [285]:
def CosineSimilarity(Dt, X_train, Y_train, k):
    Cosine_values = []
    for D in X_train:
        Cosine_values.append(CosineTheta(D,Dt))
    indices = sorted(range(len(Cosine_values)), key = lambda sub: Cosine_values[sub])[-k:]
    Y_output = []
    for i in indices:
        Y_output.append(Y_train[i])
        #find higher frequency result
    output = max(set(Y_output), key = Y_output.count)
    print(output)

In [287]:
CosineSimilarity(X_test_ED[1400].copy(), deepcopy(X_train_ED), deepcopy(Y_train), 3)
print(Y_test[1400])

Anime
Anime


In [403]:
def denominatorOfNB(Dt):
    summation = 0
    for i in range(len(TopicName)):
        summation += Prob_DT_CM(Dt,i)

In [404]:
def Prob_Wj_Cm(word,m):
    frequencyOf_W_in_M = 0
    TotalWords_in_M = 0
    for i in range(len(CM_list[m])):
        frequencyOf_W_in_M += CM_list[m][i].count(word)
        TotalWords_in_M += len(CM_list[m][i])
    return (frequencyOf_W_in_M/TotalWords_in_M)

In [405]:
def Prob_DT_CM(Dt, m):
    P_Dt_Cm = 1
    for i in range(len(Dt)):
        P_Dt_Cm *= Prob_Wj_Cm(FeatureSpace[i],m)

In [410]:
def P(m):
    Cm_count = 0
    for i in range(len(CM_list[m])):
        Cm_count += len(CM_list[m][i])
    return Cm_count/TotalWords_in_TrainingSet

In [402]:
def NaiveBayes(Dt, m):
    denom = denominatorOfNB(Dt)
    value = (Prob_DT_CM(Dt,m) * P(Cm))/denom
    return value

In [389]:
TopicName = []
x=[d.keys() for d in Training_set]
for keys in x:
    if list(keys) not in TopicName:
        TopicName.append(list(keys))
CM_list = []
for j in range(len(TopicName)):
    CM_list.append([])
    for i in range(len(Training_set)):
        if list(Training_set[i].keys()) == TopicName[0]:
            y = list(Training_set[i].values())[0]
            CM_list[j].append(y)

In [406]:
# for i in range(len(CM_list[0])):
print(len(CM_list))

3


In [407]:
TotalWords_in_TrainingSet = 0
for i in range(len(CM_list)):
    for j in range(len(CM_list[i])):
        TotalWords_in_TrainingSet += len(CM_list[i][j])

118560
