In [1]:
import numpy as np
from scipy.spatial import distance
import math
import random
from copy import copy, deepcopy
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
from bs4 import BeautifulSoup as bs
from numpy.linalg import norm
import time
from scipy import stats

In [33]:
def inputFiles():
    """Read the topic names listed in ``Data/topics.txt``, one per line.

    Only the line terminator is stripped and blank lines are skipped, so the
    last topic is returned intact even when the file does not end with a
    newline. The resulting list is printed before it is returned.

    Returns:
        list[str]: topic names in file order.
    """
    inputs = []
    with open('Data/topics.txt', 'r', encoding='utf-8') as file:
        for line in file:
            # Slicing off the last character would truncate a final line that
            # has no trailing newline; strip the terminator explicitly instead.
            topic = line.rstrip('\r\n')
            if topic:
                inputs.append(topic)
        print(inputs)
    return inputs

In [34]:
def Text_Preprocessing(input_type):
    """Load ``Data/Training/<input_type>.xml`` and tokenize every ``row`` body.

    Each ``body`` attribute is cleaned in this order: anchor elements removed,
    remaining tags removed, non-ASCII characters replaced by spaces, digit runs
    removed, lower-cased, punctuation removed, tokenized, stop words dropped,
    lemmatized and finally stemmed. Rows that end up with no tokens are skipped.

    Args:
        input_type: topic name; used both as the file stem and as the single
            key of every returned document dict.

    Returns:
        numpy.ndarray: 1-D object array of ``{input_type: [token, ...]}`` dicts.
    """
    input_file = "Data/Training/"+ input_type + ".xml"

    # These helpers do not depend on the row, so build them once per file
    # instead of once per document.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    with open(input_file, 'r', encoding='utf-8') as file:
        soup = bs(file.read(), 'lxml')

    documents = []
    for items in soup.findAll("row"):
        body = items['body']

        # Remove links. The match is non-greedy so that each anchor is removed
        # on its own; a greedy ".*" would also swallow all the text between
        # the first and the last link of a body.
        body = re.sub(r"<a.*?</a>", "", body)

        # Remove the remaining tags.
        body = re.sub("<.[^>]*>", "", body)

        # Replace non-ASCII characters with a space.
        body = re.sub(r'[^\x00-\x7F]', ' ', body)

        # Remove (optionally signed) digit runs.
        body = re.sub(r'[-+]?\d+', '', body)

        # Lowercase, strip punctuation, then tokenize.
        body = body.lower()
        body = body.translate(punctuation_table)
        body = word_tokenize(body)

        # Drop stop words, then lemmatize and stem what is left.
        body = [word for word in body if word not in stop_words]
        body = [lemmatizer.lemmatize(word) for word in body]
        body = [stemmer.stem(word) for word in body]

        if not body:
            continue

        documents.append({input_type: body})

    # Collect in a list and convert once; growing an array with np.append
    # copies the whole array on every row.
    return np.array(documents, dtype=object)

In [35]:
def set_dataSets(inputs):
    """Preprocess every topic and split its documents into three sets.

    For each topic, documents 0-499 go to training, 500-699 to validation and
    700-1199 to test. The topic name is printed as it is processed.

    Args:
        inputs: iterable of topic names accepted by ``Text_Preprocessing``.

    Returns:
        tuple: ``(Training_set, Validation_set, Test_set)`` numpy arrays.
    """
    train_end, validation_end, test_end = 500, 500 + 200, 1200

    Training_set = np.zeros(0)
    Validation_set = np.zeros(0)
    Test_set = np.zeros(0)

    for topic in inputs:
        print(topic)
        topic_documents = Text_Preprocessing(topic)
        Training_set = np.append(Training_set, topic_documents[:train_end])
        Validation_set = np.append(Validation_set, topic_documents[train_end:validation_end])
        Test_set = np.append(Test_set, topic_documents[validation_end:test_end])

    return Training_set, Validation_set, Test_set

In [36]:
# Report the number of documents in each split. The three sets are expected to
# come from set_dataSets(); that call must have been run in this kernel first.
for split in (Training_set, Validation_set, Test_set):
    print(len(split))

5500
2200
5500


In [37]:
def featureSpace_Set(Training_set):
    """Build the vocabulary of the training set in first-seen order.

    Args:
        Training_set: iterable of single-entry dicts mapping a topic name to a
            list of tokens.

    Returns:
        list: every distinct token, ordered by first appearance. The vocabulary
        size is printed before returning.
    """
    FeatureSpace = []
    # Membership is checked against a set; testing "not in" on the growing
    # list makes the whole pass quadratic in the vocabulary size.
    seen = set()
    for document in Training_set:
        tokens = list(document.values())[0]
        for token in tokens:
            if token not in seen:
                seen.add(token)
                FeatureSpace.append(token)
    print(len(FeatureSpace))
    return FeatureSpace
# print(FeatureSpace)

In [38]:
def _vectorize_documents(documents, FeatureSpace):
    """Convert labelled token lists into labels, presence rows and count rows."""
    labels = []
    presence_rows = []
    count_rows = []
    for document in documents:
        labels.append(list(document.keys())[0])
        tokens = list(document.values())[0]

        # One pass over the tokens, instead of an "in" test plus a .count()
        # scan of the token list for every vocabulary entry.
        frequency = {}
        for token in tokens:
            frequency[token] = frequency.get(token, 0) + 1

        counts = [frequency.get(term, 0) for term in FeatureSpace]
        count_rows.append(counts)
        presence_rows.append([1 if occurrences else 0 for occurrences in counts])
    return labels, presence_rows, count_rows


def documentsSet (Training_set,Validation_set, FeatureSpace):
    """Vectorize the training and validation documents over ``FeatureSpace``.

    Two representations are produced per document: a binary presence vector
    (the ``*_HD`` outputs) and a term-count vector (the ``*_ED`` outputs).
    "TRAIN DONE" is printed once the training set has been processed.

    Args:
        Training_set: sequence of single-entry ``{topic: tokens}`` dicts.
        Validation_set: sequence of single-entry ``{topic: tokens}`` dicts.
        FeatureSpace: ordered list of vocabulary terms.

    Returns:
        tuple: ``(X_train_ED, X_train_HD, Y_train, X_Validation_ED,
        X_Validation_HD, Y_Validation)`` as plain Python lists.
    """
    Y_train, X_train_HD, X_train_ED = _vectorize_documents(Training_set, FeatureSpace)

    print("TRAIN DONE")

    Y_Validation, X_Validation_HD, X_Validation_ED = _vectorize_documents(Validation_set, FeatureSpace)

    return X_train_ED, X_train_HD, Y_train, X_Validation_ED, X_Validation_HD,Y_Validation

In [54]:
def getDistance(Dt,D1):
    """Count the positions of ``D1`` at which ``Dt`` holds a different value.

    Args:
        Dt: indexable sequence, at least as long as ``D1``.
        D1: indexable sequence whose length sets how many positions are compared.

    Returns:
        int: number of differing positions.
    """
    mismatches = [1 for position in range(len(D1)) if D1[position] != Dt[position]]
    return len(mismatches)

In [55]:
def HammingDistance(Dt, X_train, Y_train, k_values=None):
    """Classify ``Dt`` by k-nearest-neighbour voting under Hamming distance.

    Args:
        Dt: binary feature vector of the document to classify.
        X_train: sequence of training feature vectors.
        Y_train: labels aligned with ``X_train``.
        k_values: neighbourhood sizes to evaluate. Defaults to the
            notebook-level ``K`` list when omitted.

    Returns:
        list: one predicted label per entry of ``k_values``. When several
        labels share the highest vote count, the label whose neighbour is
        closest to ``Dt`` wins.
    """
    if k_values is None:
        k_values = K

    vector_length = len(Dt)
    # scipy returns the fraction of differing positions; scale it back to a count.
    Hamming_D = [vector_length * distance.hamming(Dt, row) for row in X_train]

    # The ranking does not depend on k, so sort once. sorted() is stable, which
    # keeps equally distant neighbours in training order.
    ranked = sorted(range(len(Hamming_D)), key=Hamming_D.__getitem__)

    output = []
    for k in k_values:
        Y_output = [Y_train[index] for index in ranked[:k]]
        # max() over the distance-ordered list returns the first label that
        # reaches the top vote count; going through set() made the winner of a
        # tie depend on arbitrary set ordering.
        output.append(max(Y_output, key=Y_output.count))

    return output

In [56]:
def HammingDistance_Accuracy(X_Validation_HD, Y_Validation, X_train_HD, Y_train):
    """Evaluate Hamming-distance kNN on a validation set for every k in ``K``.

    Running hit counts are printed after each validation document and the
    accuracy for each k is printed at the end.

    Args:
        X_Validation_HD: binary feature vectors to classify.
        Y_Validation: true labels aligned with ``X_Validation_HD``.
        X_train_HD: binary training feature vectors.
        Y_train: labels aligned with ``X_train_HD``.

    Returns:
        list[float]: accuracy in percent, one entry per value of ``K``.
    """
    # One counter per k; a fixed three-element list breaks when K changes size.
    count = [0] * len(K)
    for i in range(len(X_Validation_HD)):
        result = HammingDistance(X_Validation_HD[i], X_train_HD, Y_train)

        for k in range(len(K)):
            if Y_Validation[i] == result[k] :
                count[k] += 1

        print(i, count)

    accuracies = [(hits/len(Y_Validation) * 100) for hits in count]
    for accuracy in accuracies:
        print("Accuracy is: ", accuracy)
    return accuracies

# print(HammingDistance(X_test_HD[1200], X_train_HD, Y_train))
# print(Y_test[1200])
# HammingDistance_Accuracy(np.array(X_Validation_HD), np.array(Y_Validation), np.array(X_train_HD), np.array(Y_train))

In [58]:
def EuclideanDistance(Dt, X_train, Y_train, k_values=None):
    """Classify ``Dt`` by k-nearest-neighbour voting under Euclidean distance.

    Args:
        Dt: feature vector of the document to classify.
        X_train: sequence of training feature vectors.
        Y_train: labels aligned with ``X_train``.
        k_values: neighbourhood sizes to evaluate. Defaults to the
            notebook-level ``K`` list when omitted.

    Returns:
        list: one predicted label per entry of ``k_values``. When several
        labels share the highest vote count, the label whose neighbour is
        closest to ``Dt`` wins.
    """
    if k_values is None:
        k_values = K

    Euclidean_D = [distance.euclidean(Dt, row) for row in X_train]

    # The ranking does not depend on k, so sort once. sorted() is stable, which
    # keeps equally distant neighbours in training order.
    ranked = sorted(range(len(Euclidean_D)), key=Euclidean_D.__getitem__)

    output = []
    for k in k_values:
        # Labels of the k closest training documents, nearest first.
        Y_output = [Y_train[index] for index in ranked[:k]]
        # Most frequent label; max() over the ordered list resolves ties in
        # favour of the nearest neighbour instead of arbitrary set ordering.
        output.append(max(Y_output, key=Y_output.count))
    return output

In [59]:
def EuclideanDistance_Accuracy(X_Validation_ED, Y_Validation, X_train_ED, Y_train):
    """Evaluate Euclidean-distance kNN on a validation set for every k in ``K``.

    Running hit counts are printed after each validation document and the
    accuracy for each k is printed at the end.

    Args:
        X_Validation_ED: term-count feature vectors to classify.
        Y_Validation: true labels aligned with ``X_Validation_ED``.
        X_train_ED: term-count training feature vectors.
        Y_train: labels aligned with ``X_train_ED``.

    Returns:
        list[float]: accuracy in percent, one entry per value of ``K``.
    """
    # One counter per k; a fixed three-element list breaks when K changes size.
    count = [0] * len(K)
    for i in range(len(X_Validation_ED)):
        result = EuclideanDistance(X_Validation_ED[i], X_train_ED, Y_train)

        for k in range(len(K)):
            if Y_Validation[i] == result[k] :
                count[k] += 1

        print(i,count)

    accuracies = [(hits/len(Y_Validation) * 100) for hits in count]
    for accuracy in accuracies:
        print("Accuracy is: ", accuracy)
    return accuracies

# HammingDistance(X_Validation_HD[0], X_train_HD, Y_train)
# print(Y_Validation[0])

# EuclideanDistance_Accuracy(np.array(X_Validation_ED), np.array(Y_Validation), np.array(X_train_ED), np.array(Y_train))

In [40]:
def IDF_SET(X_train_HD):
    """Compute a base-2 inverse document frequency for every feature column.

    Args:
        X_train_HD: 2-D binary presence matrix (documents x features); must
            provide a ``copy()`` method (list or numpy array).

    Returns:
        list[float]: ``log2(N / df)`` per feature, where any value that is not
        strictly positive is replaced by 0.0001.
    """
    floor_value = 0.0001
    total_documents = len(X_train_HD)
    # Column sums of the presence matrix = number of documents containing each term.
    document_frequency = np.sum(X_train_HD.copy(), axis=0)

    IDF = []
    for frequency in document_frequency:
        score = math.log2(total_documents/frequency)
        IDF.append(floor_value if score <= 0 else score)
    return IDF

In [41]:
def setWeightOfDocument(D, IDF):
    """Replace the non-zero counts of ``D`` with TF-IDF weights, in place.

    Each non-zero entry becomes ``(count / sum of all counts) * IDF[i]``; zero
    entries are left untouched.

    Args:
        D: mutable sequence of term counts for one document.
        IDF: per-feature inverse document frequencies aligned with ``D``.

    Returns:
        The same ``D`` object, after modification.
    """
    # Taken once up front so the total is not affected by the writes below.
    document_length = sum(D)
    for position, term_count in enumerate(D):
        if term_count != 0:
            D[position] = (term_count/document_length) * IDF[position]
    return D

def SetAllWeightOfSET(X_train, IDF):
    """Apply ``setWeightOfDocument`` to every row of ``X_train``.

    Args:
        X_train: iterable of mutable term-count rows; the rows are passed to
            ``setWeightOfDocument``, which rewrites them in place.
        IDF: per-feature inverse document frequencies.

    Returns:
        The same ``X_train`` object that was passed in.
    """
    for row in X_train:
        setWeightOfDocument(row, IDF)
    return X_train

In [11]:
# SetAllWeightOfSET requires the IDF vector as its second argument; calling it
# with only the count matrix raises a TypeError. The IDF is computed from the
# training presence matrix and reused for the validation set.
IDF = IDF_SET(X_train_HD)
X = SetAllWeightOfSET(deepcopy(X_train_ED), IDF)
V = SetAllWeightOfSET(deepcopy(X_Validation_ED), IDF)

In [None]:
def CosineTheta(D , Dt):
    """Return the cosine of the angle between vectors ``D`` and ``Dt``.

    Args:
        D: first vector (array-like of numbers).
        Dt: second vector, same length as ``D``.

    Returns:
        float: ``dot(D, Dt) / (|D| * |Dt|)``, or 0.0 when either vector has
        zero length (the ratio would otherwise be 0/0).
    """
    denominator = norm(D)*norm(Dt)
    if denominator == 0:
        # A zero vector has no direction; report "no similarity" rather than NaN.
        return 0.0
    return np.dot(D,Dt)/denominator

In [12]:
def CosineSimilarity(Dt, X_train, Y_train, k_values=None):
    """Classify ``Dt`` by k-nearest-neighbour voting under cosine similarity.

    Args:
        Dt: weighted feature vector of the document to classify.
        X_train: iterable of weighted training feature vectors.
        Y_train: labels aligned with ``X_train``.
        k_values: neighbourhood sizes to evaluate. Defaults to the
            notebook-level ``K`` list when omitted.

    Returns:
        list: one predicted label per entry of ``k_values``. When several
        labels share the highest vote count, the label whose neighbour is most
        similar to ``Dt`` wins.
    """
    if k_values is None:
        k_values = K

    Cosine_values = []
    for D in X_train:
        similarity = 1-distance.cosine(Dt,D)
        # The similarity can come back as NaN when one of the vectors is all
        # zeros (e.g. a document with no in-vocabulary term). NaN compares
        # false against everything and would make the sort order meaningless,
        # so treat it as "no similarity".
        if similarity != similarity:
            similarity = 0.0
        Cosine_values.append(similarity)

    # Ascending by similarity; the ranking does not depend on k, so sort once.
    ranked = sorted(range(len(Cosine_values)), key=Cosine_values.__getitem__)

    output = []
    for k in k_values:
        # Last k entries are the most similar; reverse so the best comes first.
        nearest_first = ranked[-k:][::-1]
        Y_output = [Y_train[index] for index in nearest_first]
        # max() over the ordered list resolves ties in favour of the most
        # similar neighbour instead of arbitrary set ordering.
        output.append(max(Y_output, key=Y_output.count))
    return output

In [39]:
def CosineValidation_Test(X_Validation_ED, Y_Validation, X_train_ED, Y_train):
    """Evaluate cosine-similarity kNN on a labelled set for every k in ``K``.

    Running hit counts are printed after each document and the accuracy for
    each k is printed at the end.

    Args:
        X_Validation_ED: weighted feature vectors to classify.
        Y_Validation: true labels aligned with ``X_Validation_ED``.
        X_train_ED: weighted training feature vectors.
        Y_train: labels aligned with ``X_train_ED``.

    Returns:
        list[float]: accuracy in percent, one entry per value of ``K``.
    """
    # One counter per k; a fixed three-element list breaks when K changes size.
    count = [0] * len(K)
    for i in range(len(X_Validation_ED)):
        result = CosineSimilarity(X_Validation_ED[i], X_train_ED, Y_train)

        for k in range(len(K)):
            if Y_Validation[i] == result[k]:
                count[k] += 1

        print(i, count)

    accuracies = [(hits/len(Y_Validation) * 100) for hits in count]
    for accuracy in accuracies:
        print("Accuracy is: ", accuracy)

    # Returned so that callers which record the score receive a value, not None.
    return accuracies

# Score the TF-IDF weighted validation set (V) against the weighted training set (X).
CosineValidation_Test(
    np.array(V),
    deepcopy(np.array(Y_Validation)),
    np.array(X),
    deepcopy(np.array(Y_train)),
)

In [66]:
def denominatorOfNB(Dt,alpha):
    """Sum ``Prob_DT_CM(Dt, m, alpha)`` over every class index in ``TopicName``.

    Args:
        Dt: feature vector of the document.
        alpha: smoothing factor forwarded to ``Prob_DT_CM``.

    Returns:
        The accumulated sum of the per-class values.
    """
    summation = 0
    for topic_index, _ in enumerate(TopicName):
        summation = summation + Prob_DT_CM(Dt, topic_index, alpha)
    return summation

In [67]:
def Prob_Wj_Cm(word, m, alpha):
    """Smoothed estimate of ``word`` under class ``m``.

    Computes ``(occurrences + alpha) / (size + alpha * len(FeatureSpace))``,
    where ``occurrences`` is the number of times ``word`` appears across the
    token lists in ``CM_list[m]`` and ``size`` is ``len(CM_list[m])``.

    Args:
        word: vocabulary term.
        m: class index into ``CM_list``.
        alpha: additive smoothing factor.

    Returns:
        float: the smoothed ratio. A diagnostic line is printed if it is 0.
    """
    class_documents = CM_list[m]

    # NOTE(review): this is the number of token lists (documents) in the class,
    # not the number of tokens, although the estimate is count-based — confirm
    # which denominator is intended.
    class_size = len(class_documents)

    occurrences = sum(tokens.count(word) for tokens in class_documents)

    value = (occurrences + alpha)/ (class_size + alpha * len(FeatureSpace))

    if value == 0:
        print(m, "e jhamela ache ", value, "->", word)

    return value

In [68]:
def Prob_DT_CM(Dt, m, alpha):
    """Multiply ``Prob_Wj_Cm`` over every feature that is non-zero in ``Dt``.

    Args:
        Dt: feature vector aligned with ``FeatureSpace``.
        m: class index forwarded to ``Prob_Wj_Cm``.
        alpha: smoothing factor forwarded to ``Prob_Wj_Cm``.

    Returns:
        float: the product, starting from 1.0 (returned unchanged when ``Dt``
        has no non-zero entry).
    """
    likelihood = 1.0
    for index, feature_value in enumerate(Dt):
        if feature_value == 0:
            continue
        likelihood *= Prob_Wj_Cm(FeatureSpace[index], m, alpha)
    return likelihood

In [69]:
def NaiveBayes(Dt, m, denom, alpha):
    """Score class ``m`` for document ``Dt``.

    The score is ``Prob_DT_CM(Dt, m, alpha)`` times a uniform prior
    ``1 / len(CM_list)``, divided by ``denom + alpha * len(FeatureSpace)``.

    Args:
        Dt: feature vector of the document.
        m: class index.
        denom: value produced by ``denominatorOfNB`` for the same document.
        alpha: smoothing factor.

    Returns:
        float: the class score.
    """
    likelihood = Prob_DT_CM(Dt,m,alpha)
    uniform_prior = 1/len(CM_list)
    normaliser = denom + alpha*len(FeatureSpace)
    return (likelihood * uniform_prior)/ normaliser

In [70]:
def NB_precision(X_Validation_HD, Y_Validation, alpha):
    """Classify every document with ``NaiveBayes`` and report the hit rate.

    For each document the class with the highest ``NaiveBayes`` score is
    chosen (first one on ties) and compared with the true label. The running
    hit count is printed after every document.

    Args:
        X_Validation_HD: feature vectors to classify.
        Y_Validation: true labels aligned with ``X_Validation_HD``.
        alpha: smoothing factor forwarded to the Naive Bayes helpers.

    Returns:
        float: percentage of documents whose predicted label matched.
    """
    correct = 0
    for row_index in range(len(X_Validation_HD)):
        features = X_Validation_HD[row_index]
        denom = denominatorOfNB(features, alpha)

        Probability = [
            NaiveBayes(features, class_index, denom, alpha)
            for class_index in range(len(CM_list))
        ]

        best_class = Probability.index(max(Probability))
        # TopicName entries are one-element lists; [0] extracts the label.
        if TopicName[best_class][0] == Y_Validation[row_index]:
            correct += 1
        print(correct)

    return ((correct/len(Y_Validation)) * 100)

In [63]:
def getTestValue(itr, Test_set, FeatureSpace, batch_size=10):
    """Build the feature vectors for one batch of test documents.

    For every topic in the notebook-level ``inputs`` list, documents
    ``itr * batch_size`` up to (not including) ``(itr + 1) * batch_size`` of
    that topic are taken from ``Test_set`` and vectorized over
    ``FeatureSpace``.

    Args:
        itr: zero-based batch number.
        Test_set: iterable of single-entry ``{topic: tokens}`` dicts.
        FeatureSpace: ordered list of vocabulary terms.
        batch_size: documents taken per topic and batch (default 10).

    Returns:
        tuple: ``(X_test_HD, Y_test, X_test_ED)`` — binary presence vectors,
        labels, and term-count vectors, as plain Python lists.
    """
    X_test_HD = []
    X_test_ED = []
    Y_test = []

    start = itr * batch_size
    Test_Final = []
    for input_ in inputs:
        topic_documents = [Test for Test in Test_set if list(Test.keys())[0] == input_]
        Test_Final += topic_documents[start:start + batch_size]

    for document in Test_Final:
        Y_test.append(list(document.keys())[0])
        tokens = list(document.values())[0]

        # One pass over the tokens, instead of an "in" test plus a .count()
        # scan of the token list for every vocabulary entry.
        frequency = {}
        for token in tokens:
            frequency[token] = frequency.get(token, 0) + 1

        ED = [frequency.get(term, 0) for term in FeatureSpace]
        HD = [1 if occurrences else 0 for occurrences in ED]

        X_test_HD.append(HD)
        X_test_ED.append(ED)

    return X_test_HD, Y_test, X_test_ED

In [47]:
def comparison_NB_CS(Test_set,FeatureSpace,X_test_HD,Y_test, X_test_ED, X, Y_train):
    """Score Naive Bayes and cosine kNN on 50 test batches and log both scores.

    Each iteration draws one batch with ``getTestValue``, evaluates
    ``NB_precision`` (alpha = 0.005) and ``CosineValidation_Test`` on it, and
    appends one accuracy per line to ``NaiveBayes.txt`` and ``CosineSim.txt``.

    Args:
        Test_set: iterable of single-entry ``{topic: tokens}`` dicts.
        FeatureSpace: ordered list of vocabulary terms.
        X_test_HD, Y_test, X_test_ED: kept for backward compatibility; they
            are overwritten by ``getTestValue`` on every iteration.
        X: TF-IDF weighted training vectors.
        Y_train: labels aligned with ``X``.

    Raises:
        TypeError: if the cosine evaluation does not yield a single number.
    """
    for iteration in range(50):
        X_test_HD, Y_test, X_test_ED = getTestValue(iteration, Test_set, FeatureSpace)
        print("TestData extracted")

        accuracy_NB = NB_precision(X_test_HD, Y_test, 0.005)
        print("Done iteration NB", iteration, accuracy_NB)

        # SetAllWeightOfSET needs the IDF vector as its second argument; the
        # notebook-level IDF computed from the training set is reused here.
        X_TEST = SetAllWeightOfSET(deepcopy(X_test_ED), IDF)
        print("X_Test weight set")
        accuracy_CS = CosineValidation_Test(X_TEST, deepcopy(Y_test), X, deepcopy(Y_train))

        # The score files hold one float per line. Accept a one-element result
        # (K restricted to the chosen k) and refuse anything else rather than
        # writing an unparsable line.
        if isinstance(accuracy_CS, (list, tuple)) and len(accuracy_CS) == 1:
            accuracy_CS = accuracy_CS[0]
        if not isinstance(accuracy_CS, (int, float)):
            raise TypeError(
                "CosineValidation_Test must yield a single accuracy "
                "(set K to one value); got %r" % (accuracy_CS,)
            )
        print("Done iteration CS", iteration, accuracy_CS)

        # Both files are written only after both scores exist, so the lines of
        # the two files stay paired even if an evaluation fails part-way.
        with open('NaiveBayes.txt', 'a') as f:
            f.write("%s\n" %accuracy_NB)
        with open('CosineSim.txt', 'a') as f2:
            f2.write("%s\n" %accuracy_CS)

In [50]:
def _read_scores(path):
    """Parse one float per non-blank line of the text file at ``path``."""
    scores = []
    with open(path, 'r') as handle:
        for line in handle:
            # strip() removes only surrounding whitespace; slicing off the last
            # character would truncate a final line with no trailing newline.
            text = line.strip()
            if text:
                scores.append(float(text))
    return scores


def calculate_TSTAT():
    """Run a paired t-test on the scores logged by the two classifiers.

    Reads ``NaiveBayes.txt`` and ``CosineSim.txt`` (one accuracy per line),
    prints both means, the t statistic and p-value from
    ``scipy.stats.ttest_rel``, and the decision at significance levels 0.005,
    0.01 and 0.05.
    """
    NB = _read_scores('NaiveBayes.txt')
    CS = _read_scores('CosineSim.txt')

    print("Average value of Naive Bayes :", sum(NB)/len(NB))
    print("Average value of Cosine Simulation : ",sum(CS)/len(CS))

    significance_level = [0.005, 0.01, 0.05]
    t_stat, p_val = stats.ttest_rel(NB,CS)
    print("t-stat value = ",t_stat, "and p_value =", p_val)
    for alpha in significance_level:
        print("When alpha = ",alpha)
        if p_val > alpha:
            print('Accept null hypothesis that the means are equal.')
        else:
            print('Reject the null hypothesis that the means are equal.')
            
            

In [72]:
# Full pipeline driver. Every step except the final calculate_TSTAT() call is
# commented out, so on a fresh kernel this cell only works if NaiveBayes.txt and
# CosineSim.txt already exist on disk. Uncomment the steps below, in order, to
# rebuild the data sets, features and score files.
# NOTE(review): the commented comparison_NB_CS call passes X_test_HD, Y_test and
# X_test_ED, which no visible cell defines — confirm before re-enabling it.
# inputs = inputFiles()
# Training_set, Validation_set, Test_set = set_dataSets(inputs)
# FeatureSpace = featureSpace_Set(Training_set)
# X_train_ED, X_train_HD, Y_train, X_Validation_ED, X_Validation_HD, Y_Validation = documentsSet(Training_set,Validation_set, FeatureSpace)
# K = [1,3,5]
# #for Hamming distance, k=1,3,5
# HammingDistance_Accuracy(np.array(X_Validation_HD), np.array(Y_Validation), np.array(X_train_HD), np.array(Y_train))
# #for Euclidean distance, k=1,3,5
# EuclideanDistance_Accuracy(np.array(X_Validation_ED), np.array(Y_Validation), np.array(X_train_ED), np.array(Y_train))


# # for cosine
# IDF = IDF_SET(X_train_HD)
# X = SetAllWeightOfSET(deepcopy(X_train_ED), IDF)
# V = SetAllWeightOfSET(deepcopy(X_Validation_ED), IDF)

# CosineValidation_Test(np.array(V), deepcopy(np.array(Y_Validation)), np.array(X), deepcopy(np.array(Y_train)))

# Alpha_val = [1.5, 1, 0.5, 0.2, 0.1, 0.07, 0.05, 0.01, 0.005, 0.0001] 

# TopicName = []
# x=[d.keys() for d in Training_set]
# for keys in x:
#     if list(keys) not in TopicName:
#         TopicName.append(list(keys))

# CM_list = []

# for j in range(len(TopicName)):
#     CM_list.append([])
#     for i in range(len(Training_set)):
#         if list(Training_set[i].keys()) == TopicName[j]:
#             y = list(Training_set[i].values())[0]
#             CM_list[j].append(y)

# for alpha in Alpha_val:
#     print(NB_precision(X_Validation_HD, Y_Validation, alpha))
#     print(alpha)
#     print("--------------------------")

# comparison_NB_CS(Test_set,FeatureSpace,X_test_HD,Y_test, X_test_ED, X, Y_train)

# Paired t-test over the scores already stored in NaiveBayes.txt / CosineSim.txt.
calculate_TSTAT()

Average value of Naive Bayes : 83.20000000000002
Average value of Cosine Simulation :  81.50909090909092
t-stat value =  3.3134598364347445 and p_value = 0.0017378416481416729
When alpha =  0.005
Reject the null hypothesis that the means are equal.
When alpha =  0.01
Reject the null hypothesis that the means are equal.
When alpha =  0.05
Reject the null hypothesis that the means are equal.
