In [1]:
import numpy as np
import math
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
from bs4 import BeautifulSoup as bs

In [2]:
inputs = []
with open('Data/topics.txt','r',encoding='utf-8') as file:
    for line in file:
        line = line[:-1]
        inputs.append(line)
    print(inputs)

['Coffee', 'Arduino', 'Anime']


In [3]:
def Text_Preprocessing(input_type):
    input_file = "Data/Training/"+ input_type + ".xml"
    array = []
    with open(input_file,'r',encoding='utf-8') as file:
        content = file.read()
        soup = bs(content)
        for items in soup.findAll("row"):
            Document = dict()
            body = items['body']

            # removing tags and numbers using regex        
#             body = re.sub(r'<[^>]*>','', body)
            body = re.sub(r'[-+]?\d+', '', body)
            body = re.sub(r'[^\x00-\x7F]', ' ', body)

            #Lowercase the text
            body = body.lower()

            #Remove punctuations
            body = body.translate((str.maketrans('','',string.punctuation)))

            #Tokenize
            body = word_tokenize(body)

            #Remove stopwords
            stop_words = set(stopwords.words('english'))
            body = [word for word in body if not word in stop_words]

            #Lemmatize tokens
            lemmatizer = WordNetLemmatizer()
            body = [lemmatizer.lemmatize(word) for word in body]

            #Stemming tokens
            stemmer= PorterStemmer()
            body = [stemmer.stem(word) for word in body]
            
            if body == "" : 
                print("blank")
            
            Document[input_type] = body
            
            array.append(Document)
            #print(body, "\n\n\n")
        #print(array)
        return array

In [None]:
Training_set = []
Validation_set = []
Test_set = []
for input_file in inputs:
    array = Text_Preprocessing(input_file)
    Training_set += array[:500]
    Validation_set += array[500:500+200]
    Test_set += array[700:1200]

In [None]:
# print(len(Training_set))
# print(len(Validation_set))
# print(len(Test_set))
# print(Training_set)

In [None]:
FeatureSpace = []
for document in Training_set:
    temp = list(document.values())
    for i in temp[0]:
        if i not in FeatureSpace:
            FeatureSpace.append(i)
print(len(FeatureSpace))
# print(FeatureSpace)

In [None]:
Y_train = []
X_train_HD = []
X_train_ED = []
X_test_HD = []
X_test_ED = []
Y_test = []
for i in range(len(Training_set)):
    Y_train.append(list(Training_set[i].keys())[0])
    temp = list(Training_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_train_HD.append(HD)
    X_train_ED.append(ED)

for i in range(len(Test_set)):
    Y_test.append(list(Test_set[i].keys())[0])
    temp = list(Test_set[i].values())[0]
    HD = []
    ED = []
    for i in range(len(FeatureSpace)):
        if FeatureSpace[i] in temp:
            HD.append(1)
        else:
            HD.append(0)
        
        ED.append(temp.count(FeatureSpace[i]))
        
    X_test_HD.append(HD)
    X_test_ED.append(ED)

In [None]:
for documents in X_train_ED:
    if sum(documents) == 0:
        index_ = X_train_ED.index(documents)
        del(X_train_ED[index_])
        del(Y_train[index_])
        del(X_train_HD[index_])
        
print(len(X_train_ED), len(X_train_HD), len(Y_train) )

In [None]:
def getDistance(Dt,D1):
    count = 0
    for i in range(len(D1)):
        if D1[i] != Dt[i]:
            count += 1
    return count

In [None]:
def HammingDistance(Dt, X_train, Y_train, k=5):
    Hamming_D = []
    for i in range(len(X_train)):
        Hamming_D.append(getDistance(Dt, X_train[i]))
    indices = sorted(range(len(Hamming_D)), key = lambda sub: Hamming_D[sub])[:k] 
    Y_output = []
    for i in indices:
        Y_output.append(Y_train[i])   
    output = max(set(Y_output), key = Y_output.count) 
    print(output)

In [None]:
HammingDistance(X_train_HD[1300], X_train_HD, Y_train)
print(Y_train[1300])

In [None]:
def CalculateED(Dt,D):
    value = 0
    for i in range(len(D)):
        value += pow(Dt[i] - D[i], 2)
    value = math.sqrt(value)
    return value

In [None]:
def EuclideanDistance(Dt, X_train, Y_train, k=5):
    Euclidean_D = []
    for i in range(len(X_train)):
        Euclidean_D.append(CalculateED(Dt, X_train[i]))
    
    #find k minimum indices
    indices = sorted(range(len(Euclidean_D)), key = lambda sub: Euclidean_D[sub])[:k] 
    Y_output = []
    for i in indices:
        Y_output.append(Y_train[i])   
    
    #find higher frequency result
    output = max(set(Y_output), key = Y_output.count) 
    print(output)

In [None]:
EuclideanDistance(X_test_ED[1300], X_train_ED, Y_train)
print(Y_test[1300])

In [None]:
number_of_documents = len(X_train_ED)
# d_word = np.sum(X_train_HD, axis=0)
# print(d_word)
# print(d_word[20])
# np.argmax(d_word)
# number_of_documents
# print(len(FeatureSpace))
# sum(X_train_ED[1500])
Training_set[1102]

In [129]:
TF = []  #TF[d,w]
IDF = []
for i in range(len(X_train_ED)):
    TF.append([])
    IDF.append([])
    TotalWords = sum(X_train_ED[i])
    for words in X_train_ED[i]:
        TF[i].append(words/TotalWords)
        IDF[i].append(number_of_documents/)
    

13 0
39 1
47 2
48 3
43 4
12 5
78 6
130 7
26 8
46 9
37 10
26 11
18 12
46 13
116 14
28 15
84 16
98 17
36 18
28 19
51 20
13 21
388 22
52 23
34 24
49 25
141 26
67 27
76 28
31 29
113 30
46 31
19 32
35 33
474 34
71 35
76 36
61 37
13 38
14 39
46 40
39 41
26 42
9 43
110 44
108 45
103 46
37 47
10 48
36 49
39 50
70 51
53 52
134 53
77 54
20 55
29 56
84 57
88 58
38 59
52 60
128 61
50 62
90 63
125 64
30 65
25 66
157 67
50 68
59 69
61 70
60 71
55 72
55 73
73 74
200 75
20 76
79 77
33 78
177 79
34 80
21 81
63 82
57 83
53 84
22 85
83 86
93 87
43 88
53 89
207 90
30 91
126 92
20 93
29 94
97 95
21 96
37 97
35 98
11 99
92 100
36 101
60 102
55 103
40 104
22 105
23 106
80 107
163 108
18 109
142 110
102 111
53 112
48 113
66 114
671 115
41 116
43 117
47 118
14 119
131 120
52 121
30 122
14 123
66 124
34 125
102 126
30 127
51 128
15 129
47 130
35 131
31 132
61 133
51 134
35 135
33 136
61 137
95 138
56 139
93 140
124 141
50 142
27 143
190 144
59 145
22 146
101 147
31 148
93 149
63 150
28 151
15 152
67 153
110 154

39 1161
37 1162
82 1163
101 1164
75 1165
77 1166
15 1167
21 1168
100 1169
14 1170
62 1171
55 1172
28 1173
9 1174
33 1175
7 1176
23 1177
29 1178
74 1179
24 1180
51 1181
29 1182
27 1183
50 1184
9 1185
19 1186
37 1187
19 1188
40 1189
29 1190
27 1191
97 1192
69 1193
97 1194
58 1195
507 1196
51 1197
456 1198
237 1199
28 1200
64 1201
14 1202
65 1203
25 1204
11 1205
29 1206
75 1207
69 1208
28 1209
114 1210
20 1211
93 1212
18 1213
52 1214
327 1215
66 1216
165 1217
23 1218
99 1219
17 1220
102 1221
16 1222
73 1223
14 1224
149 1225
20 1226
61 1227
17 1228
81 1229
14 1230
32 1231
220 1232
31 1233
102 1234
17 1235
27 1236
12 1237
33 1238
196 1239
36 1240
99 1241
65 1242
107 1243
176 1244
38 1245
131 1246
48 1247
39 1248
49 1249
135 1250
62 1251
32 1252
89 1253
50 1254
0 1255


ZeroDivisionError: division by zero

In [17]:
def CosineTheta(D , Dt):
    D,Dt = SetWeight(D, Dt)
    dotProduct = np.dot(D,Dt)
    length_D = math.sqrt(dotproduct(D, D))
    length_Dt = math.sqrt(dotproduct(Dt, Dt))
    return dotProduct/(length_D*length_Dt)