In [1]:
from os import listdir
from os.path import isfile, join
import string
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import re

In [2]:
my_path = '20_newsgroups'

#creating a list of folder names to make valid pathnames later
folders = [f for f in listdir(my_path)]
folders

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
# Creating a 2D list to store list of all files in different folders

files = []
for folder_name in folders:
    folder_path = join(my_path, folder_name)
    files.append([f for f in listdir(folder_path)])

In [4]:
# Creating a list containig the path to all the files

pathname_list = []
for fo in range(len(folders)):
    for fi in files[fo]:
        pathname_list.append(join(my_path, join(folders[fo], fi)))

len(pathname_list)

19997

In [5]:
# All the files have irrelevant metadata at the begining
# This will remove the metadata to improve accuraccy

def remove_metadata(lines):
    for i in range(len(lines)):
        if(lines[i] == '\n'):
            start = i+1
            break
    new_lines = lines[start:]
    return new_lines

In [6]:
X =[]  # A list of doc_data
Y = [] # A list of the category of doc
for path in pathname_list :
    with open(path,'r') as f:
        category=path.split('\\')[1]
        new_lines = remove_metadata(f.readlines())
        st=''
        for line in new_lines:
            for i in line:
                st+=i
        X.append(st)
        Y.append(category)

In [7]:
# train test split
x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=0)

In [8]:
print(len(x_train))
print(len(x_test))

14997
5000


In [9]:
from nltk.corpus import stopwords
stop_word=stopwords.words('english') # Common stop word

In [10]:
# adding the commonly occuring word in the files to stop words
stop_word+= ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at',
 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 
 'can', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during',
 'each', 'few', 'for', 'from', 'further', 
 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself',
 "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself',
 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours' 'ourselves', 'out', 'over', 'own',
 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 
 'than', 'that',"that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", 
 "they'll", "they're", "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 
 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
 "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",'will', 'with', "won't", 'would', "wouldn't", 
 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves', 
 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'hundred', 'thousand', '1st', '2nd', '3rd',
 '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [11]:
dic={} # Dictionary storing all  words and its count
for i in range(len(x_train)):
    lower_word=x_train[i].lower()
    splited_word=re.split(r'\W+',lower_word)
    for s in splited_word:
        if not(s.isalpha()) or s in stop_word or len(s)<=2:
            continue
        if s in dic:
            dic[s]+=1
        else:
            dic[s]=1
    dic

In [12]:
sorted_dic = sorted(dic.items(), key=lambda kv: kv[1],reverse=True)

In [13]:
len(sorted_dic)

89142

In [14]:
freq= []
wrds=[]
for i in sorted_dic:
    wrds.append(i[0])
    freq.append(i[1])

### Ploting graph to determine number of features

In [15]:
f_o_w = []  # frequency of words
n_o_w = []  # num of words
for f in sorted(np.unique(freq), reverse=True):
    f_o_w.append(f)
    n_o_w.append(freq.count(f))

In [16]:
pd.DataFrame(np.column_stack((n_o_w,f_o_w)))

Unnamed: 0,0,1
0,1,16638
1,1,11009
2,1,9936
3,1,9411
4,1,7646
5,1,7542
6,1,6612
7,1,6339
8,1,5910
9,1,5581


In [17]:
import matplotlib.pyplot as plt
y = f_o_w
x = n_o_w
plt.ylim(0,50)
plt.xlim(0,20000)
plt.xlabel("No. of words")
plt.ylabel("Freq. of words")
plt.plot(x, y)
plt.grid()
plt.show()

<Figure size 640x480 with 1 Axes>

### Choosing the feature based on frequency-Number plot

In [18]:
# finding total number of words with frequency greater than 50:
n_features = sum(n_o_w[:f_o_w.index(50)])
features = wrds[0:n_features]
print(features)



In [19]:
# Making X_train with the chosen features
x=np.zeros((len(x_train),len(features)))
for i in range(len(x_train)):
    lower_word=x_train[i].lower()
    splited_word=re.split(r'\W+',lower_word)
    for j in splited_word:
        if j in features:
            x[i][features.index(j)] +=1

In [20]:
x_train=x
x_train.shape

(14997, 5790)

In [21]:
# Making X_test with the chosen features
x=np.zeros((len(x_test),len(features)))
for i in range(len(x_test)):
    lower_word=x_test[i].lower()
    splited_word=re.split(r'\W+',lower_word)
    for j in splited_word:
        if j in features:
            x[i][features.index(j)] +=1

In [22]:
x_test=x
x_test.shape

(5000, 5790)

In [23]:
y_train=np.array(y_train)
y_test=np.array(y_test)

In [24]:
y_train.shape,y_test.shape

((14997,), (5000,))

## Text Classification 
### performing Text Classification using sklearn's Multinomial Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train )

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
y_predict = clf.predict(x_test)

#### Testing Score

In [27]:
clf.score(x_test, y_test)

0.7854

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, y_predict))

                          precision    recall  f1-score   support

             alt.atheism       0.67      0.77      0.72       233
           comp.graphics       0.61      0.69      0.65       253
 comp.os.ms-windows.misc       0.76      0.65      0.70       249
comp.sys.ibm.pc.hardware       0.62      0.73      0.67       240
   comp.sys.mac.hardware       0.71      0.81      0.76       236
          comp.windows.x       0.83      0.71      0.77       240
            misc.forsale       0.81      0.75      0.78       261
               rec.autos       0.84      0.83      0.84       269
         rec.motorcycles       0.86      0.93      0.89       284
      rec.sport.baseball       0.90      0.91      0.90       248
        rec.sport.hockey       0.89      0.95      0.92       231
               sci.crypt       0.92      0.89      0.91       233
         sci.electronics       0.82      0.72      0.77       244
                 sci.med       0.91      0.88      0.90       256
         

In [29]:
conf_mat= pd.DataFrame(data=confusion_matrix(y_test, y_predict),index=folders)

conf_mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
alt.atheism,179,2,0,0,0,0,0,1,3,1,0,0,0,0,0,14,2,1,5,25
comp.graphics,1,174,17,25,8,7,2,1,0,1,0,3,0,3,6,1,0,0,1,3
comp.os.ms-windows.misc,0,20,162,26,9,19,2,0,0,1,2,1,2,0,3,0,0,0,2,0
comp.sys.ibm.pc.hardware,1,7,11,176,24,0,11,4,1,0,0,0,5,0,0,0,0,0,0,0
comp.sys.mac.hardware,0,4,2,18,190,0,8,3,1,0,2,0,4,2,2,0,0,0,0,0
comp.windows.x,0,36,11,5,3,171,3,1,1,0,1,2,1,0,2,0,0,0,3,0
misc.forsale,0,3,1,12,9,1,196,7,6,2,7,3,10,0,2,0,0,0,2,0
rec.autos,0,2,1,0,3,1,8,224,15,2,1,0,6,0,2,0,3,0,0,1
rec.motorcycles,0,1,0,1,2,0,2,7,265,0,1,0,2,1,0,0,1,0,1,0
rec.sport.baseball,0,2,2,0,0,1,3,2,2,225,5,0,0,1,0,1,2,0,2,0


#### Training score

In [30]:
y_predict_tr = clf.predict(x_train)

In [31]:
clf.score(x_train, y_train)

0.8539707941588318

In [32]:
print(classification_report(y_train, y_predict_tr))

                          precision    recall  f1-score   support

             alt.atheism       0.75      0.86      0.80       767
           comp.graphics       0.71      0.80      0.75       747
 comp.os.ms-windows.misc       0.88      0.77      0.82       751
comp.sys.ibm.pc.hardware       0.76      0.84      0.79       760
   comp.sys.mac.hardware       0.79      0.90      0.84       764
          comp.windows.x       0.90      0.82      0.86       760
            misc.forsale       0.88      0.85      0.87       739
               rec.autos       0.90      0.90      0.90       731
         rec.motorcycles       0.88      0.96      0.92       716
      rec.sport.baseball       0.95      0.95      0.95       752
        rec.sport.hockey       0.93      0.96      0.94       769
               sci.crypt       0.94      0.92      0.93       767
         sci.electronics       0.87      0.81      0.84       756
                 sci.med       0.95      0.89      0.92       744
         

In [33]:
conf_mat= pd.DataFrame(data=confusion_matrix(y_train, y_predict_tr),index=folders)

conf_mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
alt.atheism,663,2,0,0,0,0,0,1,4,3,1,1,1,2,3,25,2,6,6,47
comp.graphics,1,597,22,34,29,22,5,2,2,1,4,4,7,5,8,0,1,0,1,2
comp.os.ms-windows.misc,1,38,581,60,12,22,9,5,1,3,7,3,3,0,0,0,0,1,1,4
comp.sys.ibm.pc.hardware,0,13,20,635,45,5,13,3,3,1,3,2,14,0,2,0,0,1,0,0
comp.sys.mac.hardware,1,10,11,22,688,4,11,0,1,0,2,1,8,0,4,0,0,0,1,0
comp.windows.x,0,71,11,13,7,623,4,1,3,2,5,2,3,1,7,1,2,0,2,2
misc.forsale,0,4,5,27,22,0,631,16,2,0,7,3,7,2,6,1,5,0,1,0
rec.autos,2,4,0,4,2,1,13,661,20,4,2,1,9,2,1,1,2,0,2,0
rec.motorcycles,1,2,0,0,2,1,7,9,685,1,3,0,2,0,0,0,1,0,1,1
rec.sport.baseball,0,4,0,1,3,3,4,3,7,712,13,0,1,0,0,0,1,0,0,0


### Performing Text Classification using my implementation Multinomial Naive Bayes

In [34]:
# making dictionary for  Multinomial Naive Bayes implementation
def fit(X_train, Y_train):
    result = {}
    classes, counts = np.unique(Y_train, return_counts=True)
    
    for i in range(len(classes)):
        curr_class = classes[i]
        
        result["TOTAL_DATA"] = len(Y_train)
        result[curr_class] = {}
        
        X_tr_curr = X_train[Y_train == curr_class]
        
        num_features = n_features
        
        for j in range(num_features):
            result[curr_class][features[j]] = X_tr_curr[:,j].sum() 
                
        result[curr_class]["TOTAL_COUNT"] = counts[i]
    
    return result

In [35]:
def  log_probablity(dictionary_train, x, curr_class):
    prob=np.log(dictionary_train[curr_class]["TOTAL_COUNT"]) - np.log(dictionary_train["TOTAL_DATA"])
    feature=list(dictionary_train[curr_class].keys()) 
    for j in range (len(feature)-2):
        xj=x[j]
        if xj==0:
            current_prob=0
        else:
            count_curr_class_equal_xj = dictionary_train[curr_class][feature[j]]+1
            # -2 in count_curr_class because we have "TOTAL_COUNT" and "TOTAL_DATA"" in the dictionary
            count_curr_class = dictionary_train[curr_class]["TOTAL_COUNT"]+len(dictionary_train[curr_class].keys())-2
            current_prob = np.log(count_curr_class_equal_xj)-np.log(count_curr_class)
        prob+=current_prob
    return prob

In [36]:
def predictSinglePoint(dictionary_train, x):
    classes = dictionary_train.keys()
    best_p = -10000
    best_class = -1
    for curr_class in classes:
        if(curr_class == "TOTAL_DATA"):
            continue
        p_curr_class = log_probablity(dictionary_train, x, curr_class)
        if(p_curr_class > best_p):
            best_p = p_curr_class
            best_class = curr_class
            
    return best_class

In [37]:
def predict(dictionary_train, X_test):
    Y_pred = []
    for x in X_test:
        y_predicted = predictSinglePoint(dictionary_train, x)
        Y_pred.append(y_predicted)

    return Y_pred

In [38]:
train_dictionary = fit(x_train, y_train)

In [39]:
y_pred=predict(train_dictionary, x_test)

#### testing score

In [40]:
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.69      0.70      0.70       233
           comp.graphics       0.51      0.61      0.55       253
 comp.os.ms-windows.misc       0.90      0.28      0.42       249
comp.sys.ibm.pc.hardware       0.63      0.58      0.61       240
   comp.sys.mac.hardware       0.93      0.36      0.52       236
          comp.windows.x       0.50      0.82      0.62       240
            misc.forsale       0.84      0.32      0.46       261
               rec.autos       0.82      0.42      0.55       269
         rec.motorcycles       0.98      0.42      0.59       284
      rec.sport.baseball       0.97      0.63      0.76       248
        rec.sport.hockey       0.85      0.88      0.86       231
               sci.crypt       0.56      0.86      0.67       233
         sci.electronics       0.76      0.35      0.48       244
                 sci.med       0.91      0.68      0.78       256
         

In [41]:
conf_mat= pd.DataFrame(data=confusion_matrix(y_test, y_pred),index=folders)

conf_mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
alt.atheism,164,0,0,0,0,0,0,0,0,0,0,1,0,0,0,17,0,13,17,21
comp.graphics,1,155,3,9,0,39,0,0,0,0,2,12,0,0,5,4,0,5,14,4
comp.os.ms-windows.misc,2,35,69,12,0,83,0,0,0,0,2,15,1,0,6,0,1,1,22,0
comp.sys.ibm.pc.hardware,2,21,4,140,1,19,5,0,0,0,0,24,4,1,1,0,1,2,15,0
comp.sys.mac.hardware,1,21,0,28,84,26,3,0,1,0,2,18,7,4,2,1,0,2,35,1
comp.windows.x,0,18,0,1,0,196,0,0,0,0,1,7,0,0,1,0,0,3,13,0
misc.forsale,0,19,1,24,5,9,83,10,0,0,6,17,9,1,9,2,4,11,51,0
rec.autos,1,2,0,0,0,2,6,112,0,1,2,3,2,2,5,0,5,22,103,1
rec.motorcycles,2,0,0,1,0,1,1,14,119,0,0,4,3,3,1,2,4,26,100,3
rec.sport.baseball,2,2,0,0,0,1,0,0,0,156,13,2,0,3,0,1,4,14,50,0


In [43]:
accuracy_score(y_test, y_pred)

0.605