In [1]:
import numpy as np
import pandas as pd
import math as ma
import itertools                                            #this is to slice the dictionary to get only max frequecvy values
from sklearn import datasets
from nltk.corpus import stopwords                           #to get list of stopwords
from sklearn import model_selection
from nltk.tokenize import word_tokenize                     #used in removing stopwords from data
stop_words = set(stopwords.words('english'))



In [2]:
def probability(dictionary,x,clas):                            # it returns the actual probability of input x over class clas
    
    count=ma.log(dictionary[clas]["count"])-ma.log(dictionary["total"])                      # it is probability of(y=class(clas))
    features_number=len(dictionary[clas].keys())-2                                           #total number of features
    for j in range(features_number):          # calculting the probabilty over each feature the later we will take log() sum of all            
        if(x[j]==0):                           #if input x have zero frequency over the feature so its probabiluty will not counted
            continue
        count_xj_in_feature_j=dictionary[clas][j]+1                        # it is the total frequency of feature j in class->clas
        count_clas_ele_in_feature=dictionary[clas]["Grand_total"]+ features_number     # it is total number of words in class->clas
        p=ma.log(count_xj_in_feature_j)-ma.log(count_clas_ele_in_feature)          #summing all small probabilities of all features
        count=count+p
    return count                                                                   # returning the probabilty
        

In [3]:
def singlecol(dictionary,x):        #singlecol gives the prediction(output) of single colum at a time
    
    best_prob=-1000                # giving any value to initialise best_prob
    best_cls=-1                    # giving any value to initialise best_cls
    classes=dictionary.keys()      #dictionary .keys have all the classes names
    val=True
    for clas in classes:           # checking probabily on one class at a time 
        if clas=="total":          # total is not a class so ignore it
            continue
        clas_p=probability(dictionary,x,clas)        # clas_p will have probability of input x for class clas
        if(val or clas_p>best_prob):
            best_prob=clas_p
            best_cls=clas
        val=False
    return best_cls                 #returns the best_cls with maximum probability

In [4]:

def predict(dictionary,xx_test):         # to obtain output list
    
    lst=[]
    for x in xx_test:                    #going through test_data row wise
        pred=singlecol(dictionary,x)     # as we get a answer by one column we are appending it to list
        lst.append(pred)
    return lst
    
    
    

In [5]:
def fit(xx_train,y_train,features):              # this function is to train algorithm over training data
    result={}                                    #we will use dictionary and create nested dictionary where needed
    classes=set(y_train)
    
    for current_class in classes:                # acceccing all classes one by one 
        result[current_class]={}
        result["total"]=len(xx_train)            # it will hold length of entire xx_train set
        current_class_rows=(y_train==current_class)            #fetching only current_class colums
        x_train_current=xx_train[current_class_rows]           #spliting x_train for only current_class
        y_train_current=y_train[current_class_rows]            #spliting y_train for only current_class
        result[current_class]["count"]=len(x_train_current)   # it will hold count of current_class (it will be used at time of calculating probability)
        features_total=xx_train.shape[1]                      #feature size is nothin but the columns of xx_train
        a=0
        for j in range(len(features)):
            result[current_class][j]=(x_train_current[:,j].sum())         #it will hold frequency of feature j
            a+=result[current_class][j]
        result[current_class]["Grand_total"]=a                            #it will hold count of entire words in current_class
        
    return result

In [6]:
news=datasets.fetch_20newsgroups()                              # loading data from datasets   to a news name dataframe
x=news.data
y=news.target

x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,test_size=0.1,random_state=3)         #doing spliting for train and test data

len_data=len(x_train)
dictionary=dict()                           #in this dictionary we will store frequency of each word from entire dataset by removing stop_words
for j in range(len_data):
    data=x_train[j]
    word_tokens = word_tokenize(data)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    for word in filtered_sentence:
        if word in dictionary:
            dictionary[word]+=1
        else:
            dictionary[word]=1
new_dict={}                                               #this is the reverse sorted form of dictionary used above
for key,value in sorted(dictionary.items(),key=lambda item: item[1],reverse=True):
    new_dict[key]=value
a=dict(itertools.islice(new_dict.items(),3000))           # slicing over bigger ditionary to get max  frequency 3000 data only
features=[]                                               # features is the list of keys of dictionary (a) 
for i in a.keys():
    features.append(i)
    
xx_train=np.zeros((len(x_train),len(features)))           #modifing x_train to xx_train which is 2d and have frequency of each word of features 
for i in range(len(x_train)):
    data=x_train[i]
    
    word_tokens = word_tokenize(data)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    for j in filtered_sentence:
        if j in features:
            xx_train[i][features.index(j)]+=1
            
xx_test=np.zeros((len(x_test),len(features)))             #xx_test is modified from x_test which is 2d now and have frequency format of each feature
for i in range(len(x_test)):
    data2=x_test[i]
    word_tok=word_tokenize(data2)
    
    fil_sentence = [w for w in word_tok if not w in stop_words]
    
    for j in fil_sentence:
        if j in features:
            xx_test[i][features.index(j)]+=1


            
            
            
            
final_dict=fit(xx_train,y_train,features)        #calling fit function over data xx_train and y_train  
y_pred=predict(final_dict,xx_test)               # predict will return the output list of classes  (output)
    

from sklearn.metrics import classification_report,confusion_matrix      #importing these to check correctness of y_pred(output)
print(classification_report(y_test,y_pred))                             #it will give precission and recoil data
print(confusion_matrix(y_test,y_pred))                      # it will print confusion matrix to show how is the output result
       
print()
print()
print('---------COMPARISION---------------this classification is due to sklearn library------------')
from sklearn.naive_bayes import MultinomialNB          # now doing the same fit and predict by MultinomialNB library function
arg1=MultinomialNB()
arg1.fit(xx_train,y_train)
y_pred2=arg1.predict(xx_test)
print(arg1.score(xx_test,y_test))   # getting a score of 0.72

print(classification_report(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))        

    

# the average of precision using sketch is 0.77 and due to library it is 0.74 which are nearly same so our code is giving the correct output
    
    
    
    
    



              precision    recall  f1-score   support

           0       0.85      0.85      0.85        52
           1       0.52      0.76      0.62        68
           2       1.00      0.02      0.03        60
           3       0.47      0.67      0.55        58
           4       0.55      0.85      0.67        62
           5       1.00      0.43      0.60        56
           6       0.71      0.86      0.78        57
           7       0.73      0.84      0.78        51
           8       0.85      0.87      0.86        63
           9       0.81      0.88      0.85        68
          10       0.98      0.79      0.87        56
          11       0.97      0.93      0.95        68
          12       0.64      0.63      0.64        71
          13       0.90      0.73      0.80        59
          14       0.79      0.87      0.83        47
          15       0.86      0.88      0.87        75
          16       0.92      0.86      0.89        42
          17       0.95    