In [29]:
import numpy as np                                                  
from sklearn import model_selection                                 
import itertools                                                    
import  pandas as pd                                                
from sklearn.metrics import classification_report, confusion_matrix 
from nltk.tokenize import word_tokenize                            

# IMPLEMENTING TEXT CLASSIFFICATION USING MULTINOMIAL NAIVE                                     BAYES CLASSIFIER ON NEWSGROUPS DATASET

### Getting the stop words

In [30]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))  
print(stop_words)

{'if', 'into', 'both', 'are', "couldn't", 'm', 'can', 'doing', 'ma', 'any', 'further', 'whom', 'so', 'no', "you'd", 'but', "you've", 'will', 'd', 'through', 'of', 'as', 'because', 'here', "didn't", 'other', 'they', 'did', 'my', 'won', 'don', 'above', 'those', 'why', 'than', 'their', 'i', 'shouldn', 'at', 'themselves', 'couldn', 'over', 'himself', 'this', 'with', 'haven', 'there', 'being', "should've", 'her', 'ours', 'been', 'having', "mightn't", "aren't", 'hadn', 'you', 'she', 'weren', 'what', 'to', 'wouldn', 'when', 'down', 't', 'own', 'isn', "mustn't", 'our', 'and', 'nor', 'very', 'while', 'from', 'out', 'by', 'll', "you're", 'herself', "hadn't", "haven't", "shan't", 'just', "wouldn't", 'who', 'off', 'yourself', 'such', 'most', 'during', 'mustn', 'do', 'or', 'again', 'o', 'does', 'only', 'aren', 'doesn', 'we', 'me', 'in', 've', 'these', 'them', 'has', 'didn', 'few', 'had', 'needn', "weren't", "wasn't", 'after', 'yours', 'where', 'on', 'once', 'too', 'before', 'how', 'be', 'below', 's

### Getting the data and splitting into training and testing data

In [31]:
from sklearn import datasets
news=datasets.fetch_20newsgroups()                                                  
X_train,X_test,Y_train,Y_test=model_selection.train_test_split(news.data,news.target,test_size=0.1)
X_train

["Subject: Re: Americans and Evolution\nFrom: rfox@charlie.usd.edu (Rich Fox, Univ of South Dakota)\nReply-To: rfox@charlie.usd.edu\nOrganization: The University of South Dakota Computer Science Dept.\nNntp-Posting-Host: charlie\nLines: 26\n\nIn article <1pik3i$1l4@fido.asd.sgi.com>, livesey@solntze.wpd.sgi.com (Jon Livesey) writes:\n>In article <C4u51L.8Bv@darkside.osrhe.uoknor.edu>, bil@okcforum.osrhe.edu (Bill Conner) writes:\n>|>\n>|> \n>|> Why do you spend so much time posting here if your atheism is so\n>|> incidental, if the question of God is trivial? Fess up, it matters to\n>|> you a great deal.\n>\n>Ask yourself two questions.\n>\n>\t1.   How important is Mithras in your life today?\n>\n>\t2.   How important would Mithras become if there was a\n>\t     well funded group of fanatics trying to get the\n>\t     schools system to teach your children that Mithras\n>\t     was the one true God?\n>\n>jon.\n\nRight on, Jon!  Who cares who or whose, as long as it works for the individ

In [32]:
Y_train

array([ 0, 10, 11, ...,  2,  9,  4])

### Getting our feature set using X_train

In [33]:
def get_features(X_train,stop_words):
    d={}
    #filtering stop words
    for i in range(len(X_train)):                                   
        data=X_train[i]                                                 
        tokens=word_tokenize(data)                                        
        for ele in tokens:   
            if (ele not in stop_words) and (len(ele)>2) and (ele.isalpha()):
                d[ele]=d.get(ele,0)+1 
                        
    #sorting the dictionary in decresing order of count
    new_d={}                                                         
    for key,value in sorted(d.items(),key=lambda k:k[1],reverse=True):
        new_d[key]=value
    
    #taking the top 5000 words
    new_d=dict(itertools.islice(new_d.items(),5000))  
    
    #getting the feature set
    feature_set=[]
    for i in new_d:
        feature_set.append(i)
        
    return feature_set

In [34]:
features=get_features(X_train,stop_words)
print(features)



### Updating X_train according to our fetaure set

In [35]:
def updateRow(words,features,i):                                   
    l=np.array([0]*5000)                                                                                                 
    for ele in words:
        if ele in features:
            l[features.index(ele)]=words.count(ele)
    return l

In [38]:
def updateXTrain(X_train,features):
    X_train_updated=pd.DataFrame(np.zeros((len(X_train),5000)))     
    X_train_updated.columns=features                               
    for i in range(len(X_train)):                                   
        x=X_train[i]                                           
        words=word_tokenize(x)                                 
        X_train_updated.iloc[i,:]=updateRow(words,features,i)      
    return X_train_updated                                          

In [39]:
new_X_train=updateXTrain(X_train,features)                      
new_X_train.columns

Index(['The', 'From', 'Subject', 'Lines', 'Organization', 'would', 'writes',
       'one', 'article', 'people',
       ...
       'dust', 'interview', 'trained', 'trend', 'boss', 'subsequent',
       'frequent', 'largest', 'Live', 'powered'],
      dtype='object', length=5000)

## OWN IMPLEMENTATION OF NAIVE BAYES

### Function to fit training data into our model

In [40]:
def fit(X_train, Y_train):                                          
    result = {}                                                     
    class_values = set(Y_train) 
    for current_class in class_values:                              
        result[current_class] = {}                                 
        result["total"] = len(Y_train)                              # Total elements present in the dictionary
        current_class_rows = (Y_train == current_class)             
        X_train_current = X_train[current_class_rows]               # Filtering the x_train for current class
        Y_train_current = Y_train[current_class_rows]               # Filtering the y_train for current class
        num_features = X_train.shape[1]                             
        result[current_class]["count"] = len(Y_train_current)       # Total number of features in our data
        a=0                                                          
        for j in range(num_features):                               
            result[current_class][j]=X_train_current[:,j].sum()     # Total count of current feature
            a+=result[current_class][j]                             
        result[current_class]['total']=a                            
    return result                                                   


### Function to get probability for x belonging to the current class

In [41]:
def probability(dictionary, x, current_class): 
    # Using log pobability
    output = np.log(dictionary[current_class]["total"]) - np.log(dictionary["total"])
    num_features = len(dictionary[current_class].keys()) - 2
    for j in range( num_features ):
        if x[j]==0:                                                  # Handling the zero probability
            continue
        count_current_class_with_value_j = dictionary[current_class][j]+1
        count_current_class = dictionary[current_class]['total']+num_features
        current_j_probablity = np.log(count_current_class_with_value_j) - np.log(count_current_class)
        output = output + current_j_probablity
    return output

### Function to predict class for a single row

In [42]:
def doSinglePrediction(x,dictionary):                                  
    classes = dictionary.keys()                                        
    best_p = -100                                                      
    best_class = -100 
    first_run = True                                                   
    for current_class in classes:                                      
        if (current_class == "total"):                                 # Ignoring the 'total' key
            continue
        p_current_class = probability(dictionary, x, current_class)    # Get probablity for x belonging to current class
        if (first_run or p_current_class > best_p):                    
            best_p = p_current_class                                    
            best_class = current_class
        first_run = False                                              
    return best_class                                                

### Function  to predict Y for test data

In [43]:
def predict(x_test,d,features):                                       # Function to predict the output values for test data
    y_pred=np.zeros(len(x_test))                                        # We will use this numpy array to store predictions
    x_test_updated=updateXTrain(x_test,features)                        # Update x_test so that our model can operate on it
    x_test_updated=np.array(x_test_updated)                             # Change pandas dataframe to numpy array
    for i in range(len(x_test)):                                                                                                        # Extract one-one row from data
        curr_x=x_test_updated[i,:]                                      # Extract one-one row 
        y_pred[i]=doSinglePrediction(curr_x,d)                          # Predict class for current row, store the prediction
    return y_pred                                                       # Return predictions


### Calling fit function to get our dictionary

In [45]:
updated_X_train=np.array(new_X_train)                           
dictionary=fit(updated_X_train,Y_train)                                      

### Calling predict function to get y_pred

In [47]:
y_pred=predict(X_test,dictionary,features)     
y_pred

array([ 3., 16., 13., ...,  0., 17.,  6.])

### Classification report for the data

In [48]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88        46
           1       0.59      0.77      0.67        57
           2       0.86      0.64      0.74        67
           3       0.61      0.60      0.61        60
           4       0.67      0.79      0.72        58
           5       0.78      0.61      0.68        51
           6       0.68      0.90      0.77        58
           7       0.82      0.82      0.82        61
           8       0.82      0.92      0.87        53
           9       0.91      0.93      0.92        68
          10       1.00      0.88      0.94        59
          11       0.93      0.88      0.90        59
          12       0.74      0.73      0.74        67
          13       0.91      0.94      0.93        54
          14       0.94      0.94      0.94        54
          15       0.89      0.87      0.88        71
          16       0.90      0.90      0.90        42
          17       1.00    

### Confusion matrix of our implimentation

In [49]:
print(confusion_matrix(Y_test,y_pred))

[[41  0  0  1  0  0  0  0  0  0  0  0  0  0  0  3  1  0  0  0]
 [ 0 44  1  4  2  3  1  1  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  7 43  6  2  4  1  0  1  0  0  1  2  0  0  0  0  0  0  0]
 [ 0  0  2 36 11  1  2  3  0  0  0  0  5  0  0  0  0  0  0  0]
 [ 0  1  0  3 46  1  3  2  0  0  0  0  2  0  0  0  0  0  0  0]
 [ 0  9  2  2  1 31  4  0  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  2  0 52  2  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2 50  2  0  0  1  4  1  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  1 49  0  0  0  2  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  1 63  0  0  0  0  0  1  0  0  0  0]
 [ 0  1  0  1  0  0  1  1  0  3 52  0  0  0  0  0  0  0  0  0]
 [ 0  3  1  0  0  0  0  0  0  0  0 52  0  0  0  1  0  0  1  1]
 [ 0  5  1  4  4  0  1  0  1  1  0  1 49  0  0  0  0  0  0  0]
 [ 0  2  0  0  0  0  0  0  0  0  0  0  0 51  1  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  1  0  0  0  1 51  0  0  0  0  0]
 [ 0  2  0  0  1  0  1  0  1  0  0  0  1  2  0 62  1  0

### Getting the accuracy score

In [51]:
from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test,y_pred))

0.8215547703180212


## IMPLEMENTATION USING SKLEARN

### Creating a multinomial naive bayes classifier object

In [52]:
from sklearn.naive_bayes import MultinomialNB               
clf = MultinomialNB()                       

### Fitting the training data

In [53]:
clf.fit(updated_X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Updating X_test

In [55]:
updated_X_test=updateXTrain(X_test,features)                
updated_X_test=np.array(updated_X_test)
updated_X_test

array([[3., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

### Calcuating y_pred

In [56]:
y_pred = clf.predict(updated_X_test) 
y_pred

array([ 3, 16, 13, ...,  0, 17,  6])

### Classification report 

In [57]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87        46
           1       0.64      0.77      0.70        57
           2       0.84      0.72      0.77        67
           3       0.64      0.58      0.61        60
           4       0.67      0.83      0.74        58
           5       0.76      0.61      0.67        51
           6       0.63      0.86      0.73        58
           7       0.83      0.85      0.84        61
           8       0.82      0.92      0.87        53
           9       0.90      0.96      0.93        68
          10       1.00      0.92      0.96        59
          11       0.98      0.88      0.93        59
          12       0.78      0.73      0.75        67
          13       0.91      0.94      0.93        54
          14       0.95      0.98      0.96        54
          15       0.91      0.86      0.88        71
          16       0.95      0.88      0.91        42
          17       1.00    

### Confusion Matrix

In [58]:
print(confusion_matrix(Y_test,y_pred))

[[40  0  0  1  0  0  0  0  0  0  0  0  0  0  0  3  0  0  1  1]
 [ 0 44  2  4  1  2  1  1  0  0  0  0  2  0  0  0  0  0  0  0]
 [ 0  5 48  3  2  6  2  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  1  3 35 11  0  2  3  0  0  0  0  5  0  0  0  0  0  0  0]
 [ 0  1  0  2 48  1  3  2  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  8  2  3  1 31  4  0  2  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  0  3  3  0 50  1  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2 52  2  0  0  1  3  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  1 49  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  0 65  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  1  1  1  2 54  0  0  0  0  0  0  0  0  0]
 [ 0  3  1  0  0  0  0  0  0  0  0 52  0  1  0  0  0  0  2  0]
 [ 0  3  1  4  5  0  2  0  1  1  0  0 49  0  1  0  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  0  0  1 51  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0 53  0  0  0  0  0]
 [ 0  2  0  0  1  0  1  0  1  1  0  0  0  2  0 61  0  0

### Getting the accuracy score

In [59]:
print(accuracy_score(Y_test,y_pred))

0.8268551236749117


# CONCLUSION : 

## on comparing the obove results we can observe that our own implementation of multinomial naive bayes classifier works as good as the inbuilt sklearn function