In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import re
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix

In [3]:
dataset=datasets.fetch_20newsgroups()
x=dataset.data
y=dataset.target
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=123)

In [4]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [5]:
import numpy as np
class TextClassification:
    
    def __init__(self):
        self._vocab = {} # For Vocab
        self._mainDict = None # dictionary containing Counts
        
    def fit(self,x_train,y_train):
        self._buildVocab(x_train,y_train) 
        self._fitTrainingData(x_train,y_train)
        
    def _fitTrainingData(self,x_train,y_train):
        for text,cls in zip(x_train,y_train):
            text_arr = text.split()
            text_arr = self._cleanTextArr(text_arr)
            for txt in text_arr:
                if txt in self._topfeat:
                    self._mainDict[cls][txt] = self._mainDict[cls].get(txt,0) + 1            
    
    
    def _buildVocab(self,x_train,y_train):
        
        for text in x_train:
            text_arr = text.split()
            text_arr = self._cleanTextArr(text_arr) # returns array of cleaned data
            for txt in text_arr:
                self._vocab[txt] = self._vocab.get(txt,0) + 1 # updating counts of cleaned data
        self._topFeatures(y_train)
            
    def _cleanTextArr(self,text_arr:list):
        cleaned_arr = []
        for wrd in text_arr:
            wrd = self._formatWord(wrd) # format word for eg: removing special character and number and email id
            if wrd is not None:
                cleaned_arr.append(wrd)
        return cleaned_arr
    
    def _formatWord(self,wrd):
        wrd =wrd.lower()
        special_chr=[".",":",">","<",",", "+", "*", "?", "^", "$", "(", ")", "[", "]", "{", '}', "|","/","-","#",'"',"'","_"]
        for i in special_chr:
            if i in wrd:
                wrd=wrd.replace(i,"")
        pattern="[0-9]+|@|.com"
        check = re.search(pattern,wrd)
        if wrd not in ENGLISH_STOP_WORDS and check is None and len(wrd) > 2 and len(wrd) < 15:
            return wrd
    
    def _topFeatures(self,y_train,percent=0.36): # return given percent of top features from vocab
        features = self._vocab.items()
        features = sorted(features,key=lambda x:x[1],reverse=True)
        n=len(features)
        index = int(percent*n)
        Topfeatures = features[:index]
        self._topfeat = [i for i,j in Topfeatures]
        self._createDict(y_train)
    
    def _createDict(self,y_train): # creates dictionary with counts of words
        self._classes = np.unique(y_train)
        self._mainDict = dict.fromkeys(self._classes,{})
        self._mainDict["Total_Classes"] = len(self._classes)
        for cls in self._classes:
            count = len(y_train[y_train == cls])
            self._mainDict[cls] =dict.fromkeys(self._topfeat,0)
            self._mainDict[cls]["Total_Count"] =  count
    
    def predict(self,x_test): # returns prediction
        y_pred = []
        for x in x_test:
            prediction = self._getSinglePrediction(x)
            y_pred.append(prediction)
        return np.array(y_pred)
    
    def _getSinglePrediction(self,text): # return the class with highest probability 
        text_arr = text.split()
        text_arr = self._cleanTextArr(text_arr)
        best_prob = float('-inf')
        best_class = None
        for cls in self._classes:
            prob = self._calculateProb(cls,text_arr)
            if best_prob < prob:
                best_prob = prob
                best_class = cls
        return best_class
    
    def _calculateProb(self,cls,text_arr:list): # calculates probability with laplace correction 
        probability = np.log(self._mainDict[cls]['Total_Count']) - np.log(self._mainDict["Total_Classes"])
        for wrd in text_arr:
            if wrd in self._topfeat:
                num = self._mainDict[cls][wrd] + 1
                den = self._mainDict[cls]["Total_Count"] + len(self._mainDict[cls].keys())
                
                prob = np.log(num) - np.log(den)
                
                probability += prob
        return probability

In [6]:
clf = TextClassification()
clf.fit(x_train,y_train)

In [7]:
y_pred=clf.predict(x_test)

In [8]:
score = np.sum(y_test == y_pred) / len(y_test)
print(f"Accuracy of Model is {score*100:.2f}%")

Accuracy of Model is 81.27%


In [9]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       101
           1       0.68      0.83      0.75       136
           2       0.88      0.52      0.65       150
           3       0.54      0.80      0.64       124
           4       0.98      0.60      0.75       154
           5       0.71      0.87      0.78       158
           6       0.90      0.59      0.72       140
           7       0.96      0.79      0.87       155
           8       0.95      0.87      0.91       139
           9       0.99      0.82      0.90       157
          10       0.86      0.99      0.92       136
          11       0.88      0.94      0.91       176
          12       0.94      0.67      0.79       153
          13       0.82      0.95      0.88       134
          14       0.83      0.96      0.89       157
          15       0.85      0.92      0.89       171
          16       0.84      0.92      0.88       140
          17       0.61    