In [2]:
import os
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import numpy as np
import operator
import pandas as pd
from string import punctuation
from tqdm import tqdm
from sklearn.metrics import classification_report,confusion_matrix
#Downloads stopwords which can be avoided during data pre-processing stage
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
additional_stop_words = ["why's", '5th', 'subject:', 'from:', 'newsgroups:', 'lines:', 'believe', 'references:', 
                         'writes:', '3rd', "she'd", '8th', 'hundred', 'world', "he'll", 'university', '9th', '1st', 
                         'ours6th', 'ten', "when's", 'stillwould', 'message-id:', '4th', 'thousand', 
                         'nntp-posting-host:', 'article', 'organization:', 'cantaloupe.srv.cs.cmu.edu', '7th', 
                         "10thi've", 'date:', "he'd", '2nd', 'sender:', 'distribution:', "how's", 'path:', "she'll", 
                         'xref:']

In [4]:
def load_data(folder_path):
    folders = os.listdir(folder_path)
    data_dict = {}
    news_groups_dict = {}
    for folder in folders:
        if not folder.startswith('.'):
            data_dict[folder] = []
            sub_folder_path = os.path.join(folder_path, folder)
            news_groups_dict[folder] = sub_folder_path
            for file in os.listdir(sub_folder_path):
                with open(os.path.join(sub_folder_path, file), encoding='latin-1') as file_stream:
                    data_dict[folder].append(file_stream.read())
                    file_stream.close()
    return data_dict, news_groups_dict
    
    
def load_corpus():
    stopwords_list = set(stopwords.words('english'))
    stopwords_list.update(list(punctuation))
    stopwords_list.update(additional_stop_words)
    return stopwords_list
    
    
def get_word_frequency(newsdata, stopwords_list, word_frequency_dict):
    file_frequency_dict = {}
    for idx, doc in enumerate(newsdata):
        if idx not in file_frequency_dict:
            file_frequency_dict[idx] = {}
        for words in doc.split():
            words = process_words(words, stopwords_list)
            if words != []:
                word_frequency_dict[words] = word_frequency_dict.get(words, 1) + 1
                file_frequency_dict[idx][words] = file_frequency_dict.get(idx).get(words, 1) + 1
    return word_frequency_dict, file_frequency_dict
    
    
def process_words(data, stopwords_list):
    lowercase_data = data.lower()
    if lowercase_data not in stopwords_list:
        data = data
        return lowercase_data
    else:
        return []
    
    
def get_dataframe(news_groups_dict, data_dict, features, file_dict):
    newsgroups_dataframe = pd.DataFrame(columns = features)
    for folder, data in tqdm(data_dict.items()):
        if folder.startswith('.'):
            continue
        sub_folder_path = news_groups_dict[folder]
        for idx, file in enumerate(data_dict[folder]):
            newsgroups_dataframe.loc[len(newsgroups_dataframe)] = [file_dict[folder][idx].get(x, 1) 
                                                                   for x in features]
    return newsgroups_dataframe

In [6]:
data_path = 'C:/Users/samee/Downloads/bow-daraset/20_newsgroups'
data_dict, news_groups_dict = load_data(data_path)
stopwords_list = load_corpus()
wf_dict = {}
file_dict = {}

for group, data in data_dict.items():
    wf_dict, file_dict_temp = get_word_frequency(data, stopwords_list, wf_dict)
    file_dict[group] = file_dict_temp

wf_list = (sorted(wf_dict.items(), key=operator.itemgetter(1), reverse=True))
features_list = [word[0] for word in wf_list][:1000]
class_list = list(file_dict.keys())

In [7]:
wf_data_df=get_dataframe(news_groups_dict, data_dict, features_list, file_dict)

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [13:13<00:00, 39.69s/it]


In [8]:
class NaiveBayesModel:
    
    def __init__(self, data_df, class_list, features, file_dict):
        self._model = {}
        self.data_df = data_df
        self.class_list = class_list
        self.features = features
        self.file_dict = file_dict
        self.n_classes = len(self.class_list)
        self.n_features = len(self.features)
        self.class_list = class_list
        self.flag_fit = False
        
    def fit(self, test_size=0.3):
        self.X = self.data_df.values
        list_labels = []
        for group, data in self.file_dict.items():
            list_labels.extend([group]*len(data))
        self.y = np.array(list_labels)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, 
                                                                                self.y, 
                                                                                test_size=
                                                                                test_size)
        self._model['Number_of_classes'] = len(self.y_train)
        for label in self.class_list:
            total_words = 1
            self._model[label] = {}
            current_rows = self.y_train == label
            X_train_current = self.X_train[current_rows]
            for feauture in range(len(self.features)):
                sumoffeatures = X_train_current[:, feauture].sum()
                self._model[label][self.features[feauture]] = sumoffeatures
                total_words += sumoffeatures
            self._model[label]["Number_of_classes"] = total_words
        print('Naive Bayes Model fitting done.')
        self.flag_fit = True
        return None
    
    def _calc_probability_(self, word, classgroup):
        if not self.flag_fit:
            self.fit()
        class_count = self._model[classgroup]["Number_of_classes"]
        log_class_count = np.log(class_count)
        total_count = self._model["Number_of_classes"]
        log_total_count = np.log(total_count)
        current_out = log_class_count - log_total_count
        for idx in range(len(self.features)):
            current_word_count = self._model[classgroup][self.features[idx]] + 1
            log_current_word_count = np.log(current_word_count)
            total_word_count=  self._model[classgroup]["Number_of_classes"] + len(self.features)
            log_total_word_count = np.log(total_word_count)
            curr_probability = log_current_word_count-log_total_word_count
            for _ in range(int(word[idx])):
                current_out = curr_probability + current_out
        return current_out
    
    def __predict__(self, word):
        prev_probability = -30000000
        prev_class = -300000
        for current in self.file_dict.keys():
            if current == 'Number_of_classes':
                continue
            curr_probability = self._calc_probability_(word, current)
            if(curr_probability > prev_probability):
                prev_class = current
                prev_probability = curr_probability
            is_first_exec = False
        return prev_class
    
    def predict_batch(self, word_list=None):
        if not self.flag_fit:
            self.fit()
        if word_list is None:
            word_list = self.X_test
        class_predicted = []
        for curr_word in tqdm(word_list):
            class_predicted.append(self.__predict__(curr_word))
        return class_predicted

In [9]:
model = NaiveBayesModel(wf_data_df, class_list, features_list, file_dict)

In [10]:
model.fit(test_size=0.4)

Naive Bayes Model fitting done.


In [11]:
class_predicted=model.predict_batch()

100%|██████████████████████████████████████████████████████████████████████████████| 7999/7999 [17:37<00:00,  7.56it/s]


In [12]:
print(confusion_matrix(class_predicted,model.y_test))
print(classification_report(class_predicted,model.y_test))

[[150   7   3   1   5   8   1   3   6   8   5   1   5  10   4   3   0  12
    6  49]
 [  0 167  34   7   0  54   0   1   0   0   0   3   6   4   8   0   0   0
    1   0]
 [  0   0  40   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   5  33 239  32   5  18   2   0   0   1   2   3   0   1   0   0   1
    1   0]
 [  2  15  55  15 327  20   7   2   0   0   0   5   2   0   2   0   0   0
    3   3]
 [  0   4   2   3   0  78   0   0   0   0   0   1   1   0   0   0   0   0
    0   0]
 [ 24  77  93  61  17  82 336  24  24  45  72  21  20  38  24  39  35  36
   45  34]
 [  8   5   4   6   1   1   8 294   5   2   5   2   5   5   8   0  10   8
    9   7]
 [ 57  12  14   5   2   9   3  12 330   8  58  32   3   9  15   5  39  52
   48  35]
 [  1   0   1   2   0   0   2   0   0 297 154   0   0   0   2   0   0   0
    2   1]
 [  0   4   1   0   1   1   6   0   0  13  43   1   4   1   2   0   0   0
    0   0]
 [  3   1   7   1   0   8   1   1   1   1   0 227   2   0   1   1