In [1]:
from collections import defaultdict
import math
import json
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [2]:
porter = PorterStemmer()
stop_words = stopwords.words("english")

# https://www.kaggle.com/rmisra/news-category-dataset
file = "/home/user2/Machine-learning-without-any-libraries/NB/News_Category_Dataset_v2.json"


def clean_data(sentence):
    """clean text string by removing stop words, stteming,punctuations
    args: text string
    return: cleaned text list"""
    words = [porter.stem(word) for word in word_tokenize(sentence) if word.lower() not in stop_words and word.isalpha()]
    return words
    
def get_data(file,train_test_thresh=0.9):
    """Read data in a json file and get the data for each category with cleaned text in dictionary with train and
    test set
    key: tag, value:list of lists with cleaned words
    input: json file, train_test_thresh
    output: dictionary (key: tag, value:list of lists with cleaned words)
    ex: {tag1:[[word1,word2...],[word1,word2...]]}
    """
    data = [json.loads(x) for x in open(file,'r')]
    #get only relevant data
    keys_  = ['category','headline','short_description']
    data = [{k:val for k,val in record.items() if k in keys_} for record in data]
    # now get data for each category as a list of list
    category = list(set([record['category'] for record in data]))
    data_clensed = {tag:[] for tag in category}
    for record in data:
        data_clensed[record['category']].append(clean_data(record['headline']+" "+record['short_description']))
        
    train_data= {}
    test_data = {}
    for k,v in data_clensed.items():
        train_data[k] = v[: int(len(v)*train_test_thresh)]
        test_data[k] = v[int(len(v)*train_test_thresh):]
    return train_data,test_data


In [3]:
train_data,test_data = get_data(file,train_test_thresh=0.9)

In [4]:
class NBMultinormial:
    def __init__(self,train_data):
        self.laplace_smoothing_factor = 1
        self.prior_per_tag={}
        self.likelihood_per_word_per_tag = {}
        self.tags = train_data.keys()
        self.train_data = train_data
        self.train()
        
    def train(self):
        # get the priors per tag
        tag_count_map = {tag:len(self.train_data[tag]) for tag in self.tags}
        self.priors_per_tag = {tag:tag_count_map[tag]/sum(tag_count_map.values()) for tag in self.tags}
        self.likelihood_per_word_per_tag = self.get_word_likelihood()
        
    def get_word_likelihood(self):
        """calculate the probability of each word to tag
           word count of a word in a tag / total_words in that tag with laplace smoothing
           returns a dictionary
           ex: {word:{tag1:proba,tag2:proba,...}}"""
        # we need word count per tag and total word for frequencies as well
        word_frequencies_per_tag = defaultdict(lambda:{tag:0 for tag in self.tags})
        word_likelihood_per_tag = defaultdict(lambda:{tag:0.5 for tag in self.tags})
        total_word_count_per_tag = defaultdict(int)
        for tag in self.tags:
            for article in self.train_data[tag]:
                for word in article:
                    word_frequencies_per_tag[word][tag] +=1
                    total_word_count_per_tag[tag] +=1
        
        for word,tag_map in word_frequencies_per_tag.items():
            for tag in tag_map.keys():
                word_likelihood_per_tag[word][tag] =(word_frequencies_per_tag[word][tag]+1*self.laplace_smoothing_factor)/(total_word_count_per_tag[tag]+2*self.laplace_smoothing_factor)
        return word_likelihood_per_tag
                                                                                                                           
    def predict(self,article):
        """process the each word in sentence and get the posterior probability for each tag for that sentence
           get the sum of log value  to get rid of getting lower values
           posterior = prior*likelihood ( we ignore the normaliation constant)
        """
        posteriors_per_tag = {tag:math.log(prior) for tag,prior in self.priors_per_tag.items()}
        for word in article:
            for tag in self.tags:
                posteriors_per_tag[tag] +=math.log(self.likelihood_per_word_per_tag[word][tag])
        return posteriors_per_tag
                                                                                                                                                                                                                                                                                                                                                                       
                                                                                                           

In [5]:
model = NBMultinormial(train_data)

In [7]:
corr =0
tot =0
for key,articles in test_data.items():
    for article in articles:
        predictions = model.predict(article)
        pred_val=max(predictions,key=predictions.get)
        if pred_val==key:
            corr +=1
        tot +=1
print(round(corr*100/tot))
            

41
