### Import necessary libraries

In [None]:
import json
from collections import Counter, defaultdict
import itertools
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import math

In [11]:
# Spcify the stemmer and stopwords
porter = PorterStemmer()
stop_words = stopwords.words("english")

# https://www.kaggle.com/rmisra/news-category-dataset
file = "/home/user2/Machine-learning-without-any-libraries/NB/News_Category_Dataset_v2.json"



def clean_data(sentence):
    """clean text data by removing stop words, steming, and removing punctuations
    args: text string
    return : cleaned text dictionary counter"""
    words = [
        porter.stem(word)
        for word in word_tokenize(sentence)
        if word.lower() not in stop_words and word.isalpha()
    ]
    
    return Counter(words)


def get_data(file,train_test_thresh=.9):
    """Read file and get the data for each category with cleaned text in a dictionary format.
        
        args: input file name to read, train_test_split(float<1)
        output: A list of dictioanry key : categry. Value: list of count dictionaries 
        (key : cleaned word, values: count of that word)"""
    
    # Open file and read line by line and append to a list of dictionaries for each record
    data = [json.loads(l) for l in open(file, "r")]
    # We need only news category, headline and short_descriptin for the analysis
    # We'll merge the heading and short_description as text for the category 
    keys_ = ["category", "headline", "short_description"] 
    # filter the data as per the specified keys_
    data = [{k: val for k, val in record.items() if k in keys_} for record in data]
    # get the unique tags
    categories = list(set([record["category"] for record in data]))
    # get data for each category as list
    data_clenced = {tag: [] for tag in categories}
    # update the data_clenced dict
    [
        data_clenced[l["category"]].append(
            clean_data(l["headline"] + " " + l["short_description"])
        )
        for l in data
    ]
    train_data={}
    test_data={}
    for k, v in data_clenced.items():
        train_data[k]=v[: int(len(v)*train_test_thresh)]
        test_data[k]=v[int(len(v)*train_test_thresh):]
    return train_data,test_data



In [32]:

def IDF(corpus, unique_words):
    """ get the idf from the corpus and unique words
    args: corpus(list of list) and unique words
    output: dictionary for each word with idf value"""
    idf_dict={}
    N=len(corpus)
    for i in unique_words:
        count=0
        for sen_list in corpus:
            if i in sen_list:
                count=count+1
            idf_dict[i]=(math.log((1+N)/(count+1)))+1
#     print(idf_dict)
    return idf_dict 

def get_vocab_and_idf(whole_data):
    """get vocabulary and idf of vocabulary
    args: corpus(list of list)
    output: vocabulary(dict key:word, v:index of word), idf_value for
    each word
    """
    unique_words = set()
    if isinstance(whole_data, (list,)):
#         for x in whole_data:
#             for y in x:
#             if len(x)<2:
#                 continue
#             unique_words.add(x)
        unique_words=set(list(itertools.chain(*whole_data)))
        unique_words = sorted(list(unique_words))
        vocab = {j:i for i,j in enumerate(unique_words)}
        Idf_values_of_all_unique_words=IDF(whole_data,unique_words)
#         print(Idf_values_of_all_unique_words)
    return vocab, Idf_values_of_all_unique_words


def get_tf():
    """get the tf
    args : none
    output: nested dictionary, key: word(in vocab), values: dictionary(k:category,v:tf_value)"""
    tf_dict=defaultdict(lambda:{tag:0 for tag in train_data.keys()})
    for w in idf_of_vocabulary.keys():
        for tag,v in train_data.items():
            num=0
            freq=0
            for i in v:
                num += len(i)
                freq += i[w]
                
            p = (freq+1)/(num+2)
            tf_dict[w][tag] = p
    return tf_dict



In [13]:
train_data,test_data=get_data(file)

# Get corpus(list of list) all the words in the text in train test
# corpus = list(itertools.chain(*list(itertools.chain(*train_data.values()))))
corpus = list(itertools.chain(*train_data.values()))
# corpus =[]
# [l.extend(list(i.keys())) for i in list(itertools.chain(*test_data.values()))]
Vocabulary, idf_of_vocabulary=get_vocab_and_idf(corpus)

In [33]:
tf_dict=get_tf()


In [34]:
corr = 0
tot =0
for k_ in test_data.keys():
    pro_pred={k:1 for k,_ in test_data.items()}
    for i in test_data[k_]:
        sample_data = i
        for k in test_data.keys():
            for w in sample_data:
                idf = idf_of_vocabulary[w] if w in idf_of_vocabulary else 1/len(idf_of_vocabulary)
#                 tf = tf_dict[w][k] if w in tf_dict else 1/    
                pro_pred[k] += math.log(idf)*tf_dict[w][k]
                if max(pro_pred,key=pro_pred.get)==k_:
                    corr +=1
                tot +=1
#         print(max(pro_pred,key=pro_pred.get),k_)
print(round(corr*100/tot))

85


## Done!