In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training CSV and keep only its first two columns as a 2-D array.
# The rest of the notebook compares column 0 against the category names and
# tokenises column 1 as the text.
dataset = pd.read_csv('traindata.csv')
sentences = dataset.iloc[:, :2].values


In [3]:
# Split every training text on whitespace and delete the ASCII punctuation
# characters from each resulting token.
import string

# Translation table that maps every character of string.punctuation to None
# (i.e. removes it); reused later for the test texts.
table = str.maketrans('', '', string.punctuation)

# One list of tokens per training row. A token that consisted only of
# punctuation becomes an empty string and is kept in the list.
bag_of_words = [
    [token.translate(table) for token in text.split()]
    for text in sentences[:, 1]
]
print('All the words are now free from punctuation marks!!!')

All the words are now free from punctuation marks!!!


In [4]:
# Convert every token to lower case so that later comparisons and counts
# do not depend on capitalisation.
bag_of_words = [list(map(str.lower, example)) for example in bag_of_words]
print("All the words have been converted to lower case!!!")

All the words have been converted to lower case!!!


In [5]:
# Remove English stop words from every tokenised training example.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Kept as the list NLTK returns, because the test-set cell reuses this name.
stop_words = stopwords.words('english')

# Every token is tested for membership, so look words up in a frozenset
# (constant-time) instead of scanning the list once per token. The filtered
# result is identical to filtering against the list.
_stop_word_lookup = frozenset(stop_words)

# NOTE(review): punctuation has already been stripped from the tokens at this
# point, so any stop word that itself contains an apostrophe can no longer
# match -- confirm whether that is intended.
bag_of_words = [
    [word for word in example if word not in _stop_word_lookup]
    for example in bag_of_words
]
print("All the stop-words have been removed!!!")

All the stop-words have been removed!!!


[nltk_data] Downloading package stopwords to C:\Users\TEJAS
[nltk_data]     POTE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Replace every remaining token with its Porter stem. The stemmer instance
# `ps` is reused later for the test texts.
# Note: this cell rewrites bag_of_words from itself, so running it a second
# time applies the stemmer to its own output.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
bag_of_words = [list(map(ps.stem, example)) for example in bag_of_words]

In [7]:
# Build, for each news category, a dictionary mapping a word to the number of
# times it occurs in that category's training examples, and count how many
# training examples each category has.
covid = {}
science = {}
business = {}
sports = {}

# Label value (column 0 of `sentences`) -> that category's word-count
# dictionary. A single lookup table replaces four copies of the same loop.
_word_counts_by_label = {
    'covid': covid,
    'science': science,
    'business': business,
    'sports': sports,
}

# Label value -> number of training examples carrying that label.
_examples_by_label = dict.fromkeys(_word_counts_by_label, 0)

# bag_of_words[i] holds the preprocessed tokens of training row i, so the
# labels and the token lists can be walked in parallel.
for label, words in zip(sentences[:, 0], bag_of_words):
    counts = _word_counts_by_label.get(label)
    if counts is None:
        # Rows whose label is not one of the four categories contribute
        # nothing, exactly as before.
        continue
    _examples_by_label[label] += 1
    for word in words:
        counts[word] = counts.get(word, 0) + 1

# Expose the per-category example counts under the names later cells use.
n_covid = _examples_by_label['covid']
n_science = _examples_by_label['science']
n_business = _examples_by_label['business']
n_sports = _examples_by_label['sports']
print('The dictionary of words for each class was created')

The dictionary of words for each class was created


In [8]:
# Prior probability of each category: its number of training examples
# divided by the total number of training rows (every row of `sentences`,
# whatever its label).
_n_training_rows = sentences.shape[0]
pp_covid, pp_science, pp_sports, pp_business = (
    class_count / _n_training_rows
    for class_count in (n_covid, n_science, n_sports, n_business)
)

print('The prior probabilities were evaluated')

The prior probabilities were evaluated


In [9]:
# Classify every test text with the Naive Bayes model built above and report
# the accuracy against the labels in column 0 of the test file.
import math

test_data = pd.read_csv('testdata.csv')
sample_sent = test_data.iloc[:, 0:2].values

class_cond_prob = []   # per test row: [P(words|covid), P(words|sports), P(words|business), P(words|science)]
post_dist_list = []    # per test row: the same four values multiplied by the class priors
predicted_label = []   # per test row: name of the winning class
labels = {0:'covid', 1:'sports', 2:'business', 3:'science'}

# Same preprocessing as the training text, in the same order: split on
# whitespace, strip punctuation, lower-case, drop stop words, stem.
_test_tokens = []
for sentence in sample_sent[:, 1]:
    cleaned = [word.translate(table).lower() for word in sentence.split()]
    _test_tokens.append([ps.stem(word) for word in cleaned if word not in stop_words])

# Store each token list into the array one element at a time. Assigning the
# whole list of lists to the column only works while the lists are ragged;
# when every row has the same number of tokens (e.g. a one-row test set)
# NumPy builds a 2-D array from it and the assignment fails.
for i, tokens in enumerate(_test_tokens):
    sample_sent[i, 1] = tokens

# Word-count dictionaries and priors, in the index order used by `labels`.
_class_counts = [covid, sports, business, science]
_class_priors = [pp_covid, pp_sports, pp_business, pp_science]
# Total word occurrences per class. These do not change between test rows,
# so compute them once instead of once per word.
_class_totals = [sum(counts.values()) for counts in _class_counts]
# Fixed probability used for a word that never occurred in a class.
_UNSEEN_WORD_PROB = 0.001

for tokens in _test_tokens:
    likelihoods = []       # plain products, kept for the published lists
    log_likelihoods = []   # sums of logs, used for the actual decision
    for counts, total in zip(_class_counts, _class_totals):
        likelihood = 1
        log_likelihood = 0.0
        for word in tokens:
            if word in counts:
                word_prob = counts[word] / total
            else:
                word_prob = _UNSEEN_WORD_PROB
            likelihood *= word_prob
            log_likelihood += math.log(word_prob)
        likelihoods.append(likelihood)
        log_likelihoods.append(log_likelihood)

    class_cond_prob.append(likelihoods)
    post_dist = [
        likelihood * prior
        for likelihood, prior in zip(likelihoods, _class_priors)
    ]
    post_dist_list.append(post_dist)

    # Decide in log space. The plain product of many probabilities below 1
    # underflows to 0.0 for long texts; every class then ties at 0 and the
    # first label would always win. A class with a zero prior can never win.
    log_posteriors = [
        log_likelihood + (math.log(prior) if prior > 0 else float('-inf'))
        for log_likelihood, prior in zip(log_likelihoods, _class_priors)
    ]
    # max() returns the first maximal index, matching the tie-breaking of
    # post_dist.index(max(post_dist)).
    index = max(range(len(log_posteriors)), key=log_posteriors.__getitem__)
    predicted_label.append(labels[index])

# Number of test rows whose predicted label equals the label in column 0.
correct_pred = sum(
    1
    for predicted, actual in zip(predicted_label, sample_sent[:, 0])
    if predicted == actual
)
test_data['Predicted Label'] = predicted_label
if predicted_label:
    print('Accuracy of Prediction : {}%'.format((correct_pred/len(predicted_label))*100))
else:
    # An empty test file would otherwise raise ZeroDivisionError here.
    print('Accuracy of Prediction : n/a (the test set is empty)')

Accuracy of Prediction : 100.0%
