### Question 1

In [1]:
import numpy as np 
import pandas as pd 
import math	
#import matplotlib.pyplot as plt 

In [2]:
def accuracy_score(y_true, y_pred):
	return round(float(sum(y_pred == y_true))/float(len(y_true)) * 100 ,2)
	
def pre_processing(df):
	X = df.drop([df.columns[-1]], axis = 1)
	y = df[df.columns[-1]]

	return X, y

In [None]:
class  NaiveBayes:

	def __init__(self):
		self.features = list
		self.likelihoods = {}
		self.class_priors = {}
		self.pred_priors = {}

		self.X_train = np.array
		self.y_train = np.array
		self.train_size = int
		self.num_feats = int

	def fit(self, X, y):

		self.features = list(X.columns)
		self.X_train = X
		self.y_train = y
		self.train_size = X.shape[0]
		self.num_feats = X.shape[1]

		for feature in self.features:
			self.likelihoods[feature] = {}
			self.pred_priors[feature] = {}

			for feat_val in np.unique(self.X_train[feature]):
				self.pred_priors[feature].update({feat_val: 0})

				for outcome in np.unique(self.y_train):
					self.likelihoods[feature].update({feat_val+'_'+outcome:0})
					self.class_priors.update({outcome: 0})

		self._calc_class_prior()
		self._calc_likelihoods()
		self._calc_predictor_prior()

		
	def _calc_class_prior(self):

		for outcome in np.unique(self.y_train):
			outcome_count = sum(self.y_train == outcome)
			self.class_priors[outcome] = outcome_count / self.train_size

	def _calc_likelihoods(self):

		for feature in self.features:

			for outcome in np.unique(self.y_train):
				outcome_count = sum(self.y_train == outcome)
				feat_likelihood = self.X_train[feature][self.y_train[self.y_train == outcome].index.values.tolist()].value_counts().to_dict()

				for feat_val, count in feat_likelihood.items():
					self.likelihoods[feature][feat_val + '_' + outcome] = count/outcome_count


	def _calc_predictor_prior(self):

			for feature in self.features:
				feat_vals = self.X_train[feature].value_counts().to_dict()

				for feat_val, count in feat_vals.items():
					self.pred_priors[feature][feat_val] = count/self.train_size

In [None]:
	def predict(self, X):

		""" Calculates Posterior probability P(c|x) """

		results = []
		X = np.array(X)

		for query in X:
			probs_outcome = {}
			for outcome in np.unique(self.y_train):
				prior = self.class_priors[outcome]
				likelihood = 1
				evidence = 1

				for feat, feat_val in zip(self.features, query):
					likelihood *= self.likelihoods[feat][feat_val + '_' + outcome]
					evidence *= self.pred_priors[feat][feat_val]

				posterior = (likelihood * prior) / (evidence)

				probs_outcome[outcome] = posterior

			result = max(probs_outcome, key = lambda x: probs_outcome[x])
			results.append(result)
			
		print(results,"\n")
		return np.array(results)

In [None]:
if __name__ == "__main__":

	#Weather Dataset
	print("\nWeather Dataset:")

	df = pd.read_csv(r'Book1.csv')
	print(df)

	#Split fearures and target
	X,y  = pre_processing(df)

	nb_clf = NaiveBayes()
	nb_clf.fit(X, y)

	print("Train Accuracy: {}".format(accuracy_score(y, nb_clf.predict(X))))
	
	#Query 1:
	query = np.array([['Serving', 'good', 'Food', 'absolutely', 'perfect', 'Restaurant']])
	print("Query 1:- {} ---> {}".format(query, nb_clf.predict(query)))

### Question 2

In [1]:
import numpy as np 
import pandas as pd 
from csv import reader

In [2]:
data = pd.read_csv(r'lab6.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   TDP       7 non-null      int64 
 1   Nifty     7 non-null      int64 
 2   Sidhu     7 non-null      int64 
 3   BJP       7 non-null      int64 
 4   Sensex    7 non-null      int64 
 5   Sixer     7 non-null      int64 
 6   Congress  7 non-null      int64 
 7   Century   7 non-null      int64 
 8   Category  7 non-null      object
dtypes: int64(8), object(1)
memory usage: 632.0+ bytes


In [4]:
data.columns

Index(['TDP', 'Nifty', 'Sidhu', 'BJP', 'Sensex', 'Sixer', 'Congress',
       'Century', 'Category'],
      dtype='object')

In [5]:
data.head(10)

Unnamed: 0,TDP,Nifty,Sidhu,BJP,Sensex,Sixer,Congress,Century,Category
0,4,0,3,5,1,0,6,0,Politics
1,0,5,0,2,6,0,1,0,Business
2,0,0,6,1,0,4,1,2,Sports
3,4,1,0,1,1,0,6,0,Politics
4,0,0,0,0,0,5,0,6,Sports
5,0,4,0,2,6,0,0,1,Business
6,5,0,0,3,0,0,5,0,Politics


In [6]:
query_data= [[4,0,2,0,1,0,6,0],[0,0,2,0,0,9,0,9],[5,0,2,5,0,9,0,9]]

In [7]:
outputlabels = data['Category'].unique()
words = list(data.columns)[:-1]
numtraindocuments = data.shape[0]

In [8]:
print(outputlabels)
print(words)
print(numtraindocuments)

['Politics' 'Business' 'Sports']
['TDP', 'Nifty', 'Sidhu', 'BJP', 'Sensex', 'Sixer', 'Congress', 'Century']
7


In [9]:
conditional_probability = {}
probability = {}

In [10]:
for outputClass in outputlabels:
    temp_dataframe = data.loc[data['Category']==outputClass]
    probability[outputClass]=(temp_dataframe.shape[0]/numtraindocuments)

In [11]:
print(probability)

{'Politics': 0.42857142857142855, 'Business': 0.2857142857142857, 'Sports': 0.2857142857142857}


In [12]:
ALPHA = 1

In [13]:
for outputClass in outputlabels:
    temp_dataframe = data.loc[data['Category']==outputClass]
    total_word_count_in_category =0
    for i in range(temp_dataframe.shape[0]):
        for word in words:
            total_word_count_in_category += temp_dataframe.iloc[i][word]
    for word in words:
        current_word_count_in_category =0
        for i in range(temp_dataframe.shape[0]):
            current_word_count_in_category += temp_dataframe.iloc[i][word]
            cur_prob = (current_word_count_in_category + ALPHA) / (total_word_count_in_category)
            conditional_probability[(word, outputClass)] = cur_prob

In [14]:
print("Conditional probability after applying smoothing\n")
conditional_probability

Conditional probability after applying smoothing



{('TDP', 'Politics'): 0.3111111111111111,
 ('Nifty', 'Politics'): 0.044444444444444446,
 ('Sidhu', 'Politics'): 0.08888888888888889,
 ('BJP', 'Politics'): 0.2222222222222222,
 ('Sensex', 'Politics'): 0.06666666666666667,
 ('Sixer', 'Politics'): 0.022222222222222223,
 ('Congress', 'Politics'): 0.4,
 ('Century', 'Politics'): 0.022222222222222223,
 ('TDP', 'Business'): 0.037037037037037035,
 ('Nifty', 'Business'): 0.37037037037037035,
 ('Sidhu', 'Business'): 0.037037037037037035,
 ('BJP', 'Business'): 0.18518518518518517,
 ('Sensex', 'Business'): 0.48148148148148145,
 ('Sixer', 'Business'): 0.037037037037037035,
 ('Congress', 'Business'): 0.07407407407407407,
 ('Century', 'Business'): 0.07407407407407407,
 ('TDP', 'Sports'): 0.04,
 ('Nifty', 'Sports'): 0.04,
 ('Sidhu', 'Sports'): 0.28,
 ('BJP', 'Sports'): 0.08,
 ('Sensex', 'Sports'): 0.04,
 ('Sixer', 'Sports'): 0.4,
 ('Congress', 'Sports'): 0.08,
 ('Century', 'Sports'): 0.36}

In [15]:
query_dict = {}
list_query_dict = []
for data in query_data:
    for i, word in enumerate(words) :
        query_dict[word] = data[i]
    list_query_dict.append(query_dict)
    query_dict = {}

In [16]:
list_query_dict

[{'TDP': 4,
  'Nifty': 0,
  'Sidhu': 2,
  'BJP': 0,
  'Sensex': 1,
  'Sixer': 0,
  'Congress': 6,
  'Century': 0},
 {'TDP': 0,
  'Nifty': 0,
  'Sidhu': 2,
  'BJP': 0,
  'Sensex': 0,
  'Sixer': 9,
  'Congress': 0,
  'Century': 9},
 {'TDP': 5,
  'Nifty': 0,
  'Sidhu': 2,
  'BJP': 5,
  'Sensex': 0,
  'Sixer': 9,
  'Congress': 0,
  'Century': 9}]

In [17]:
categorical_result_probability = {}
result_probability =[]
for query_dict in list_query_dict:
    for output_class in outputlabels :
        cur_prob = 1
        for word in words :
            cur_prob *= (conditional_probability[(word, output_class)] ** query_dict[word])
            categorical_result_probability[output_class] = cur_prob                  
    result_probability.append(categorical_result_probability)
    categorical_result_probability = {}

In [18]:
result_probability

[{'Politics': 2.0212765225616147e-08,
  'Business': 2.0530257296565003e-16,
  'Sports': 2.1045339750400004e-15},
 {'Politics': 1.3799701971155752e-32,
  'Business': 1.2077990336473961e-26,
  'Sports': 2.0872693292214035e-09},
 {'Politics': 2.1796398918917e-38,
  'Business': 1.8331882202741092e-37,
  'Sports': 7.003713677304523e-22}]

In [19]:
i=1
for categorical_result_probability in result_probability:
    result_category = max(categorical_result_probability, key=categorical_result_probability.get)
    result_score = categorical_result_probability[result_category]
    print(f"The query {i} entered belongs to the category : {result_category}")
    i=i+1

The query 1 entered belongs to the category : Politics
The query 2 entered belongs to the category : Sports
The query 3 entered belongs to the category : Sports
