Write a program to implement the naïve Bayesian classifier for a sample training data set stored 
as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets. 

In [16]:
# import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score

# Load the header-less, numerically encoded dataset from CSV
dataset = pd.read_csv('ConceptLearning.csv', header=None)
# Every column except the last is a feature; the last column is the class.
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

# fit() returns the estimator itself, so construction and training can be chained.
classifier = GaussianNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy is:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[0 0]
 [2 1]]
Accuracy is: 0.3333333333333333


In [17]:
#Write a program to implement the naïve Bayesian classifier for a sample training data set
#stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.
print("\nNaive Bayes Classifier for concept learning problem")
import csv
import math

def safe_div(x,y):
    """Divide ``x`` by ``y``, yielding 0 instead of raising when ``y`` equals zero."""
    return 0 if y == 0 else x / y

# 1.Data Handling
# 1.1 Loading the Data from csv file of ConceptLearning dataset.
def loadCsv(filename):
    """Load a purely numeric CSV file as a list of rows of floats.

    Args:
        filename: Path of the CSV file. Every cell must be parseable by
            ``float``; a header row is not supported.

    Returns:
        A list holding one ``list`` of floats per non-empty line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a cell cannot be converted to ``float``.
    """
    # The context manager guarantees the file handle is closed; newline='' is
    # what the csv module asks for so it can handle line endings itself.
    with open(filename, newline='') as csvFile:
        # csv.reader yields [] for blank lines (e.g. a trailing newline); skip
        # them so later code never meets a row without a class value.
        return [[float(cell) for cell in row] for row in csv.reader(csvFile) if row]


#1.2 Splitting the dataset into a training set and a test set
def splitDataset(dataset, splitRatio):
    """Split ``dataset`` into a leading training part and a trailing test part.

    The split is deterministic: the first ``int(len(dataset) * splitRatio)``
    rows form the training set and the remaining rows form the test set.
    ``dataset`` itself is not modified.

    Args:
        dataset: Sequence of rows.
        splitRatio: Fraction of rows used for training, normally in [0, 1].

    Returns:
        ``[trainSet, testSet]`` as two new lists.
    """
    rows = list(dataset)
    # Clamp the size so an out-of-range ratio gives an empty or complete
    # training set instead of popping from an exhausted list.
    trainSize = max(0, min(len(rows), int(len(rows) * splitRatio)))
    # Slicing is linear, whereas removing the head row by row is quadratic.
    return [rows[:trainSize], rows[trainSize:]]


#2.Summarize Data
#The naive Bayes model consists of a
#summary of the data in the training dataset.
#This summary is then used when making predictions.
#It involves the mean and the standard deviation of each attribute, per class value.

#2.1: Separate Data By Class
#Function to categorize the dataset in terms of classes 
#The function assumes that the last attribute (-1) is the class value. 
#The function returns a map of class values to lists of data instances.

def separateByClass(dataset):
    """Group rows by class value, the class value being each row's last element.

    Returns a dict that maps every class value to the list of rows carrying
    it; class values and rows keep their order of first appearance.
    """
    groups = {}
    for row in dataset:
        groups.setdefault(row[-1], []).append(row)
    return groups

#The mean is the central tendency of the data,
# and we will use it as the centre of our Gaussian distribution
# when calculating probabilities

#2.2 : Calculate Mean
def mean(numbers):
    """Return the arithmetic mean of ``numbers``; an empty sequence gives 0."""
    total = sum(numbers)
    count = float(len(numbers))
    # safe_div returns 0 when count is zero instead of raising.
    return safe_div(total, count)

#The standard deviation describes the spread of the data,
#and we will use it to characterize the expected spread of each attribute
#in our Gaussian distribution when calculating probabilities.

#2.3 : Calculate Standard Deviation
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The summed squared deviations from the mean are divided by
    ``len(numbers) - 1``; when that divisor is zero ``safe_div`` yields 0, so
    a single value has a standard deviation of 0.
    """
    centre = mean(numbers)
    squaredDeviations = sum((value - centre) ** 2 for value in numbers)
    degreesOfFreedom = float(len(numbers) - 1)
    return math.sqrt(safe_div(squaredDeviations, degreesOfFreedom))

#2.4 : Summarize Dataset
#Summarize Data Set for a list of instances (for a class value) 
#The zip function groups the values for each attribute across our data instances 
#into their own lists so that we can compute the mean and standard deviation values 
#for the attribute. 

def summarize(dataset):
    """Return one ``(mean, stdev)`` tuple per attribute column of ``dataset``.

    Rows are transposed into columns with ``zip``; the statistics of the last
    column (the class value) are dropped before returning.
    """
    summaries = []
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column)))
    # Discard the entry that belongs to the class column.
    summaries.pop()
    return summaries

#2.5 : Summarize Attributes By Class
#We can pull it all together by first separating our training dataset into 
#instances grouped by class. Then we calculate the summaries for each attribute.

def summarizeByClass(dataset):
    """Summarize the attributes of ``dataset`` separately for every class.

    Returns a dict mapping each class value to the ``summarize`` result of
    the rows belonging to it, and prints that dict as a side effect.
    """
    summaries = {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }
    print("Summarize Attributes By Class\n",summaries)
    print(" ")
    return summaries

#3.Make Prediction
#3.1 Calculate Probability Density Function
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian density with the given ``mean``/``stdev`` at ``x``.

    Both divisions go through ``safe_div``, so a ``stdev`` of 0 does not
    raise; the normalising factor becomes 0 and the result is therefore 0.
    """
    squaredDistance = math.pow(x-mean,2)
    twiceVariance = 2*math.pow(stdev,2)
    exponent = math.exp(-safe_div(squaredDistance, twiceVariance))
    normaliser = math.sqrt(2*math.pi) * stdev
    return safe_div(1 , normaliser) * exponent

#3.2 Calculate Class Probabilities
def calculateClassProbabilities(summaries, inputVector):
    """Compute the naive Bayes likelihood of ``inputVector`` for every class.

    Args:
        summaries: Mapping of class value to a list of ``(mean, stdev)``
            tuples, one tuple per attribute.
        inputVector: Attribute values indexed like the tuples in
            ``summaries``; extra trailing entries (such as the class value
            of a test row) are ignored.

    Returns:
        A dict mapping each class value to the product of the per-attribute
        Gaussian densities. An empty ``summaries`` gives an empty dict.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        # Accumulate the product inside the class loop so that every class is
        # scored with its own attribute statistics, not just the last one.
        likelihood = 1
        for i, (mu, sigma) in enumerate(classSummaries):
            likelihood *= calculateProbability(inputVector[i], mu, sigma)
        probabilities[classValue] = likelihood
    return probabilities

#3.3 Prediction : look for the largest probability and return the associated class
def predict(summaries, inputVector):
    """Return the class value with the largest computed probability.

    Ties keep the class encountered first; ``None`` is returned when
    ``summaries`` yields no classes.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner = None
    winnerScore = -1
    for label in scores:
        score = scores[label]
        # Skip candidates that do not strictly beat the current winner.
        if winner is not None and not score > winnerScore:
            continue
        winner, winnerScore = label, score
    return winner

#4.Make Predictions
# Function which returns the predictions for a list of test instances,
# one prediction for each instance
def getPredictions(summaries, testSet):
    """Return the predicted class of every row in ``testSet``, in row order."""
    return [predict(summaries, row) for row in testSet]

#5. Computing Accuracy 
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    ``predictions[i]`` is compared with the last element of ``testSet[i]``;
    an empty ``testSet`` gives 0 because the division goes through ``safe_div``.
    """
    hits = sum(1 for i, row in enumerate(testSet) if row[-1] == predictions[i])
    return safe_div(hits, float(len(testSet))) * 100.0

def main():
    """Train and evaluate the hand-written naive Bayes model on ConceptLearning.csv.

    Loads the numerically encoded dataset, uses the leading 90% of the rows
    for training and the rest for testing, then prints both sets, the
    per-class attribute summaries, the actual and predicted class values of
    the test rows, and the resulting accuracy in percent.
    """
    filename = 'ConceptLearning.csv'
    splitRatio = 0.9
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into'.format(len(dataset)))
    print('Number of Training data: ' + (repr(len(trainingSet))))
    print('Number of Test Data: ' + (repr(len(testSet))))
    print("\nThe values assumed for the concept learning attributes are\n")
    print("OUTLOOK=> Sunny=1 Overcast=2 Rain=3\nTEMPERATURE=> Hot=1 Mild=2 Cool=3\nHUMIDITY=> High=1 Normal=2\nWIND=> Weak=1 Strong=2")
    print("TARGET CONCEPT:PLAY TENNIS=> Yes=10 No=5")
    print("\nThe Training set are:")
    for x in trainingSet:
        print(x)
    print("\nThe Test data set are:")
    for x in testSet:
        print(x)
    print("\n")

    # prepare model
    summaries = summarizeByClass(trainingSet)

    # test model
    predictions = getPredictions(summaries, testSet)
    # The class value is the last element of a row; collect it for every
    # test row so the list lines up one-to-one with the predictions.
    actual = [row[-1] for row in testSet]

    # Actual values and predictions are class labels, so they are printed
    # without a percent sign; only the accuracy is a percentage.
    print('Actual values: {0}'.format(actual))
    print('Predictions: {0}'.format(predictions))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

# Run the demo only when executed as a script or notebook cell (where
# __name__ is "__main__"), not when these definitions are imported.
if __name__ == "__main__":
    main()



Naive Bayes Classifier for concept learning problem
Split 14 rows into
Number of Training data: 12
Number of Test Data: 2

The values assumed for the concept learning attributes are

OUTLOOK=> Sunny=1 Overcast=2 Rain=3
TEMPERATURE=> Hot=1 Mild=2 Cool=3
HUMIDITY=> High=1 Normal=2
WIND=> Weak=1 Strong=2
TARGET CONCEPT:PLAY TENNIS=> Yes=10 No=5

The Training set are:
[1.0, 1.0, 1.0, 1.0, 5.0]
[1.0, 1.0, 1.0, 2.0, 5.0]
[2.0, 1.0, 1.0, 1.0, 10.0]
[3.0, 2.0, 1.0, 1.0, 10.0]
[3.0, 3.0, 2.0, 1.0, 10.0]
[3.0, 3.0, 2.0, 2.0, 5.0]
[2.0, 3.0, 2.0, 2.0, 10.0]
[1.0, 2.0, 1.0, 1.0, 5.0]
[1.0, 3.0, 2.0, 1.0, 10.0]
[3.0, 2.0, 2.0, 1.0, 10.0]
[1.0, 2.0, 2.0, 2.0, 10.0]
[2.0, 2.0, 1.0, 2.0, 10.0]

The Test data set are:
[2.0, 1.0, 2.0, 1.0, 10.0]
[3.0, 2.0, 1.0, 2.0, 5.0]


Summarize Attributes By Class
 {5.0: [(1.5, 1.0), (1.75, 0.9574271077563381), (1.25, 0.5), (1.5, 0.5773502691896257)], 10.0: [(2.125, 0.8345229603962802), (2.25, 0.7071067811865476), (1.625, 0.5175491695067657), (1.375, 0.51754916950

In [18]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Read the PlayTennis training data into a DataFrame.
dataset=pd.read_csv("traintennis.csv")

# Bare expression: in a notebook this displays the DataFrame as the cell output.
dataset

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [19]:
# Features are every column but the last. Take an explicit copy so that the
# later label-encoding assignments modify x itself rather than a slice of
# dataset (assigning into such a slice triggers SettingWithCopyWarning).
x = dataset.iloc[:, :-1].copy()
# The target is the last column.
y = dataset.iloc[:, -1]

In [20]:
# Inspect the feature columns before they are label-encoded.
print(x)

     Outlook Temperature Humidity    Wind
0      Sunny         Hot     High    Weak
1      Sunny         Hot     High  Strong
2   Overcast         Hot     High    Weak
3       Rain        Mild     High    Weak
4       Rain        Cool   Normal    Weak
5       Rain        Cool   Normal  Strong
6   Overcast        Cool   Normal  Strong
7      Sunny        Mild     High    Weak
8      Sunny        Cool   Normal    Weak
9       Rain        Mild   Normal    Weak
10     Sunny        Mild   Normal  Strong
11  Overcast        Mild     High  Strong
12  Overcast         Hot   Normal    Weak
13      Rain        Mild     High  Strong


In [21]:
# Inspect the target column before it is label-encoded.
print(y)

0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: PlayTennis, dtype: object


In [None]:
# Replace the Outlook strings with LabelEncoder's integer codes.
le_outlook = LabelEncoder()
x["Outlook"] = le_outlook.fit_transform(x["Outlook"])

In [23]:
# Replace the Temperature strings with LabelEncoder's integer codes.
le_temp = LabelEncoder()
x["Temperature"] = le_temp.fit_transform(x["Temperature"])

In [24]:
# Replace the Humidity strings with LabelEncoder's integer codes.
le_humidity = LabelEncoder()
x["Humidity"] = le_humidity.fit_transform(x["Humidity"])

In [25]:
# Replace the Wind strings with LabelEncoder's integer codes.
le_wind = LabelEncoder()
x["Wind"] = le_wind.fit_transform(x["Wind"])

In [26]:
# Encode the target labels with a LabelEncoder; y is rebound to the encoded
# values returned by fit_transform.
le_play=LabelEncoder()
y=le_play.fit_transform(y)

In [27]:
# Inspect the feature columns after label encoding.
print(x)

    Outlook  Temperature  Humidity  Wind
0         2            1         0     1
1         2            1         0     0
2         0            1         0     1
3         1            2         0     1
4         1            0         1     1
5         1            0         1     0
6         0            0         1     0
7         2            2         0     1
8         2            0         1     1
9         1            2         1     1
10        2            2         1     0
11        0            2         0     0
12        0            1         1     1
13        1            2         0     0


In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=0
)

# fit() returns the estimator itself, so construction and training can be chained.
nb = GaussianNB().fit(x_train, y_train)
y_pred = nb.predict(x_test)

print("confusion Matrix\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score", accuracy_score(y_test, y_pred))

confusion Matrix
 [[0 0]
 [2 1]]
Accuracy Score 0.3333333333333333
