## Classification - Numerical Columns

In [None]:
# load the iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
  
# store the feature matrix (X) and response vector (y)
X = iris.data
y = iris.target

# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
  
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
  
# making predictions on the testing set
y_pred = gnb.predict(X_test)
  
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)


Gaussian Naive Bayes model accuracy(in %): 95.0


In [5]:
from sklearn import model_selection
scores = model_selection.cross_val_score(gnb, X, y, 
											cv=5, 
											scoring='accuracy')
print("Gaussian Naive Bayes model accuracy(in %):", scores*100)
print("Accuracy: %0.2f (+/- %0.2f)"
		% (scores.mean()*100, scores.std()*100))

Gaussian Naive Bayes model accuracy(in %): [ 93.33333333  96.66666667  93.33333333  93.33333333 100.        ]
Accuracy: 95.33 (+/- 2.67)


In [14]:
# predict the target on the test dataset
predict_test = gnb.predict(X_test)
# Accuracy Score on test dataset
accuracy_test = metrics.accuracy_score(y_test,predict_test)
print('accuracy_score on test dataset : ', accuracy_test)

accuracy_score on test dataset :  0.9333333333333333


## Classification - Categorical Columns

In [43]:
# importing required libraries
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# read the train and test dataset
df = pd.read_csv('./Datasets/play_tennis.csv')
df.drop(columns="day", inplace=True)
df.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


**ENCODE CATEGORICAL COLUMNS**

In [44]:
df_encoded = pd.get_dummies(df , drop_first=True)
df_encoded.head()

Unnamed: 0,outlook_Rain,outlook_Sunny,temp_Hot,temp_Mild,humidity_Normal,wind_Weak,play_Yes
0,0,1,1,0,0,1,0
1,0,1,1,0,0,0,0
2,0,0,1,0,0,1,1
3,1,0,0,1,0,1,1
4,1,0,0,0,1,1,1


In [45]:
X = df_encoded.iloc[:, :-1]
y = df_encoded.iloc[:, -1]

In [46]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# train_data = pd.read_csv('train-data.csv')
# test_data = pd.read_csv('test-data.csv')

# shape of the dataset
print('Shape of training data :',X_train.shape)
print('Shape of testing data :',X_test.shape)

Shape of training data : (11, 6)
Shape of testing data : (3, 6)


In [None]:
'''
Create the object of the Naive Bayes model
You can also add other parameters and test your code here
Some parameters are : var_smoothing
Documentation of sklearn GaussianNB: 

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

 '''
model1 = GaussianNB()
model2 = MultinomialNB()
model3 = BernoulliNB()

# Create a list of labels for the classifiers
labels = ['GaussianNB', 'MultinomialNB', 'BernoulliNB']

# Loop through the classifiers and perform 5-fold cross-validation for each
for model, label in zip([model1, model2, model3], labels):
    # fit the model with the training data
    model.fit(X_train,y_train)

    # predict the target on the train dataset
    predict_train = model.predict(X_train)
    # print('Target on train data',predict_train) 

    # Accuray Score on train dataset
    print("-"*20+label+"-"*20)
    accuracy_train = accuracy_score(y_train,predict_train)
    print('accuracy_score on train dataset : ', accuracy_train)

    # predict the target on the test dataset
    predict_test = model.predict(X_test)
    # print('Target on test data',predict_test) 

    # Accuracy Score on test dataset
    accuracy_test = accuracy_score(y_test,predict_test)
    print('accuracy_score on test dataset : ', accuracy_test)

5-fold cross validation:

--------------------GaussianNB--------------------
accuracy_score on train dataset :  0.9090909090909091
accuracy_score on test dataset :  0.6666666666666666
--------------------MultinomialNB--------------------
accuracy_score on train dataset :  0.7272727272727273
accuracy_score on test dataset :  0.6666666666666666
--------------------BernoulliNB--------------------
accuracy_score on train dataset :  0.9090909090909091
accuracy_score on test dataset :  0.6666666666666666


**Label Encoder**

*Using OHE ENCODE we getting low accuracy trying label Encoder*

In [48]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column]=le.fit_transform(df[column])



In [49]:
df.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1


In [50]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [51]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# train_data = pd.read_csv('train-data.csv')
# test_data = pd.read_csv('test-data.csv')

# shape of the dataset
print('Shape of training data :',X_train.shape)
print('Shape of testing data :',X_test.shape)

Shape of training data : (11, 4)
Shape of testing data : (3, 4)


In [52]:
'''
Create the object of the Naive Bayes model
You can also add other parameters and test your code here
Some parameters are : var_smoothing
Documentation of sklearn GaussianNB: 

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

 '''
model1 = GaussianNB()
model2 = MultinomialNB()
model3 = BernoulliNB()

# Create a list of labels for the classifiers
labels = ['GaussianNB', 'MultinomialNB', 'BernoulliNB']

# Loop through the classifiers and perform 5-fold cross-validation for each
for model, label in zip([model1, model2, model3], labels):
    # fit the model with the training data
    model.fit(X_train,y_train)

    # predict the target on the train dataset
    predict_train = model.predict(X_train)
    # print('Target on train data',predict_train) 

    # Accuray Score on train dataset
    print("-"*20+label+"-"*20)
    accuracy_train = accuracy_score(y_train,predict_train)
    print('accuracy_score on train dataset : ', accuracy_train)

    # predict the target on the test dataset
    predict_test = model.predict(X_test)
    # print('Target on test data',predict_test) 

    # Accuracy Score on test dataset
    accuracy_test = accuracy_score(y_test,predict_test)
    print('accuracy_score on test dataset : ', accuracy_test)

--------------------GaussianNB--------------------
accuracy_score on train dataset :  0.9090909090909091
accuracy_score on test dataset :  0.6666666666666666
--------------------MultinomialNB--------------------
accuracy_score on train dataset :  0.7272727272727273
accuracy_score on test dataset :  0.6666666666666666
--------------------BernoulliNB--------------------
accuracy_score on train dataset :  0.8181818181818182
accuracy_score on test dataset :  0.6666666666666666


## Classification - Numerical and Categorical Column

In [3]:
# importing required libraries
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# read the train and test dataset
df = pd.read_csv('./Datasets/Social_Network_Ads.csv')
df.drop(columns="User ID", inplace=True)
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


**ENCODE CATEGORICAL COLUMNS**

In [4]:
df_encoded = pd.get_dummies(df , drop_first=True)
df_encoded.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [10]:
X = df_encoded.iloc[:, [0, 1, 3]]
y = df_encoded.iloc[:, [2]]
y

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0
...,...
395,1
396,1
397,1
398,0


In [9]:
X

Unnamed: 0,Age,EstimatedSalary,Gender_Male
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1
...,...,...,...
395,46,41000,0
396,51,23000,1
397,50,20000,0
398,36,33000,1


In [11]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# train_data = pd.read_csv('train-data.csv')
# test_data = pd.read_csv('test-data.csv')

# shape of the dataset
print('Shape of training data :',X_train.shape)
print('Shape of testing data :',X_test.shape)

Shape of training data : (320, 3)
Shape of testing data : (80, 3)


In [12]:
'''
Create the object of the Naive Bayes model
You can also add other parameters and test your code here
Some parameters are : var_smoothing
Documentation of sklearn GaussianNB: 

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

 '''
model1 = GaussianNB()
model2 = MultinomialNB()
model3 = BernoulliNB()

# Create a list of labels for the classifiers
labels = ['GaussianNB', 'MultinomialNB', 'BernoulliNB']

# Loop through the classifiers and perform 5-fold cross-validation for each
for model, label in zip([model1, model2, model3], labels):
    # fit the model with the training data
    model.fit(X_train,y_train)

    # predict the target on the train dataset
    predict_train = model.predict(X_train)
    # print('Target on train data',predict_train) 

    # Accuray Score on train dataset
    print("-"*20+label+"-"*20)
    accuracy_train = accuracy_score(y_train,predict_train)
    print('accuracy_score on train dataset : ', accuracy_train)

    # predict the target on the test dataset
    predict_test = model.predict(X_test)
    # print('Target on test data',predict_test) 

    # Accuracy Score on test dataset
    accuracy_test = accuracy_score(y_test,predict_test)
    print('accuracy_score on test dataset : ', accuracy_test)

--------------------GaussianNB--------------------
accuracy_score on train dataset :  0.9
accuracy_score on test dataset :  0.8625
--------------------MultinomialNB--------------------
accuracy_score on train dataset :  0.69375
accuracy_score on test dataset :  0.55
--------------------BernoulliNB--------------------
accuracy_score on train dataset :  0.653125
accuracy_score on test dataset :  0.6


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


**Label Encoder**

*Using OHE ENCODE we getting low accuracy trying label Encoder*

In [13]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column]=le.fit_transform(df[column])

In [14]:
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


In [15]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [16]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# train_data = pd.read_csv('train-data.csv')
# test_data = pd.read_csv('test-data.csv')

# shape of the dataset
print('Shape of training data :',X_train.shape)
print('Shape of testing data :',X_test.shape)

Shape of training data : (320, 3)
Shape of testing data : (80, 3)


In [17]:
'''
Create the object of the Naive Bayes model
You can also add other parameters and test your code here
Some parameters are : var_smoothing
Documentation of sklearn GaussianNB: 

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

 '''
model1 = GaussianNB()
model2 = MultinomialNB()
model3 = BernoulliNB()

# Create a list of labels for the classifiers
labels = ['GaussianNB', 'MultinomialNB', 'BernoulliNB']

# Loop through the classifiers and perform 5-fold cross-validation for each
for model, label in zip([model1, model2, model3], labels):
    # fit the model with the training data
    model.fit(X_train,y_train)

    # predict the target on the train dataset
    predict_train = model.predict(X_train)
    # print('Target on train data',predict_train) 

    # Accuray Score on train dataset
    print("-"*20+label+"-"*20)
    accuracy_train = accuracy_score(y_train,predict_train)
    print('accuracy_score on train dataset : ', accuracy_train)

    # predict the target on the test dataset
    predict_test = model.predict(X_test)
    # print('Target on test data',predict_test) 

    # Accuracy Score on test dataset
    accuracy_test = accuracy_score(y_test,predict_test)
    print('accuracy_score on test dataset : ', accuracy_test)

--------------------GaussianNB--------------------
accuracy_score on train dataset :  0.9
accuracy_score on test dataset :  0.8625
--------------------MultinomialNB--------------------
accuracy_score on train dataset :  0.69375
accuracy_score on test dataset :  0.55
--------------------BernoulliNB--------------------
accuracy_score on train dataset :  0.653125
accuracy_score on test dataset :  0.6
