## Prepare a classification model using Naive Bayes 

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix

In [2]:
# Load the pre-split training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/SalaryData_Train.csv", "/content/SalaryData_Test.csv")
)

In [3]:
# Preview the first rows of the raw training data.
train_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Summarise the column dtypes and non-null counts of the training data.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [9]:
# Collect the names of the object-dtype (text) columns, then drop the last
# one so that only feature columns remain in the list.
is_object_column = train_data.dtypes == 'object'
object_columns = train_data.columns[is_object_column].tolist()
column_data = object_columns[:-1]

In [10]:
# Display the list of text columns selected above.
column_data

['workclass',
 'education',
 'maritalstatus',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native']

In [11]:
# Label encoder instance, reused for every categorical column.
le=LabelEncoder()

In [12]:
# Encode every categorical feature with ONE mapping shared by both data sets.
# Fitting the encoder separately on train and test can assign different
# integers to the same category whenever the two sets do not contain exactly
# the same values, which silently corrupts the test features.
for col in column_data:
    # Learn the category vocabulary from both sets so that no value is
    # unseen at transform time, then apply the same codes to each set.
    le.fit(pd.concat([train_data[col], test_data[col]]))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

In [13]:
# Preview the training data after the categorical columns were encoded.
train_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,9,13,4,0,1,4,1,2174,0,40,37,<=50K
1,50,4,9,13,2,3,0,4,1,0,0,13,37,<=50K
2,38,2,11,9,0,5,1,4,1,0,0,40,37,<=50K
3,53,2,1,7,2,5,0,2,1,0,0,40,37,<=50K
4,28,2,9,13,2,9,5,2,0,0,0,40,4,<=50K


In [14]:
# Preview the test data after the categorical columns were encoded.
test_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,2,1,7,4,6,3,2,1,0,0,40,37,<=50K
1,38,2,11,9,2,4,0,4,1,0,0,50,37,<=50K
2,28,1,7,12,2,10,0,4,1,0,0,40,37,>50K
3,44,2,15,10,2,6,0,2,1,7688,0,40,37,>50K
4,34,2,0,6,4,7,1,4,1,0,0,30,37,<=50K


In [21]:
# Integer codes for the two salary classes. The keys include a leading
# space, which must match the raw Salary strings for a replacement to apply.
mapping = {
    ' >50K': 1,
    ' <=50K': 2,
}

In [22]:
# Swap the Salary strings for their integer codes in both data sets.
salary_codes = {'Salary': mapping}
train_data = train_data.replace(salary_codes)
test_data = test_data.replace(salary_codes)

In [23]:
# Preview the training data after the Salary column was mapped to integers.
train_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,9,13,4,0,1,4,1,2174,0,40,37,2
1,50,4,9,13,2,3,0,4,1,0,0,13,37,2
2,38,2,11,9,0,5,1,4,1,0,0,40,37,2
3,53,2,1,7,2,5,0,2,1,0,0,40,37,2
4,28,2,9,13,2,9,5,2,0,0,0,40,4,2


In [24]:
# Count the rows per salary class in the training data.
train_data.Salary.value_counts()

2    22653
1     7508
Name: Salary, dtype: int64

In [25]:
# Count the rows per salary class in the test data.
test_data.Salary.value_counts()

2    11360
1     3700
Name: Salary, dtype: int64

In [26]:
# Show (rows, columns) for the training and test data.
train_data.shape, test_data.shape

((30161, 14), (15060, 14))

In [27]:
# Separate the predictors (columns 0-12) from the target (column 13) by
# position, identically for the training and the test data.
feature_columns = slice(0, 13)
target_column = 13
X_train, y_train = train_data.iloc[:, feature_columns], train_data.iloc[:, target_column]
X_test, y_test = test_data.iloc[:, feature_columns], test_data.iloc[:, target_column]

## MultinomialNB

In [28]:
# Multinomial Naive Bayes classifier created with its default arguments.
MN=MultinomialNB()

In [29]:
# Train the multinomial model on the training features and labels.
MN.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
# Predict the salary class for every row of the test features.
y_pred=MN.predict(X_test)

In [32]:
# Cross-tabulate the true test labels against the multinomial predictions.
confusion_matrix(y_test,y_pred)

array([[  780,  2920],
       [  469, 10891]])

In [38]:
# Report test accuracy as a whole-number percentage. Scaling to percent
# before rounding avoids float artefacts such as 0.57 * 100 evaluating to
# 56.99999999999999 when an already-rounded fraction is multiplied.
print(np.round(accuracy_score(y_test, y_pred) * 100), '%')

77.0 %


## GaussianNB

In [39]:
# Gaussian Naive Bayes classifier created with its default arguments.
GN=GaussianNB()

In [40]:
# Train the Gaussian model on the same training features and labels.
GN.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [41]:
# Predict on the test features; this rebinds y_pred, replacing the
# predictions stored by the earlier MultinomialNB cell.
y_pred=GN.predict(X_test)

In [42]:
# Cross-tabulate the true test labels against the Gaussian predictions.
confusion_matrix(y_test,y_pred)

array([[ 1209,  2491],
       [  601, 10759]])

In [43]:
# Report test accuracy as a whole-number percentage. Scaling to percent
# before rounding avoids float artefacts such as 0.57 * 100 evaluating to
# 56.99999999999999 when an already-rounded fraction is multiplied.
print(np.round(accuracy_score(y_test, y_pred) * 100), '%')

79.0 %


# CONCLUSION: The GaussianNB model achieves the better test accuracy (79%, versus 77% for MultinomialNB).