## Import libraries

In [803]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, f1_score, recall_score,confusion_matrix

## Basic EDA

In [804]:
data= pd.read_csv("../Downloads/adult.csv")
df= pd.DataFrame(data)
#Displaying number of rows
count_row = df.shape[0]
print(df.columns)
print("Number of rows: ",count_row)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')
Number of rows:  32561


In [805]:
# find categorical variables
categorical = [var for var in df.columns if df[var].dtype=='O']

print('Dataset has {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)

Dataset has 9 categorical variables

The categorical variables are :

 ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']


In [806]:
# find numerical variables
numerical = [var for var in df.columns if df[var].dtype!='O']

print('Dataset has {} numerical variables\n'.format(len(numerical)))

print('The numerical variables are :\n\n', numerical)

Dataset has 6 numerical variables

The numerical variables are :

 ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']


In [807]:
#replace '?' in dataset with NaN
df.replace('?', np.nan, inplace=True)

In [808]:
#checking cardinality in categorical variables
for var in categorical:
    print(var, 'consists of', len(df[var].unique()),'labels')

workclass consists of 9 labels
education consists of 16 labels
marital.status consists of 7 labels
occupation consists of 15 labels
relationship consists of 6 labels
race consists of 5 labels
sex consists of 2 labels
native.country consists of 42 labels
income consists of 2 labels


In [809]:
#checking cardinality in numerical variables
for var in numerical:
    print(var, 'consists of', len(df[var].unique()),'labels')

age consists of 73 labels
fnlwgt consists of 21648 labels
education.num consists of 16 labels
capital.gain consists of 119 labels
capital.loss consists of 92 labels
hours.per.week consists of 94 labels


### Checking for missing values

In [810]:
df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [811]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


### Handling missing values

In [812]:
val=int(input("\n1.Handling missing values by dropping\n2.Handling missing values using Mode\nChoose one strategy :"))
if val == 1:
    #dropping rows with NaN
    df=df.dropna()
    df.head()
    
else:
    def imputenull(MyFrame,ColName):
        most_frequent_category=MyFrame[ColName].mode()[0]
        # replace null values with most occured category
        MyFrame[ColName].fillna(most_frequent_category,inplace=True)
    #replacing missing values in workclass,occupation and native.country with their modes
    for Columns in ['workclass','occupation','native.country']:
        imputenull(df,Columns)
    df.isnull().sum()


1.Handling missing values by dropping
2.Handling missing values using Mode
Choose one strategy :2


In [813]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [814]:
df.shape

(32561, 15)

In [815]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [816]:
df['age']=pd.qcut(df['age'], q=3,labels=['low','medium', 'high'])
df['education.num']=pd.qcut(df['education.num'], q=3,labels=['low','medium', 'high'])
df['fnlwgt']=pd.qcut(df['fnlwgt'], q=3,labels=['low','medium', 'high'])
df['hours.per.week']=pd.cut(df['hours.per.week'],bins=3,labels=['low','medium', 'high'])
df['capital.loss']=pd.cut(df['capital.loss'],bins=3,labels=['low','medium', 'high'])
df['capital.gain']=pd.cut(df['capital.gain'],bins=3,labels=['low','medium', 'high'])

In [817]:
df['income'].replace(['<=50K', '>50K'],[0, 1], inplace=True)

In [818]:
df.shape

(32561, 15)

### Splitting dataset into train and test

In [819]:
#declaring feature vectors and target variable
X = df                  #feature matrix
y = df['income']        #target variable

In [820]:
#splitting dataset into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
 

In [821]:
X_test.shape

(9769, 15)

## Building the Naive Bayes Classifier

Naive Bayes probability -
X= x1,x2,x3 (feature matrix)
y= 'income'   (target variable)
p(y/X)= max(P(y=0/X),P(y=1/X))

In [822]:
#initialising a list that will hold all the predicted outcomes
predicted=[]

In [823]:
#this function both trains and tests the datasets passed to it using the equiwidth binning approach
def BuildAndTest(dataset, testset):
    #this function calculates number of occurences of given label
    def count(df,colname,label,target):
        condition = (df[colname] == label) & (df['income'] == target)
        return len(df[condition])

    count_no = count(dataset,'income',0,0)  
    count_yes = count(dataset,'income',1,1)

    prob_0 = count_no/len(dataset)
    prob_1 = count_yes/len(dataset)

    probabilities = {0:{},1:{}}
    for col in dataset.columns[:-1]:
        probabilities[0][col] = {}
        probabilities[1][col] = {}
        labels = sorted(list(dataset[col].unique()))
        for category in labels:
                count_ct_0 = count(dataset,col,category,0)
                count_ct_1 = count(dataset,col,category,1)
                probabilities[0][col][category] = count_ct_0 / count_no
                probabilities[1][col][category] = count_ct_1 / count_yes
    
    print("example of probability of continuous attribute 'age':",probabilities[0]['age'])
    print("example of probability of categorical attribute 'marital.status':",probabilities[1]['marital.status'])
    global predicted
    for row in range(0,len(testset)):
        prod_0 = prob_0
        prod_1 = prob_1

        for feature in testset.columns[:-1]:
            prod_0 *= probabilities[0][feature][testset[feature].iloc[row]]
            prod_1 *= probabilities[1][feature][testset[feature].iloc[row]]

        #Predict the outcome
        if prod_0 > prod_1:
            predicted.append(0)
        else:
            predicted.append(1)


## Accuracy 

In [824]:
def accuracymeasure(testset):
    num_correct = 0
    global predicted
    for i in range(len(predicted)):
        if predicted[i] == testset['income'].iloc[i]:
            num_correct += 1
    print("Confusion matrix showing TP FP FN TN\n",confusion_matrix(testset['income'],predicted))
    print('Accuracy of your model is',num_correct / len(predicted))
    #Precision = TP/(TP+FP) 
    precision = precision_score(testset['income'], predicted,average='binary')
    print("Your precision score is :",precision)
    #Recall = TP/(TP+FN)	
    recall = recall_score(testset['income'], predicted, average='binary')
    print("Your recall score is :",recall)
    #F1 = precision.recall/(precision+recall)
    print("Your F1 score is :",f1_score(testset['income'], predicted))

## Testing 

In [825]:
value = int(input("\n1.Building classifier using binning method\n2.Building classifier assuming gaussian distribution\nChoose one strategy :"))
if value == 1:
    BuildAndTest(X_train,X_test)
    accuracymeasure(X_test)
    pred=pd.DataFrame(predicted)
    pred.replace([0, 1],['<=50K', '>50K'], inplace=True)
    print(pred)
    
else:
    print("gaussian")


1.Building classifier using binning method
2.Building classifier assuming gaussian distribution
Choose one strategy :1
example of probability of continuous attribute 'age': {'high': 0.2727325245522819, 'low': 0.42744078567302135, 'medium': 0.29982668977469673}
example of probability of categorical attribute 'marital.status': {'Divorced': 0.06129149945275447, 'Married-AF-spouse': 0.0012769062385990515, 'Married-civ-spouse': 0.8549799343305363, 'Married-spouse-absent': 0.004013133892739876, 'Never-married': 0.06092666909886903, 'Separated': 0.00784385260853703, 'Widowed': 0.009668004377964246}
Confusion matrix showing TP FP FN TN
 [[6245 1165]
 [ 639 1720]]
Accuracy of your model is 0.8153342204933974
Your precision score is : 0.5961871750433275
Your recall score is : 0.7291225095379398
Your F1 score is : 0.6559877955758963
          0
0     <=50K
1     <=50K
2     <=50K
3      >50K
4      >50K
...     ...
9764  <=50K
9765  <=50K
9766  <=50K
9767  <=50K
9768   >50K

[9769 rows x 1 colum

In [826]:
file= pd.read_csv("../Downloads/test.csv")


## 10-Fold Cross Validation

In [827]:
#K-FOLD CROSS VALIDATION
fold1 = df.loc[0:3255]                                            
fold2 = df.loc[3256:6511]
fold3 = df.loc[6512:9767]
fold4 = df.loc[9768:13023]
fold5 = df.loc[13024:16279]
fold6 = df.loc[16280:19535]                                            
fold7 = df.loc[19536:22792]
fold8 = df.loc[22793:26048]
fold9 = df.loc[26049:29304]
fold10 = df.loc[29304:32560]

In [828]:
train_val10 = pd.concat([fold1, fold2, fold3, fold4,fold5, fold6, fold7, fold8, fold10 ])
test_val10 = fold10

train_val9 = pd.concat([fold1, fold2, fold3, fold4,fold5, fold6, fold7, fold8, fold9 ])
test_val9 = fold9

train_val8 = pd.concat([fold1, fold2, fold3, fold4,fold5, fold6, fold7, fold9, fold10 ])
test_val8 = fold8

train_val7 = pd.concat([fold1, fold2, fold3, fold4,fold5, fold6, fold8, fold9, fold10 ])
test_val7 = fold7

train_val6 = pd.concat([fold1, fold2, fold3, fold4,fold5, fold7, fold8, fold9, fold10 ])
test_val6 = fold6

train_val5 = pd.concat([fold1, fold2, fold3, fold4, fold6, fold7, fold8, fold9, fold10 ])
test_val5 = fold5

train_val4 = pd.concat([fold1, fold2, fold3,fold5, fold6, fold7, fold8, fold9, fold10 ])
test_val4 = fold4

train_val3 = pd.concat([fold1, fold2, fold4,fold5, fold6, fold7, fold8, fold9, fold10 ])
test_val3 = fold3

train_val2 = pd.concat([fold1, fold3, fold4,fold5, fold6, fold7, fold8, fold9, fold10 ])
test_val2 = fold2

train_val1 = pd.concat([fold2, fold3, fold4,fold5, fold6, fold7, fold8, fold9, fold10 ])
test_val1 = fold1

In [829]:
predicted=[]
BuildAndTest(train_val10,test_val10)
accuracymeasure(test_val10)
print("\n")
predicted=[]
BuildAndTest(train_val9,test_val9)
accuracymeasure(test_val9)
predicted=[]
print("\n")
BuildAndTest(train_val8,test_val8)
accuracymeasure(test_val8)
predicted=[]
print("\n")
BuildAndTest(train_val7,test_val7)
accuracymeasure(test_val7)
predicted=[]
print("\n")
BuildAndTest(train_val6,test_val6)
accuracymeasure(test_val6)
predicted=[]
print("\n")
BuildAndTest(train_val5,test_val5)
accuracymeasure(test_val5)
predicted=[]
print("\n")
BuildAndTest(train_val4,test_val4)
accuracymeasure(test_val4)
predicted=[]
print("\n")
BuildAndTest(train_val3,test_val3)
accuracymeasure(test_val3)
predicted=[]
print("\n")
BuildAndTest(train_val2,test_val2)
accuracymeasure(test_val2)
predicted=[]
print("\n")

example of probability of continuous attribute 'age': {'high': 0.27327055414704815, 'low': 0.4268381021369069, 'medium': 0.2998913437160449}
example of probability of categorical attribute 'marital.status': {'Divorced': 0.05971183153228041, 'Married-AF-spouse': 0.0013854253255749516, 'Married-civ-spouse': 0.8513438625658077, 'Married-spouse-absent': 0.0042948185092823495, 'Never-married': 0.06386810750900526, 'Separated': 0.00886672208367969, 'Widowed': 0.010529232474369632}
Confusion matrix showing TP FP FN TN
 [[2198  431]
 [ 164  464]]
Accuracy of your model is 0.8173165489714461
Your precision score is : 0.5184357541899441
Your recall score is : 0.7388535031847133
Your F1 score is : 0.6093237032173342


example of probability of continuous attribute 'age': {'high': 0.27267789244975554, 'low': 0.427666123483614, 'medium': 0.29965598406663047}
example of probability of categorical attribute 'marital.status': {'Divorced': 0.05781228337723555, 'Married-AF-spouse': 0.0011091085539997228

In [830]:
file= pd.read_csv("../Downloads/test.csv")
file['age']=pd.qcut(file['age'], q=3,labels=['low','medium', 'high'])
file['education.num']=pd.qcut(file['education.num'], q=3,labels=['low','medium', 'high'])
file['fnlwgt']=pd.qcut(file['fnlwgt'], q=3,labels=['low','medium', 'high'])
file['hours.per.week']=pd.cut(file['hours.per.week'],bins=3,labels=['low','medium', 'high'])
file['capital.loss']=pd.cut(file['capital.loss'],bins=3,labels=['low','medium', 'high'])
file['capital.gain']=pd.cut(file['capital.gain'],bins=3,labels=['low','medium', 'high'])
file.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,low,Private,high,11th,low,Never-married,Machine-op-inspct,Own-child,Black,Male,low,low,medium,United-States,<=50K
1,medium,Private,low,HS-grad,low,Married-civ-spouse,Farming-fishing,Husband,White,Male,low,low,medium,United-States,<=50K
2,low,Local-gov,high,Assoc-acdm,high,Married-civ-spouse,Protective-serv,Husband,White,Male,low,low,medium,United-States,>50K
3,medium,Private,medium,Some-college,medium,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,low,low,medium,United-States,>50K
4,low,?,low,Some-college,medium,Never-married,?,Own-child,White,Female,low,low,low,United-States,<=50K


In [831]:
file.replace('?', np.NaN, inplace=True)
file=file.dropna()
file['income'].replace(["<=50K", ">50K"],[0,1], inplace=True)

In [832]:
predicted=[]
BuildAndTest(df,file)

example of probability of continuous attribute 'age': {'high': 0.2714805825242718, 'low': 0.4286003236245955, 'medium': 0.2999190938511327}
example of probability of categorical attribute 'marital.status': {'Divorced': 0.05904859074097692, 'Married-AF-spouse': 0.0012753475322025251, 'Married-civ-spouse': 0.8534625685499299, 'Married-spouse-absent': 0.004336181609488586, 'Never-married': 0.06261956383114399, 'Separated': 0.008417293712536666, 'Widowed': 0.010840454023721463}


In [833]:
print(predicted)

[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 

In [834]:
pred=pd.DataFrame(predicted)
pred.replace([0, 1],['<=50K', '>50K'], inplace=True)
print(pred)

           0
0      <=50K
1      <=50K
2       >50K
3      <=50K
4      <=50K
...      ...
15055  <=50K
15056  <=50K
15057   >50K
15058  <=50K
15059   >50K

[15060 rows x 1 columns]


In [835]:
accuracymeasure(file)

Confusion matrix showing TP FP FN TN
 [[9456 1904]
 [ 935 2765]]
Accuracy of your model is 0.8114873837981408
Your precision score is : 0.5922038980509745
Your recall score is : 0.7472972972972973
Your F1 score is : 0.6607718962839049
