In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold

warnings.filterwarnings('ignore')

In [36]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [37]:
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [38]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [39]:
df['marital-status'].replace(['Never-married','Widowed', 'Divorced','Separated'],0, inplace=True)
df['marital-status'].replace(['Married-civ-spouse','Married-spouse-absent', 'Married-AF-spouse'],1, inplace=True)

df['gender'].replace(['Male', 'Female'],[0,1],inplace=True)

In [40]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,0,Machine-op-inspct,Own-child,Black,0,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,1,Farming-fishing,Husband,White,0,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,1,Protective-serv,Husband,White,0,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,1,Machine-op-inspct,Husband,Black,0,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,0,?,Own-child,White,1,0,0,30,United-States,<=50K


In [41]:
x = df.drop(labels=['workclass','education','occupation', 'relationship', 'race','native-country',
       'income'], axis=1)

x.head()

Unnamed: 0,age,fnlwgt,educational-num,marital-status,gender,capital-gain,capital-loss,hours-per-week
0,25,226802,7,0,0,0,0,40
1,38,89814,9,1,0,0,0,50
2,28,336951,12,1,0,0,0,40
3,44,160323,10,1,0,7688,0,40
4,18,103497,10,0,1,0,0,30


In [44]:
y = df[['income']]


In [45]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [47]:
models = []
models.append(('LR',LogisticRegression()))
models.append(('KNN',KNeighborsClassifier()))

In [51]:
result = dict()
for name, model in models:
    kfold = KFold(n_splits=10, random_state=7)
    cv_result = cross_val_score(model,X_train, y_train, cv=kfold, scoring='accuracy')
    result[name] = (cv_result.mean(), cv_result.std())

    
print("name   results.mean   results.std")
for key, value in result.items():
    print(key,value)

name   results.mean   results.std
LR (0.79676465803315, 0.007893447863968865)
KNN (0.7758296593338624, 0.00965507224106801)
