In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import ExtraTreeClassifier

In [2]:
# Column names are supplied explicitly via `names=`, so the first line of the
# file is read as data rather than as a header.
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                'marital_status', 'occupation', 'relationship', 'race', 'sex',
                'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'class']

# Location of the raw file. Set the ADULT_DATA_PATH environment variable to run
# the notebook on another machine; the original local path remains the default.
DATA_PATH = os.environ.get('ADULT_DATA_PATH', r'C:\Users\SHOCKER\Downloads\adult.data')

# ' ?' (with the leading space) is parsed as NaN. String values are not
# stripped here, so later cells compare against values such as ' <=50K'.
data = pd.read_csv(DATA_PATH, names=column_names, na_values=' ?')

In [3]:
# Preview the first rows to confirm the column names line up with the values.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Per-column count of missing values (the ' ?' placeholders parsed as NaN on load).
data.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
class                0
dtype: int64

In [5]:
# Dtypes and non-null counts for every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  31978 non-null  object
 14  class           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
# Discard every row that has at least one missing value.
# NOTE: this rebinds `data`; the unfiltered frame is only recoverable by
# re-running the load cell.
data = data.dropna()

In [7]:
# Total number of missing cells remaining; expected to be 0 after dropna().
data.isnull().sum().sum()

0

In [8]:
# Print the distinct values of each column, one array per column, in column order.
for column_name, column_values in data.items():
    print(column_values.unique())

[39 50 38 53 28 37 49 52 31 42 30 23 32 34 25 43 40 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 47 46 36 79 27 18 33 76 55 61 70 64 71 66 51 58
 26 17 60 90 75 65 77 62 63 67 74 72 69 68 73 81 78 88 80 84 83 85 82 86]
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' Self-emp-inc' ' Without-pay']
[ 77516  83311 215646 ...  84661 257302 201490]
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' 7th-8th' ' Doctorate' ' Assoc-voc' ' Prof-school'
 ' 5th-6th' ' 10th' ' Preschool' ' 12th' ' 1st-4th']
[13  9  7 14  5 10 12  4 16 11 15  3  6  1  8  2]
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
[' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Transport-moving' ' Farming-fishing'
 ' Machine-op-inspct' ' Tech-support' ' Craft-repair' ' Protective-serv'
 ' Armed-Forces' ' Priv-house-serv']
[' Not-in-fami

In [9]:
# Split off the target: `pop` removes the 'class' column from `data` in place
# and returns it as a Series.
# NOTE: not re-runnable on its own -- a second run raises KeyError because the
# column is already gone from `data`.
labels = data.pop('class')

In [30]:
# Class balance of the target, computed with a vectorized comparison instead of
# a Python-level loop over the Series.
# `count` is the number of ' <=50K' rows; `count2` is every other row.
is_low_income = labels == ' <=50K'
count = int(is_low_income.sum())
count2 = int((~is_low_income).sum())
print('Total observations =', count + count2)
print('Number of income <=50K =', count)
print('Number of income >50K =', count2)

Total observations = 30162
Number of income <=50K = 22654
Number of income >50K = 7508


In [10]:
# Shared encoder instance; it is re-fitted on every call to `preprocessing`.
Le = LabelEncoder()


def preprocessing(data):
    """Encode the categorical feature columns as numbers.

    The nominal columns listed below are expanded into one indicator column
    per distinct value with ``pd.get_dummies``; ``sex`` is replaced by the
    integer codes produced by the module-level ``LabelEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the nominal columns listed below and ``sex``.

    Returns
    -------
    pandas.DataFrame
        The encoded frame produced by ``pd.get_dummies``, with ``sex``
        replaced by integer codes.
    """
    nominal_columns = [
        'workclass',
        'education',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'native_country',
    ]
    encoded = pd.get_dummies(data, columns=nominal_columns)
    encoded['sex'] = Le.fit_transform(encoded['sex'])
    return encoded

In [11]:
# Replace `data` with its encoded version (one-hot nominal columns, integer-coded `sex`).
# NOTE(review): because `data` is rebound, re-running this cell on the
# already-encoded frame is expected to fail -- the categorical columns that
# `preprocessing` asks get_dummies to expand no longer exist.
data = preprocessing(data)

In [12]:
# Draw a reproducible 90% random sample of rows for training; every row whose
# index label was not sampled becomes the test set.
train_dataset = data.sample(frac=0.9, random_state=0)
held_out_mask = ~data.index.isin(train_dataset.index)
test_dataset = data.loc[held_out_mask]

In [13]:
# Select the target values by index label so they line up row-for-row with
# each feature split.
train_labels = labels.loc[train_dataset.index]
test_labels = labels.loc[test_dataset.index]

In [14]:
# Sanity check: features and labels of each split must have the same row count.
for split in (train_dataset, test_dataset, train_labels, test_labels):
    print(split.shape)

(27146, 103)
(3016, 103)
(27146,)
(3016,)


In [15]:
# Map the income strings to a binary target (0 for ' <=50K', 1 for ' >50K').
CLASS_TO_TARGET = {' <=50K': 0, ' >50K': 1}
encoded_train_labels = train_labels.map(CLASS_TO_TARGET)
encoded_test_labels = test_labels.map(CLASS_TO_TARGET)

# Series.map yields NaN for any value missing from the dict. That happens for
# an unexpected label string, and also when this cell is run a second time on
# labels that are already 0/1. Validate before rebinding so a bad run fails
# loudly and leaves the existing labels untouched.
if encoded_train_labels.isna().any() or encoded_test_labels.isna().any():
    raise ValueError(
        "Labels contain values other than ' <=50K' / ' >50K' "
        "(was this cell already run?)"
    )

train_labels = encoded_train_labels
test_labels = encoded_test_labels

In [16]:
def accuracy(model):
    """Fit ``model`` on the training split and report train/test accuracy.

    Reads the notebook-level ``train_dataset``, ``train_labels``,
    ``test_dataset`` and ``test_labels``. The model is fitted in place.

    Parameters
    ----------
    model : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(accuracy_train, accuracy_test)`` as returned by ``accuracy_score``.
    """
    model.fit(train_dataset, train_labels)
    tr_predictions = model.predict(train_dataset)
    te_predictions = model.predict(test_dataset)
    # accuracy_score is documented as (y_true, y_pred): ground truth goes first.
    accuracy_train = accuracy_score(train_labels, tr_predictions)
    accuracy_test = accuracy_score(test_labels, te_predictions)
    print(f' Accuracy on training data - {accuracy_train}')
    print(f' Accuracy on testing data - {accuracy_test}')
    return accuracy_train, accuracy_test

In [17]:
# Baseline: k-nearest-neighbours with scikit-learn's default settings.
# NOTE(review): the features are passed in unscaled, so large-valued columns
# such as `fnlwgt` likely dominate the distance computation -- consider
# standardizing before KNN and confirm the effect on test accuracy.
model = KNeighborsClassifier()
accuracy(model)

 Accuracy on training data - 0.8294408015913947
 Accuracy on testing data - 0.763262599469496


(0.8294408015913947, 0.763262599469496)

In [18]:
# A single randomized decision tree (sklearn.tree.ExtraTreeClassifier).
# The tree is built with randomness, so it is seeded to make the reported
# accuracies reproducible across runs (0 matches the seed used for the split).
# NOTE(review): if the ensemble was intended, that is
# sklearn.ensemble.ExtraTreesClassifier -- confirm.
model2 = ExtraTreeClassifier(random_state=0)
accuracy(model2)

 Accuracy on training data - 0.9999631621601709
 Accuracy on testing data - 0.7821618037135278


(0.9999631621601709, 0.7821618037135278)

In [19]:
# Gradient-boosted trees, with classification error as the evaluation metric.
# NOTE(review): `use_label_encoder` appears to be deprecated in newer XGBoost
# releases -- confirm against the installed version and drop it if unused.
model3 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='error')
accuracy(model3)

 Accuracy on training data - 0.9002062919030428
 Accuracy on testing data - 0.8696949602122016


(0.9002062919030428, 0.8696949602122016)