In [1]:
# https://machinelearningmastery.com/imbalanced-classification-with-the-adult-income-dataset/

import pandas as pd

ad = pd.read_csv('adultKNN.csv')

ad.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [2]:
ad.shape

(48842, 15)

In [3]:
ad.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [4]:
ad.isnull().sum() # No Nulls hence find value counts n replace the ? with NAN

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [7]:
# Rename the colnames for simplicity

ad.rename(columns = {'educational-num': 'educationalnum', 'marital-status': 'maritalstatus'}, inplace = True)
ad.rename(columns = {'capital-gain': 'capitalgain', 'capital-loss': 'capitalloss', 
                     'hours-per-week':'hoursperweek', 'native-country':'nativecountry'}, 
          inplace = True)
ad.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educationalnum',
       'maritalstatus', 'occupation', 'relationship', 'race', 'gender',
       'capitalgain', 'capitalloss', 'hoursperweek', 'nativecountry',
       'income'],
      dtype='object')

In [8]:
ad.workclass.value_counts() # '?'
ad.education.value_counts()
ad.occupation.value_counts()# '?'
ad.fnlwgt.value_counts()
ad.educationalnum.value_counts() # Num of years edu persued
ad.maritalstatus.value_counts()
ad.relationship.value_counts()
ad.race.value_counts()
ad.gender.value_counts()
ad.capitalgain.value_counts()
ad.capitalloss.value_counts()
ad.hoursperweek.value_counts()
ad.nativecountry.value_counts()# '?'
ad.income.value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [9]:
ad.columns[ad.isin({'?'}).sum() > 0]

Index(['workclass', 'occupation', 'nativecountry'], dtype='object')

In [10]:
# Lets convert ? to Null
import numpy as np

ad = ad.replace({'?' : np.nan})

In [11]:
ad.columns[ad.isin({'?'}).sum() > 0] # REturns zero values

ad.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
educationalnum       0
maritalstatus        0
occupation        2809
relationship         0
race                 0
gender               0
capitalgain          0
capitalloss          0
hoursperweek         0
nativecountry      857
income               0
dtype: int64

In [12]:
#ad.occupation.value_counts() # Other-service
#ad.workclass.value_counts() # Private
ad.nativecountry.value_counts() # Unite

United-States                 43832
Mexico                          951
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                             46
Ecuador                     

In [13]:
ad.workclass.value_counts()
ad.occupation.value_counts()

wt = pd.DataFrame({"Workclass":ad.workclass, 'Occ' : ad.occupation, 'Nat':ad.nativecountry, 'Wts':ad.fnlwgt})
wt.sort_values('Wts', ascending = False)

Unnamed: 0,Workclass,Occ,Nat,Wts
7974,Private,Exec-managerial,United-States,1490400
30730,Private,Exec-managerial,United-States,1484705
34419,Private,Craft-repair,United-States,1455435
33020,Private,Other-service,United-States,1366120
31850,Private,Tech-support,United-States,1268339
...,...,...,...,...
1060,Private,Prof-specialty,United-States,13769
7668,Private,Exec-managerial,United-States,13769
43488,Private,Machine-op-inspct,United-States,13769
16044,State-gov,Protective-serv,United-States,13492


In [14]:
ad.occupation.fillna('Other-service', inplace=True)
ad.workclass.fillna('Private', inplace=True)
ad.nativecountry.fillna('United-States', inplace=True)

In [15]:
ad.columns[ad.dtypes == 'object']

Index(['workclass', 'education', 'maritalstatus', 'occupation', 'relationship',
       'race', 'gender', 'nativecountry', 'income'],
      dtype='object')

In [16]:
ad.relationship.value_counts()

Husband           19716
Not-in-family     12583
Own-child          7581
Unmarried          5125
Wife               2331
Other-relative     1506
Name: relationship, dtype: int64

In [17]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [18]:
ad.workclass = le.fit_transform(ad.workclass)
ad.education = le.fit_transform(ad.education)
ad.maritalstatus = le.fit_transform(ad.maritalstatus)
ad.occupation = le.fit_transform(ad.occupation)
ad.relationship = le.fit_transform(ad.relationship)
ad.race = le.fit_transform(ad.race)
ad.gender = le.fit_transform(ad.gender)
ad.nativecountry = le.fit_transform(ad.nativecountry)
ad.income = le.fit_transform(ad.income) # (<=50K : 0, >50K : 1)

In [19]:
ad.columns[ad.dtypes == 'object']

Index([], dtype='object')

In [20]:
ad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             48842 non-null  int64
 1   workclass       48842 non-null  int32
 2   fnlwgt          48842 non-null  int64
 3   education       48842 non-null  int32
 4   educationalnum  48842 non-null  int64
 5   maritalstatus   48842 non-null  int32
 6   occupation      48842 non-null  int32
 7   relationship    48842 non-null  int32
 8   race            48842 non-null  int32
 9   gender          48842 non-null  int32
 10  capitalgain     48842 non-null  int64
 11  capitalloss     48842 non-null  int64
 12  hoursperweek    48842 non-null  int64
 13  nativecountry   48842 non-null  int32
 14  income          48842 non-null  int32
dtypes: int32(9), int64(6)
memory usage: 3.9 MB


In [21]:
ad.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationalnum,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry,income
0,25,3,226802,1,7,4,6,3,2,1,0,0,40,38,0
1,38,3,89814,11,9,2,4,0,4,1,0,0,50,38,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,38,1
3,44,3,160323,15,10,2,6,0,2,1,7688,0,40,38,1
4,18,3,103497,15,10,4,7,3,4,0,0,0,30,38,0


In [30]:
# Model building for Logistic regression

# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8

from sklearn.model_selection import train_test_split

ad_train, ad_test = train_test_split(ad, test_size = .2)

ad_train_x = ad_train.iloc[ : , 0:-1] 
ad_train_y = ad_train.iloc[:, -1]

ad_test_x = ad_test.iloc[ : , 0:-1] 
ad_test_y = ad_test.iloc[:, -1]

ad_test_x.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationalnum,maritalstatus,occupation,relationship,race,gender,capitalgain,capitalloss,hoursperweek,nativecountry
32251,32,3,207668,11,9,2,2,0,4,1,0,0,50,38
47268,18,3,194561,15,10,4,7,3,4,1,0,0,37,38
1780,65,3,258973,15,10,6,7,1,4,0,401,0,14,38
21481,43,0,144778,9,13,4,3,1,4,1,0,0,40,38
31849,31,3,115488,11,9,2,4,0,4,1,0,0,40,38


In [24]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

In [31]:
logreg.fit(ad_train_x, ad_train_y)

LogisticRegression()

In [32]:
ad_pred_y = logreg.predict(ad_test_x)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(ad_test_x, ad_test_y)))

Accuracy of logistic regression classifier on test set: 0.79


In [33]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(ad_test_y, ad_pred_y)
print(confusion_matrix)

[[7028  373]
 [1693  675]]


In [35]:
from sklearn.metrics import classification_report

print(classification_report(ad_test_y, ad_pred_y))

              precision    recall  f1-score   support

           0       0.81      0.95      0.87      7401
           1       0.64      0.29      0.40      2368

    accuracy                           0.79      9769
   macro avg       0.72      0.62      0.63      9769
weighted avg       0.77      0.79      0.76      9769



In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()