# Adult_Income - Logistic Regression

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Data/adult-income-new.csv', header=None, na_values='?', names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'])
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
df.isin([' ?']).sum(axis=0)

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [4]:
for i in ['workclass', 'occupation', 'native-country']:
    df[i] = df[i].replace(' ?', np.nan)

In [5]:
df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
df.shape

(32561, 15)

In [8]:
# Dropping the missing values
df.dropna(inplace=True) 

In [9]:
df.shape

(30162, 15)

In [10]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [11]:
# Label Encoder: Convert categorical to numeric

from sklearn.preprocessing import LabelEncoder
for i in ['income', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']:
    df[i] = LabelEncoder().fit_transform(df[i])

In [12]:
# Dropping unwanted columns
df.drop(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'], axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,income
0,5,9,4,0,1,4,1,0
1,4,9,2,3,0,4,1,0
2,2,11,0,5,1,4,1,0
3,2,1,2,5,0,2,1,0
4,2,9,2,9,5,2,0,0


In [14]:
df.describe()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,income
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,2.199324,10.333764,2.580134,5.95985,1.418341,3.678602,0.675685,0.248922
std,0.953925,3.812292,1.498016,4.029566,1.601338,0.834709,0.468126,0.432396
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,9.0,2.0,2.0,0.0,4.0,0.0,0.0
50%,2.0,11.0,2.0,6.0,1.0,4.0,1.0,0.0
75%,2.0,12.0,4.0,9.0,3.0,4.0,1.0,0.0
max,6.0,15.0,6.0,13.0,5.0,4.0,1.0,1.0


In [15]:
df.skew() 

workclass         1.154858
education        -0.944948
marital-status   -0.009453
occupation        0.110450
relationship      0.820311
race             -2.507036
sex              -0.750636
income            1.161408
dtype: float64

In [16]:
from scipy.stats import zscore

z = abs(zscore(df))

print(df.shape)

df_final = df.loc[(z<3).all(axis=1)]
df_final.shape

(30162, 8)


(28968, 8)

In [17]:
df_final.describe()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,income
count,28968.0,28968.0,28968.0,28968.0,28968.0,28968.0,28968.0,28968.0
mean,2.197425,10.329502,2.575117,5.958161,1.410246,3.797604,0.676333,0.249448
std,0.946564,3.822357,1.50155,4.031315,1.598177,0.596545,0.467883,0.432701
min,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
25%,2.0,9.0,2.0,2.0,0.0,4.0,0.0,0.0
50%,2.0,11.0,2.0,6.0,1.0,4.0,1.0,0.0
75%,2.0,12.0,4.0,9.0,3.0,4.0,1.0,0.0
max,5.0,15.0,6.0,13.0,5.0,4.0,1.0,1.0


In [18]:
x = df_final.iloc[:, :-1]
y = df_final.iloc[:, -1]

In [19]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

xx = sc.fit_transform(x)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

max_acc_scr = 0
for r_st in range(70, 80):
    x_train, x_test, y_train, y_test = train_test_split(xx, y, test_size = 0.3, random_state = r_st)
    reg = LogisticRegression()
    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)
    
    acc_scr = accuracy_score(y_test, y_pred)
    
    if max_acc_scr < acc_scr:
        max_acc_scr = acc_scr
        final_r_st = r_st
print('Max accuracy_score %s according to random state : %s' %(max_acc_scr, r_st))

Max accuracy_score 0.7534230813485214 according to random state : 79


In [21]:
from sklearn.model_selection import cross_val_score

print('Mean: %s' % cross_val_score(LogisticRegression(), xx, y, cv=10, scoring='accuracy').mean())

Mean: 0.7455469746756502


In [22]:
x_train, x_test, y_train, y_test = train_test_split(xx, y, test_size = 0.3, random_state = 79)
lg = LogisticRegression()
lg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
acc_scr = accuracy_score(y_test, y_pred)

In [23]:
print('Confusion Matrix : \n', confusion_matrix(y_test, y_pred))
print('\nClassification Report : \n', classification_report(y_test, y_pred))

Confusion Matrix : 
 [[6441   49]
 [2190   11]]

Classification Report : 
               precision    recall  f1-score   support

           0       0.75      0.99      0.85      6490
           1       0.18      0.00      0.01      2201

    accuracy                           0.74      8691
   macro avg       0.46      0.50      0.43      8691
weighted avg       0.60      0.74      0.64      8691



In [24]:
import joblib

joblib.dump(lg, 'lr_adult-income.pkl')

['lr_adult-income.pkl']