In [64]:
# Our target is to predict the income class of an individual using Logistic Regression.
# The "Adult" dataset was obtained from Kaggle.

In [65]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

In [66]:
# Importing the dataset.
# The CSV location can be overridden with the ADULT_CSV_PATH environment variable so the
# notebook is not tied to a single machine; the original absolute path is kept as the
# default so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'ADULT_CSV_PATH',
    'C:\\Users\\ROOPKATHA\\Documents\\CV prep\\Income class prediction using Logistic Regression\\adult.csv',
)
df = pd.read_csv(DATA_PATH)

Exploring the dataset

In [67]:
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [68]:
# Dimensions of the frame as (rows, columns).
df.shape

(32561, 15)

In [69]:
# Per-column summary: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [70]:
# List the column labels.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [71]:
# Checking for null values: count the missing entries in each column.
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [72]:
# Number of distinct values in each column.
df.nunique()

age                  73
workclass             9
fnlwgt            21648
education            16
education.num        16
marital.status        7
occupation           15
relationship          6
race                  5
sex                   2
capital.gain        119
capital.loss         92
hours.per.week       94
native.country       42
income                2
dtype: int64

In [73]:
# Replace the '?' placeholders with NaN so that pandas treats them as missing values.
# `np.nan` is used because the `np.NAN` alias was removed in NumPy 2.0, and `replace`
# returns a new frame instead of writing through a boolean mask in place.
df = df.replace('?', np.nan)

In [74]:
# Re-count missing values per column now that '?' has been replaced with NaN.
df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [75]:
# Inspect the dtype of each column.
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [76]:
# workclass, occupation and native.country have null values in them,
# and all of them are categorical variables (objects).
#
# We can impute the null values using the modal class
# of each of these variables.
#
# (Written as comments rather than a bare string literal, which the notebook
# would otherwise echo back as cell output.)

In [77]:
# Impute the missing entries of each categorical column with that column's most
# frequent value (its mode).
# The filled Series is assigned back to the frame: calling
# `df[col].fillna(..., inplace=True)` modifies an intermediate object, which pandas
# deprecates and which leaves `df` unchanged when Copy-on-Write is enabled.
for col in ['workclass', 'occupation', 'native.country']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [78]:
# Re-count missing values per column to verify the imputation.
df.isnull().sum()

# null values are imputed: every column now reports zero missing entries.

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [79]:
# Separate the predictors from the target.
# 'income' is the target variable; every other column is a feature (predictor).

Y = df['income']                # target variable.
X = df.drop(columns='income')   # feature set.

In [80]:
# Preview the feature set (all columns except 'income').
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States


In [81]:
# Preview the target variable.
Y.head()

0    <=50K
1    <=50K
2    <=50K
3    <=50K
4    <=50K
Name: income, dtype: object

In [82]:
# Splitting the data into training and test sets:
# 70% of the rows are used for training and the remaining 30% for testing.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,     # hold out 30% of the rows for testing
    random_state=0,    # fixed seed so the split is reproducible
)

In [83]:
# Now we need to encode the categorical variables and scale the numeric variables.
# All the categorical variables, except education, are nominal in nature,
# therefore one-hot encoding is chosen for every categorical variable (majority wins).
# For the numeric variables the standard scaler is used, since one-hot encoding
# is applied to the categorical variables in this dataset,
# which means all the encoded values are in the range 0-1.
#
# (Written as comments rather than a bare string literal, which the notebook
# would otherwise echo back as cell output.)

In [84]:
from sklearn import preprocessing

In [85]:
# Categorical variables: one-hot encode them and swap the encoded columns in for the
# original ones in both the training and the test feature sets.
from sklearn import preprocessing

categorical = ['workclass', 'education', 'marital.status', 'occupation',
               'relationship', 'race', 'sex', 'native.country']

# handle_unknown='ignore': a category that occurs only in the test split is encoded as
# an all-zero group instead of raising an error in transform().
# The encoder is left at its default (sparse) output and densified with .toarray(),
# because the keyword that requests dense output was renamed from `sparse` to
# `sparse_output` across scikit-learn releases. A dense array is fine here since most
# of the categorical variables have low cardinality (<15).
one_hot = preprocessing.OneHotEncoder(handle_unknown='ignore')

# The encoder is fitted on the training split only and then applied to the test split.
X_train_encoded = one_hot.fit_transform(X_train[categorical]).toarray()
X_test_encoded = one_hot.transform(X_test[categorical]).toarray()

# Column names for the one-hot encoded features (e.g. 'workclass_Private').
encoded_columns = one_hot.get_feature_names_out(categorical)

# Wrap the encoded arrays in DataFrames that keep the original row index, so the
# column-wise concatenation below aligns row by row.
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=X_test.index)

# Drop the original categorical columns and append the encoded ones (column-wise).
# drop(columns=...) returns a new frame, so nothing is modified in place.
X_train = pd.concat([X_train.drop(columns=categorical), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical), X_test_encoded_df], axis=1)

In [87]:
# Numeric features: standardise them with statistics learned from the training split.

from sklearn.preprocessing import StandardScaler

numeric = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]

# Learn the scaling parameters from the training data only.
z = StandardScaler().fit(X_train[numeric])

# Apply the same fitted scaler to the training data ...
X_train[numeric] = z.transform(X_train[numeric])

# ... and to the test data.
X_test[numeric] = z.transform(X_test[numeric])

In [88]:
# Preview the training features after encoding and scaling.
X_train.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
32098,0.101484,-1.494279,1.133894,-0.145189,-0.217407,-1.662414,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25206,0.028248,0.438778,-0.423425,-0.145189,-0.217407,-0.200753,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23491,0.247956,0.045292,-0.034095,-0.145189,-0.217407,-0.038346,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12367,-0.850587,0.793152,-0.423425,-0.145189,-0.217407,-0.038346,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7054,-0.044989,-0.853275,1.523223,-0.145189,-0.217407,-0.038346,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [89]:
# Fit a Logistic Regression model on the training split and predict the test split.
# NOTE(review): warnings are silenced globally at the top of the notebook, so a
# convergence warning from the default solver settings would not be visible here —
# confirm that the solver converged.

from sklearn.linear_model import LogisticRegression

# fit() returns the fitted estimator, so construction and training are chained.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_predicted = logreg.predict(X_test)


In [109]:
# Evaluate how well the predictions match the held-out labels using
# accuracy, precision, recall and F1-score.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# The target holds the strings '<=50K' and '>50K' rather than 0/1, so the class that
# counts as "positive" has to be named explicitly for precision, recall and F1.
pos_label = '>50K'

accuracy = accuracy_score(Y_test, Y_predicted)
precision, recall, F1_score = (
    metric(Y_test, Y_predicted, pos_label=pos_label)
    for metric in (precision_score, recall_score, f1_score)
)

# Report every metric as a percentage.
for label, value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1-Score:', F1_score),
):
    print(label, value*100)


Accuracy: 84.57365134609479
Precision: 72.39747634069401
Recall: 58.372191606612965
F1-Score: 64.63271532504106


In [None]:
# Since the data is already given to us (i.e. it is real-world data), recall is treated here as a more
# appropriate performance metric than precision.
# The recall of about 58% means the model correctly identifies roughly 58% of the individuals who actually
# earn more than 50K; note that this is not the same as accuracy, which is about 85%.