In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Importing dataset
dataset = pd.read_csv('https://raw.githubusercontent.com/Aditya-Mankar/Census-Income-Prediction/master/adult.csv')

In [3]:
# Preview dataset
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


#### Preprocessing

In [4]:
dataset = dataset.replace('?', np.nan)

In [6]:
# Checking null values
dataset.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [7]:
columns_with_nan = ['workclass', 'occupation', 'native.country']

In [8]:
for col in columns_with_nan:
    dataset[col].fillna(dataset[col].mode()[0], inplace=True)

#### Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
encoder = LabelEncoder()
for col in dataset.columns:
    if dataset[col].dtypes == 'object':
        dataset[col] = encoder.fit_transform(dataset[col])

In [11]:
X = dataset.drop('income', axis=1)
Y = dataset['income']

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

#### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

log_reg.fit(X_train, Y_train)
Y_pred_log_reg = log_reg.predict(X_test)

#### Decision Tree

In [15]:
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()

dec_tree.fit(X_train, Y_train)
Y_pred_dec_tree = dec_tree.predict(X_test)

#### Random FOrest

In [16]:
from sklearn.ensemble import RandomForestClassifier
ran_for = RandomForestClassifier(n_estimators=10)

ran_for.fit(X_train, Y_train)
Y_pred_ran_for = ran_for.predict(X_test)

#### Model Evaluation

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [18]:
print('Logistic Regression:')
print('Accuracy score:', round(accuracy_score(Y_test, Y_pred_log_reg) * 100, 2))
print('F1 score:', round(f1_score(Y_test, Y_pred_log_reg) * 100, 2))

Logistic Regression:
Accuracy score: 79.2
F1 score: 38.55


In [19]:
print('Decision Tree Classifier:')
print('Accuracy score:', round(accuracy_score(Y_test, Y_pred_dec_tree) * 100, 2))
print('F1 score:', round(f1_score(Y_test, Y_pred_dec_tree) * 100, 2))

Decision Tree Classifier:
Accuracy score: 80.87
F1 score: 59.94


In [20]:
print('Random Forest Classifier:')
print('Accuracy score:', round(accuracy_score(Y_test, Y_pred_ran_for) * 100, 2))
print('F1 score:', round(f1_score(Y_test, Y_pred_ran_for) * 100, 2))

Random Forest Classifier:
Accuracy score: 84.86
F1 score: 63.99


#### Hyperparameter Tuning

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [27]:
n_estimators = [5,10,15,20,50]
max_depth = [3,4,5]

In [28]:
param_dist = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
}

In [31]:
rf_tuned = RandomForestClassifier()
rf_cv = RandomizedSearchCV(rf_tuned, param_dist, n_iter=20, cv=5)
#rf_cv.fit(X, Y)
rf_cv.fit(X_train, Y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [32]:
rf_cv.best_params_

{'n_estimators': 20, 'max_depth': 5}

In [33]:
Y_pred_rf_best = rf_cv.predict(X_test)

In [34]:
print('Random Forest Classifier:')
print('Accuracy score:', round(accuracy_score(Y_test, Y_pred_rf_best) * 100, 2))
print('F1 score:', round(f1_score(Y_test, Y_pred_rf_best) * 100, 2))

Random Forest Classifier:
Accuracy score: 84.69
F1 score: 58.72


In [35]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred_rf_best))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91      4976
           1       0.81      0.46      0.59      1537

    accuracy                           0.85      6513
   macro avg       0.83      0.71      0.75      6513
weighted avg       0.84      0.85      0.83      6513

