In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [3]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educationnum,maritalstatus,Occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.drop_duplicates(keep=False,inplace=True)

In [5]:
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]

In [6]:
Y

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: salary, Length: 32514, dtype: object

In [7]:
## from this education-num have high correlation with hours-per-week so just take one feature for purpose..
X.drop(columns='hoursperweek',inplace=True)

In [8]:
numerical_columns = X.select_dtypes(exclude=['object']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

In [9]:
numerical_columns

Index(['age', 'fnlwgt', 'educationnum', 'capitalgain', 'capitalloss'], dtype='object')

In [10]:
categorical_columns

Index(['workclass', 'education', 'maritalstatus', 'Occupation', 'relationship',
       'race', 'sex', 'country'],
      dtype='object')

In [11]:
## ENCODING..
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

In [12]:
for column_name, dtype in X.dtypes.items():
    if dtype == 'object':
        unique_values = ', '.join(X[column_name].unique())
        print('{}: {}'.format(column_name, unique_values))

workclass:  State-gov,  Self-emp-not-inc,  Private,  Federal-gov,  Local-gov,  ?,  Self-emp-inc,  Without-pay,  Never-worked
education:  Bachelors,  HS-grad,  11th,  Masters,  9th,  Some-college,  Assoc-acdm,  Assoc-voc,  7th-8th,  Doctorate,  Prof-school,  5th-6th,  10th,  1st-4th,  Preschool,  12th
maritalstatus:  Never-married,  Married-civ-spouse,  Divorced,  Married-spouse-absent,  Separated,  Married-AF-spouse,  Widowed
Occupation:  Adm-clerical,  Exec-managerial,  Handlers-cleaners,  Prof-specialty,  Other-service,  Sales,  Craft-repair,  Transport-moving,  Farming-fishing,  Machine-op-inspct,  Tech-support,  ?,  Protective-serv,  Armed-Forces,  Priv-house-serv
relationship:  Not-in-family,  Husband,  Wife,  Own-child,  Unmarried,  Other-relative
race:  White,  Black,  Asian-Pac-Islander,  Amer-Indian-Eskimo,  Other
sex:  Male,  Female
country:  United-States,  Cuba,  Jamaica,  India,  ?,  Mexico,  South,  Puerto-Rico,  Honduras,  England,  Canada,  Germany,  Iran,  Philippines,

In [13]:
encoded_x = ohe.fit_transform(X[categorical_columns]).toarray()
ohe.categories_

[array([' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private',
        ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay'],
       dtype=object),
 array([' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th',
        ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate',
        ' HS-grad', ' Masters', ' Preschool', ' Prof-school',
        ' Some-college'], dtype=object),
 array([' Divorced', ' Married-AF-spouse', ' Married-civ-spouse',
        ' Married-spouse-absent', ' Never-married', ' Separated',
        ' Widowed'], dtype=object),
 array([' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair',
        ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners',
        ' Machine-op-inspct', ' Other-service', ' Priv-house-serv',
        ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support',
        ' Transport-moving'], dtype=object),
 array([' Husband', ' Not-in-family', ' Other-relative', ' Own-child',
        ' Unmarried'

In [21]:
feature_labels = ohe.categories_
## .ravel() used to change 2 dimensional array to flattened multi dimensional array
# Flatten each sub-array and concatenate them into a single arra
# Convert the flattened list to a NumPy array

In [22]:
all_values = [label for labels in feature_labels for label in labels]
print(all_values)

[' ?', ' Federal-gov', ' Local-gov', ' Never-worked', ' Private', ' Self-emp-inc', ' Self-emp-not-inc', ' State-gov', ' Without-pay', ' 10th', ' 11th', ' 12th', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' HS-grad', ' Masters', ' Preschool', ' Prof-school', ' Some-college', ' Divorced', ' Married-AF-spouse', ' Married-civ-spouse', ' Married-spouse-absent', ' Never-married', ' Separated', ' Widowed', ' ?', ' Adm-clerical', ' Armed-Forces', ' Craft-repair', ' Exec-managerial', ' Farming-fishing', ' Handlers-cleaners', ' Machine-op-inspct', ' Other-service', ' Priv-house-serv', ' Prof-specialty', ' Protective-serv', ' Sales', ' Tech-support', ' Transport-moving', ' Husband', ' Not-in-family', ' Other-relative', ' Own-child', ' Unmarried', ' Wife', ' Amer-Indian-Eskimo', ' Asian-Pac-Islander', ' Black', ' Other', ' White', ' Female', ' Male', ' ?', ' Cambodia', ' Canada', ' China', ' Columbia', ' Cuba', ' Dominican-Republic', ' Ecua

In [23]:
categorical_encoded = pd.DataFrame(encoded_x,columns=all_values)
categorical_encoded

Unnamed: 0,?,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,10th,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32509,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32510,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32511,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
32512,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
## standardization..
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numerical_columns])

In [None]:
features =[]
for feature in X[numerical_columns].dtypes.items():
    features.append(feature[0])

In [None]:
numerical_standard = pd.DataFrame(X_scaled,columns=features)
numerical_standard

Unnamed: 0,age,fnlwgt,educationnum,capitalgain,capitalloss
0,0.030139,-1.063545,1.134758,0.148137,-0.216823
1,0.836868,-1.008647,1.134758,-0.146028,-0.216823
2,-0.043199,0.245003,-0.421145,-0.146028,-0.216823
3,1.056885,0.425707,-1.199096,-0.146028,-0.216823
4,-0.776589,1.407975,1.134758,-0.146028,-0.216823
...,...,...,...,...,...
32509,-0.849928,0.639623,0.745782,-0.146028,-0.216823
32510,0.103478,-0.335445,-0.421145,-0.146028,-0.216823
32511,1.423580,-0.358788,-0.421145,-0.146028,-0.216823
32512,-1.216623,0.110899,-0.421145,-0.146028,-0.216823


In [None]:
categorical_encoded['age'] = numerical_standard['age']
categorical_encoded['fnlwgt'] = numerical_standard['fnlwgt']
categorical_encoded['educationnum'] = numerical_standard['educationnum']
categorical_encoded['capitalgain'] = numerical_standard['capitalgain']
categorical_encoded['capitalloss'] = numerical_standard['capitalloss']
categorical_encoded

Unnamed: 0,?,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,10th,...,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,age,fnlwgt,educationnum,capitalgain,capitalloss
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.030139,-1.063545,1.134758,0.148137,-0.216823
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.836868,-1.008647,1.134758,-0.146028,-0.216823
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.043199,0.245003,-0.421145,-0.146028,-0.216823
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.056885,0.425707,-1.199096,-0.146028,-0.216823
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.776589,1.407975,1.134758,-0.146028,-0.216823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32509,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-0.849928,0.639623,0.745782,-0.146028,-0.216823
32510,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.103478,-0.335445,-0.421145,-0.146028,-0.216823
32511,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.423580,-0.358788,-0.421145,-0.146028,-0.216823
32512,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-1.216623,0.110899,-0.421145,-0.146028,-0.216823


In [None]:
## The depedent feature only have 2 category of features so no need for encoding..

In [None]:
Y = np.array(Y)
Y

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [None]:
Y = Y.reshape(-1,1)
ohe = OneHotEncoder(sparse=False)
encoded_Y = ohe.fit_transform(Y)



In [None]:
encoded_Y = pd.DataFrame(encoded_Y)
encoded_Y = encoded_Y.rename(columns={0:'<=50K',1:'>50K'})
encoded_Y

Unnamed: 0,<=50K,>50K
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
32509,1.0,0.0
32510,0.0,1.0
32511,1.0,0.0
32512,1.0,0.0


In [None]:
categorical_encoded['<=50K'] = encoded_Y['<=50K']
categorical_encoded['>50K'] = encoded_Y['>50K']

In [None]:
## The overall output of the preprocessed model..
categorical_encoded

Unnamed: 0,?,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,10th,...,United-States,Vietnam,Yugoslavia,age,fnlwgt,educationnum,capitalgain,capitalloss,<=50K,>50K
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.030139,-1.063545,1.134758,0.148137,-0.216823,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.836868,-1.008647,1.134758,-0.146028,-0.216823,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-0.043199,0.245003,-0.421145,-0.146028,-0.216823,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.056885,0.425707,-1.199096,-0.146028,-0.216823,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.776589,1.407975,1.134758,-0.146028,-0.216823,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32509,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-0.849928,0.639623,0.745782,-0.146028,-0.216823,1.0,0.0
32510,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.103478,-0.335445,-0.421145,-0.146028,-0.216823,0.0,1.0
32511,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.423580,-0.358788,-0.421145,-0.146028,-0.216823,1.0,0.0
32512,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-1.216623,0.110899,-0.421145,-0.146028,-0.216823,1.0,0.0


In [None]:
X = categorical_encoded.iloc[:,:-2]
Y = categorical_encoded.iloc[:,-2:]

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=42)
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((22759, 107), (22759, 2), (9755, 107), (9755, 2))

In [None]:
y_test

Unnamed: 0,<=50K,>50K
4421,1.0,0.0
821,1.0,0.0
10556,1.0,0.0
18536,1.0,0.0
16768,1.0,0.0
...,...,...
27610,1.0,0.0
29929,1.0,0.0
8648,1.0,0.0
10718,1.0,0.0


In [None]:
y_train = y_train['<=50K'].values
y_test = y_test['<=50K'].values

In [None]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Adaboost": AdaBoostClassifier()
}

model_list = []
r2_list = []
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)
    
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    ## testing set performence
    model_test_accuracy = accuracy_score(y_test,y_test_pred)
    model_test_f1 = f1_score(y_test,y_test_pred,average='weighted')
    model_test_precision = precision_score(y_test,y_test_pred)
    model_test_recall = recall_score(y_test,y_test_pred)
    model_test_roc_score = roc_auc_score(y_test,y_test_pred)
    
    
    ## below code is used to extraxt all models that we used in model building and put it into list..
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model Performance of Test data :')
    print('Model Accuracy Score is:{:.4f}'.format(model_test_accuracy))
    print('Model F1 Score is :{:.4f}'.format(model_test_f1))
    print('Model Precision Score is :{:.4f}'.format(model_test_precision))
    print('Model recall Score is :{:.4f}'.format(model_test_recall))
    print('Model ROC Score is :{:.4f}'.format(model_test_roc_score))
    
                   
    print('='*30)
    print('\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Model Performance of Test data :
Model Accuracy Score is:0.8517
Model F1 Score is :0.8456
Model Precision Score is :0.8784
Model recall Score is :0.9344
Model ROC Score is :0.7613


Decision Tree
Model Performance of Test data :
Model Accuracy Score is:0.8115
Model F1 Score is :0.8125
Model Precision Score is :0.8802
Model recall Score is :0.8708
Model ROC Score is :0.7467


Random Forest
Model Performance of Test data :
Model Accuracy Score is:0.8479
Model F1 Score is :0.8449
Model Precision Score is :0.8873
Model recall Score is :0.9165
Model ROC Score is :0.7729


Adaboost
Model Performance of Test data :
Model Accuracy Score is:0.8612
Model F1 Score is :0.8551
Model Precision Score is :0.8829
Model recall Score is :0.9426
Model ROC Score is :0.7723


