## <span style='color:red '>1.0 Importing required libraries</span>

In [None]:
### Pandas and Numpy
import pandas as pd
import numpy as np

### MongoDB Library
import pymongo

### Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### To ignore warnings
import warnings
warnings.filterwarnings('ignore')

## <span style='color:red '>2.0 Retrieving data from MongoDB</span>

In [2]:
### Retriving data from Mongodb
### creating connection with MongoDB

client = pymongo.MongoClient("mongodb+srv://{username}:{password}@clustershub.jujlbeo.mongodb.net/?retryWrites=true&w=majority")

In [3]:
db=client['Census_income']
collection=db['Census_income_data']

In [4]:
### Locating our collection and data in MongoDb using find() method
data_from_mongodb=collection.find()

In [5]:
### converting data from MongoDb to Dataframe in pandas
data_mongodb=pd.DataFrame(data_from_mongodb)

In [6]:
### first 5 records in dataset
data_mongodb.head()

Unnamed: 0,_id,index,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,63635b0506652a035edadc4c,31,20,Private,266015,Some_college,10,Never_married,Sales,Own_child,other,Male,0,0,44,United_States,0
1,63635b0506652a035edadc2d,0,39,other,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,0
2,63635b0506652a035edadc3a,13,32,Private,205019,other,12,Never_married,Sales,Not_in_family,other,Male,0,0,50,United_States,0
3,63635b0506652a035edadc3d,16,25,other,176756,HS_grad,9,Never_married,other,Own_child,White,Male,0,0,35,United_States,0
4,63635b0506652a035edadc2e,1,50,other,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States,0


In [7]:
### dropping _id and index feature from dataset imported from MongoDB
data_mongodb.drop(['_id','index'], axis=1, inplace=True)
data_mongodb.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary
0,20,Private,266015,Some_college,10,Never_married,Sales,Own_child,other,Male,0,0,44,United_States,0
1,39,other,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States,0
2,32,Private,205019,other,12,Never_married,Sales,Not_in_family,other,Male,0,0,50,United_States,0
3,25,other,176756,HS_grad,9,Never_married,other,Own_child,White,Male,0,0,35,United_States,0
4,50,other,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States,0


## <span style='color:red '>3.0 Model and Evaluation</span>

### <span style='color:red '>3.1 Seperating Independent and Dependent features</span>

In [8]:
### Splitting data into independent feature dataframe and dependent feature series
X=data_mongodb.iloc[:,:-1]
y=data_mongodb.iloc[:,-1]
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,20,Private,266015,Some_college,10,Never_married,Sales,Own_child,other,Male,0,0,44,United_States
1,39,other,77516,Bachelors,13,Never_married,Adm_clerical,Not_in_family,White,Male,2174,0,40,United_States
2,32,Private,205019,other,12,Never_married,Sales,Not_in_family,other,Male,0,0,50,United_States
3,25,other,176756,HS_grad,9,Never_married,other,Own_child,White,Male,0,0,35,United_States
4,50,other,83311,Bachelors,13,Married_civ_spouse,Exec_managerial,Husband,White,Male,0,0,13,United_States


In [9]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: int64

### <span style='color:red '>3.2 Train Test Split</span>

In [10]:
### random state train test split will be same with all people using random_state=19

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=19)
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
34576,30,Private,23778,Some_college,10,Never_married,Exec_managerial,Not_in_family,White,Male,4416,0,40,United_States
33148,41,Private,112763,Some_college,10,Married_civ_spouse,Adm_clerical,other,White,Female,0,0,35,United_States
2109,20,Private,241752,HS_grad,9,Married_civ_spouse,other,Husband,White,Male,0,0,40,United_States
33501,35,Private,211494,Bachelors,13,Never_married,Exec_managerial,Not_in_family,White,Male,0,1876,55,United_States
47110,24,Private,408585,other,4,Married_civ_spouse,other,Own_child,White,Female,0,0,45,other


In [11]:
y_train.head()

34576    0
33148    0
2109     0
33501    0
47110    0
Name: salary, dtype: int64

In [12]:
X_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
3436,52,Private,48925,Some_college,10,Married_civ_spouse,Adm_clerical,Husband,White,Male,0,0,40,United_States
16332,27,Private,31659,Bachelors,13,Married_civ_spouse,other,Husband,White,Male,0,1887,60,United_States
39798,67,Private,187553,other,4,Divorced,Prof_specialty,Not_in_family,White,Male,0,0,40,United_States
12405,45,other,255559,HS_grad,9,Never_married,Adm_clerical,Not_in_family,White,Female,0,0,40,United_States
7584,32,Private,169955,Some_college,10,Married_civ_spouse,Other_service,other,White,Female,0,0,36,other


In [13]:
y_test.head()

3436     0
16332    1
39798    0
12405    0
7584     0
Name: salary, dtype: int64

In [14]:

### both will have same shape
X_train.shape, y_train.shape

((36609, 14), (36609,))

In [15]:
### both will have same shape
X_test.shape, y_test.shape

((12204, 14), (12204,))

### <span style='color:red '>3.3 Feature Encoding</span>

In [16]:
column_trans=make_column_transformer(
        (OneHotEncoder(), ['workclass','education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']), 
        remainder='passthrough')

In [17]:
X_train=column_trans.fit_transform(X_train)

In [18]:
X_test=column_trans.transform(X_test)

### <span style='color:red '>3.4 Feature Scaling</span>

In [19]:
scaler=StandardScaler()

In [20]:
X_train=scaler.fit_transform(X_train)

In [21]:
X_test=scaler.transform(X_test)

In [22]:
X_train_scaled=pd.DataFrame(X_train)
X_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0.573765,-0.573765,-0.444065,-0.691388,1.869548,-0.638185,-0.394399,-0.923205,1.426647,-0.286388,...,-0.700944,0.700944,0.305443,-0.305443,-0.630924,-1.576581,-0.025555,0.461589,-0.21626,-0.036322
1,0.573765,-0.573765,-0.444065,-0.691388,1.869548,-0.638185,-0.394399,1.083183,-0.700944,-0.286388,...,1.426647,-1.426647,0.305443,-0.305443,0.171508,-0.730881,-0.025555,-0.144427,-0.21626,-0.441398
2,0.573765,-0.573765,-0.444065,1.446367,-0.534889,-0.638185,-0.394399,1.083183,-0.700944,-0.286388,...,-0.700944,0.700944,0.305443,-0.305443,-1.360408,0.495011,-0.414708,-0.144427,-0.21626,-0.036322
3,0.573765,-0.573765,2.25192,-0.691388,-0.534889,-0.638185,-0.394399,-0.923205,1.426647,-0.286388,...,-0.700944,0.700944,0.305443,-0.305443,-0.266182,0.207443,1.141907,-0.144427,4.437052,1.178908
4,0.573765,-0.573765,-0.444065,-0.691388,-0.534889,1.566943,-0.394399,1.083183,-0.700944,-0.286388,...,1.426647,-1.426647,-3.273932,3.273932,-1.068614,2.080566,-2.360478,-0.144427,-0.21626,0.368755


In [23]:
X_test_scaled=pd.DataFrame(X_test)
X_test_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0.573765,-0.573765,-0.444065,-0.691388,1.869548,-0.638185,-0.394399,1.083183,-0.700944,-0.286388,...,-0.700944,0.700944,0.305443,-0.305443,0.97394,-1.337588,-0.025555,-0.144427,-0.21626,-0.036322
1,0.573765,-0.573765,2.25192,-0.691388,-0.534889,-0.638185,-0.394399,1.083183,-0.700944,-0.286388,...,-0.700944,0.700944,0.305443,-0.305443,-0.849769,-1.501681,1.141907,-0.144427,4.464337,1.583984
2,0.573765,-0.573765,-0.444065,-0.691388,-0.534889,1.566943,2.535503,-0.923205,-0.700944,-0.286388,...,-0.700944,0.700944,0.305443,-0.305443,2.068166,-0.020088,-2.360478,-0.144427,-0.21626,-0.036322
3,-1.742874,1.742874,-0.444065,1.446367,-0.534889,-0.638185,-0.394399,-0.923205,1.426647,-0.286388,...,1.426647,-1.426647,0.305443,-0.305443,0.463302,0.62623,-0.414708,-0.144427,-0.21626,-0.036322
4,0.573765,-0.573765,-0.444065,-0.691388,1.869548,-0.638185,-0.394399,1.083183,-0.700944,-0.286388,...,1.426647,-1.426647,-3.273932,3.273932,-0.485027,-0.187337,-0.025555,-0.144427,-0.21626,-0.360383


### <span style='color:red '>3.5 Logestic Regression Model</span>

In [24]:
### model
logistic_reg=LogisticRegression()
logistic_reg

LogisticRegression()

In [25]:
logistic_reg.fit(X_train, y_train)

LogisticRegression()

In [26]:
logistic_reg_pred=logistic_reg.predict(X_test)
logistic_reg_pred

array([0, 1, 0, ..., 0, 0, 0])

In [27]:
confusion_mat=confusion_matrix(y_test, logistic_reg_pred)
confusion_mat

array([[8618,  644],
       [1175, 1767]])

In [28]:
truly_positive=confusion_mat[0][0]
falsely_positive=confusion_mat[0][1]
falsely_negative=confusion_mat[1][0]
truly_negative=confusion_mat[1][1]

In [29]:
classification_rep_log_reg=classification_report(y_test, logistic_reg_pred)
print(classification_rep_log_reg)

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      9262
           1       0.73      0.60      0.66      2942

    accuracy                           0.85     12204
   macro avg       0.81      0.77      0.78     12204
weighted avg       0.84      0.85      0.85     12204



### <span style='color:red '>3.6 Support Vector Classifier Model</span>

In [30]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.25, random_state=19)

In [31]:
column_trans_svc=make_column_transformer(
        (OneHotEncoder(), ['workclass','education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']), 
        remainder='passthrough')

In [32]:
X_train1=column_trans_svc.fit_transform(X_train1)

In [33]:
X_test1=column_trans_svc.transform(X_test1)

In [34]:
scaler_svc=StandardScaler()
scaler_svc

StandardScaler()

In [35]:
X_train1=scaler_svc.fit_transform(X_train1)

In [36]:
X_test1=scaler_svc.transform(X_test1)

In [None]:
svc=SVC()
svc

SVC()

In [None]:
svc.fit(X_train1, y_train1)

SVC()

In [None]:
svc_pred=svc.predict(X_test1)
svc_pred

array([0, 1, 0, ..., 0, 0, 0])

In [None]:
confusion_mat_svc=confusion_matrix(y_test1, svc_pred)
confusion_mat_svc

array([[8718,  544],
       [1256, 1686]])

In [None]:
truly_positive=confusion_mat_svc[0][0]
falsely_positive=confusion_mat_svc[0][1]
falsely_negative=confusion_mat_svc[1][0]
truly_negative=confusion_mat_svc[1][1]

In [None]:
classification_rep_svc=classification_report(y_test1, svc_pred)
print(classification_rep_svc)

              precision    recall  f1-score   support

           0       0.87      0.94      0.91      9262
           1       0.76      0.57      0.65      2942

    accuracy                           0.85     12204
   macro avg       0.82      0.76      0.78     12204
weighted avg       0.85      0.85      0.85     12204



### <span style='color:red '>3.7 Hyper-Parameter Tuning Logistic Regression Model</span>

In [None]:
param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 5),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 500]
    }
]

In [None]:
log_reg_hpt=LogisticRegression()
log_reg_hpt

LogisticRegression()

In [None]:
hpt_log_reg=GridSearchCV(log_reg_hpt, param_grid = param_grid)

In [None]:
best_hpt_log_reg=hpt_log_reg.fit(X_train, y_train)
best_hpt_log_reg

GridSearchCV(estimator=LogisticRegression(),
             param_grid=[{'C': array([1.e-04, 1.e-02, 1.e+00, 1.e+02, 1.e+04]),
                          'max_iter': [100, 500],
                          'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                          'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag',
                                     'saga']}])

In [None]:
### getting best parameters for Logistic Regression model after gridsearchCV
print("Best parameters are {} for optimal accuracy.".format(best_hpt_log_reg.best_estimator_))

Best parameters are LogisticRegression(C=0.01, penalty='l1', solver='liblinear') for optimal accuracy.


In [None]:
### getting best accuracy for Logistic Regression model after gridsearchCV
print("Best accuracy is {}".format(best_hpt_log_reg.score(X_test, y_test)))

Best accuracy is 0.8504588659455916


### <span style='color:red '>3.8 Hyper-Parameter Tuning Support Vector Classifier Model</span>

In [37]:
svc_hpt=SVC()
svc_hpt

SVC()

In [38]:
#### using gridsearchcv to increase model efficiency by combining above parameters
param_grid={'C':[1,2,3], 'kernel':['rbf']}
hpt_svc=GridSearchCV(svc_hpt, param_grid=param_grid)

In [39]:
best_hpt_svc=hpt_svc.fit(X_train1, y_train1)
best_hpt_svc

GridSearchCV(estimator=SVC(),
             param_grid={'C': [10, 20], 'degree': [2, 3],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']})

In [40]:
### getting best parameters for Logistic Regression model after gridsearchCV
print("Best parameters are {} for optimal accuracy.".format(best_hpt_svc.best_estimator_))

Best parameters are SVC(C=20, degree=2, kernel='poly') for optimal accuracy.


In [41]:
### getting best accuracy for Logistic Regression model after gridsearchCV
print("Best accuracy is {}".format(best_hpt_svc.score(X_test, y_test)))

Best accuracy is 0.8491478203867584


In [42]:
svc_hpt1=SVC()
svc_hpt1

SVC()

In [43]:
#### using gridsearchcv to increase model efficiency by combining above parameters
param_grid1={'C':[1,2,3], 'kernel':['rbf']}
hpt_svc1=GridSearchCV(svc_hpt1, param_grid=param_grid1)

In [44]:
best_hpt_svc1=hpt_svc1.fit(X_train1, y_train1)
best_hpt_svc1

GridSearchCV(estimator=SVC(), param_grid={'C': [1, 2, 3], 'kernel': ['rbf']})

In [45]:
### getting best parameters for Logistic Regression model after gridsearchCV
print("Best parameters are {} for optimal accuracy.".format(best_hpt_svc1.best_estimator_))

Best parameters are SVC(C=2) for optimal accuracy.


In [47]:
### getting best accuracy for Logistic Regression model after gridsearchCV
print("Best accuracy is {}".format(best_hpt_svc1.score(X_test1, y_test1)))

Best accuracy is 0.8517699115044248


**Note: Please refer my github repo for ROC AUC curve implementation**