In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
df = pd.read_csv("adult.csv")

In [3]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,#NAME?,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43,Private,222971,5th-6th,3,Never-married,Machine-op-inspct,Unmarried,White,Female,0,0,40,Mexico,<=50K
4996,31,Private,259425,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
4997,47,Self-emp-inc,212120,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
4998,#NAME?,Private,245880,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,60,United-States,<=50K


In [4]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             5000 non-null   object
 1   workclass       5000 non-null   object
 2   fnlwgt          5000 non-null   object
 3   education       5000 non-null   object
 4   education_num   5000 non-null   object
 5   marital_status  5000 non-null   object
 6   occupation      5000 non-null   object
 7   relationship    5000 non-null   object
 8   race            5000 non-null   object
 9   sex             5000 non-null   object
 10  capital_gain    5000 non-null   int64 
 11  capital_loss    5000 non-null   int64 
 12  hours_per_week  5000 non-null   int64 
 13  native_country  5000 non-null   object
 14  income          5000 non-null   object
dtypes: int64(3), object(12)
memory usage: 586.1+ KB


In [6]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,capital_gain,capital_loss,hours_per_week
count,5000.0,5000.0,5000.0
mean,1033.6402,93.6968,40.519
std,7051.802077,410.801418,12.109193
min,0.0,0.0,1.0
25%,0.0,0.0,40.0
50%,0.0,0.0,40.0
75%,0.0,0.0,45.0
max,99999.0,2547.0,99.0


In [8]:
df1 = df.drop(["relationship","native_country","race","marital_status"],axis=1)

In [9]:
df1

Unnamed: 0,age,workclass,fnlwgt,education,education_num,occupation,sex,capital_gain,capital_loss,hours_per_week,income
0,39,State-gov,77516,Bachelors,13,Adm-clerical,Male,2174,0,40,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Exec-managerial,Male,0,0,13,<=50K
2,38,Private,215646,HS-grad,9,Handlers-cleaners,Male,0,0,40,<=50K
3,53,Private,234721,11th,7,Handlers-cleaners,#NAME?,0,0,40,<=50K
4,28,Private,338409,Bachelors,13,Prof-specialty,Female,0,0,40,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
4995,43,Private,222971,5th-6th,3,Machine-op-inspct,Female,0,0,40,<=50K
4996,31,Private,259425,HS-grad,9,Craft-repair,Male,0,0,40,>50K
4997,47,Self-emp-inc,212120,HS-grad,9,Craft-repair,Male,0,0,40,>50K
4998,#NAME?,Private,245880,HS-grad,9,Adm-clerical,Male,0,0,60,<=50K


In [10]:
df1.describe()

Unnamed: 0,capital_gain,capital_loss,hours_per_week
count,5000.0,5000.0,5000.0
mean,1033.6402,93.6968,40.519
std,7051.802077,410.801418,12.109193
min,0.0,0.0,1.0
25%,0.0,0.0,40.0
50%,0.0,0.0,40.0
75%,0.0,0.0,45.0
max,99999.0,2547.0,99.0


In [None]:
#Converting age and fnlwgt columns to float to fill mean values in Null value places

In [11]:
df1["age"] = df1["age"].replace({"#NAME?":np.nan})

In [12]:
df1["age"] = df1[["age"]].astype(np.float)

In [13]:
df1["age"]=df1.groupby("workclass")["age"].transform(lambda x:x.fillna(x.mean()))

In [14]:
df1["fnlwgt"].value_counts()

#NAME?    107
123983      5
163003      4
155343      4
111567      4
         ... 
36425       1
379066      1
217826      1
51170       1
80145       1
Name: fnlwgt, Length: 4507, dtype: int64

In [15]:
df1["fnlwgt"] = df1["fnlwgt"].replace({"#NAME?":np.nan})

In [16]:
df1["fnlwgt"] = df1["fnlwgt"].astype(np.float)

In [17]:
df1["fnlwgt"]=df1.groupby("age")["fnlwgt"].transform(lambda x:x.fillna(x.mean()))

In [18]:
df1["sex"].value_counts()

Male      3332
Female    1621
#NAME?      47
Name: sex, dtype: int64

In [None]:
#Filling other in null places as it is category type columns

In [19]:
df1["sex"] = df1["sex"].replace({"#NAME?":"other"})

In [20]:
df1["sex"].value_counts()

Male      3332
Female    1621
other       47
Name: sex, dtype: int64

In [21]:
df1["sex"].value_counts()

Male      3332
Female    1621
other       47
Name: sex, dtype: int64

In [22]:
df1["workclass"] = df1["workclass"].replace({"?":["Other"]})

In [23]:
df1["workclass"].value_counts()

Private             3435
Self-emp-not-inc     383
Other                331
Local-gov            329
State-gov            193
Self-emp-inc         182
Federal-gov          146
Without-pay            1
Name: workclass, dtype: int64

In [24]:
df1["education"].value_counts()

HS-grad         1597
Some-college    1114
Bachelors        819
Masters          251
Assoc-voc        215
11th             198
Assoc-acdm       163
10th             144
7th-8th           98
Prof-school       88
9th               74
?                 57
12th              55
Doctorate         55
5th-6th           43
1st-4th           22
Preschool          7
Name: education, dtype: int64

In [25]:
df1["education"] = df1["education"].replace({"?":["Other"]})

In [26]:
df1["occupation"].value_counts()

Prof-specialty       625
Craft-repair         619
Exec-managerial      618
Sales                588
Adm-clerical         576
Other-service        495
?                    331
Machine-op-inspct    312
Transport-moving     247
Handlers-cleaners    196
Farming-fishing      143
Tech-support         140
Protective-serv       90
Priv-house-serv       18
Armed-Forces           2
Name: occupation, dtype: int64

In [27]:
df1["occupation"] = df1["occupation"].replace({"?":["Other"]})

In [28]:
df1["education_num"].value_counts()

9         1597
10        1114
13         819
14         251
11         215
7          198
12         163
6          144
4           98
15          88
5           74
#NAME?      57
16          55
8           55
3           43
2           22
1            7
Name: education_num, dtype: int64

In [29]:
df1["education_num"] = df1["education_num"].replace({"#NAME?":np.nan})

In [30]:
df1["education_num"] = df1["education_num"].fillna(0)

In [31]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             5000 non-null   float64
 1   workclass       5000 non-null   object 
 2   fnlwgt          5000 non-null   float64
 3   education       5000 non-null   object 
 4   education_num   5000 non-null   object 
 5   occupation      5000 non-null   object 
 6   sex             5000 non-null   object 
 7   capital_gain    5000 non-null   int64  
 8   capital_loss    5000 non-null   int64  
 9   hours_per_week  5000 non-null   int64  
 10  income          5000 non-null   object 
dtypes: float64(2), int64(3), object(6)
memory usage: 429.8+ KB


In [32]:
df1.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
occupation        0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
income            0
dtype: int64

In [33]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             5000 non-null   float64
 1   workclass       5000 non-null   object 
 2   fnlwgt          5000 non-null   float64
 3   education       5000 non-null   object 
 4   education_num   5000 non-null   object 
 5   occupation      5000 non-null   object 
 6   sex             5000 non-null   object 
 7   capital_gain    5000 non-null   int64  
 8   capital_loss    5000 non-null   int64  
 9   hours_per_week  5000 non-null   int64  
 10  income          5000 non-null   object 
dtypes: float64(2), int64(3), object(6)
memory usage: 429.8+ KB


In [34]:
df1.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'occupation', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
       'income'],
      dtype='object')

In [35]:
columns_to_encode = ["workclass","occupation","education","sex"]

In [36]:
encoded_df = pd.get_dummies(df1[columns_to_encode])

In [37]:
modelling_df = pd.concat([df1.drop(["workclass","occupation","education","sex"],axis=1),encoded_df],axis=1)

In [38]:
modelling_df

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Other,...,education_Doctorate,education_HS-grad,education_Masters,education_Other,education_Preschool,education_Prof-school,education_Some-college,sex_Female,sex_Male,sex_other
0,39.000000,77516.0,13,2174,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,50.000000,83311.0,13,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,38.000000,215646.0,9,0,0,40,<=50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,53.000000,234721.0,7,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28.000000,338409.0,13,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43.000000,222971.0,3,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4996,31.000000,259425.0,9,0,0,40,>50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4997,47.000000,212120.0,9,0,0,40,>50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4998,36.948824,245880.0,9,0,0,60,<=50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [39]:
modelling_df["age"] = modelling_df["age"].astype("int64")

In [40]:
modelling_df["fnlwgt"] = modelling_df["fnlwgt"].astype("int64")

In [41]:
modelling_df

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Other,...,education_Doctorate,education_HS-grad,education_Masters,education_Other,education_Preschool,education_Prof-school,education_Some-college,sex_Female,sex_Male,sex_other
0,39,77516,13,2174,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,50,83311,13,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,38,215646,9,0,0,40,<=50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,53,234721,7,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,28,338409,13,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,43,222971,3,0,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4996,31,259425,9,0,0,40,>50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4997,47,212120,9,0,0,40,>50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4998,36,245880,9,0,0,60,<=50K,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [42]:
#Logistic Regression

In [43]:
modelling_df.shape

(5000, 50)

In [44]:
dfL = modelling_df[:4000]
validationL = modelling_df[4000:]

In [45]:
X = dfL.drop("income",axis=1)
Y = dfL["income"]

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X_train, X_test, Y_train, Y_test = train_test_split(
     X, Y, test_size=0.2, random_state=101)

In [48]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [49]:
log = LogisticRegression()

In [50]:
log.fit(X_train,Y_train)
predict = log.predict(X_test)

In [51]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,roc_auc_score

In [52]:
print(accuracy_score(Y_test,predict))
print(confusion_matrix(Y_test,predict))
print(classification_report(Y_test,predict))

0.78125
[[573  20]
 [155  52]]
              precision    recall  f1-score   support

       <=50K       0.79      0.97      0.87       593
        >50K       0.72      0.25      0.37       207

    accuracy                           0.78       800
   macro avg       0.75      0.61      0.62       800
weighted avg       0.77      0.78      0.74       800



In [53]:
modelling_df["income"].value_counts()

<=50K    3779
>50K     1221
Name: income, dtype: int64

In [54]:
validation_X = validationL.drop("income",axis=1)
validation_Y = validationL["income"]

In [55]:
predict_valid = log.predict(validation_X)

In [56]:
print(accuracy_score(validation_Y,predict_valid))
print(confusion_matrix(validation_Y,predict_valid))
print(classification_report(validation_Y,predict_valid))

0.809
[[740  23]
 [168  69]]
              precision    recall  f1-score   support

       <=50K       0.81      0.97      0.89       763
        >50K       0.75      0.29      0.42       237

    accuracy                           0.81      1000
   macro avg       0.78      0.63      0.65      1000
weighted avg       0.80      0.81      0.78      1000



In [57]:
#Decision tree and Random forest

In [58]:
X1 = modelling_df.drop("income",axis=1)
Y1 = modelling_df["income"]

In [59]:
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.2, random_state=101)

In [63]:
def fit_predict(train, test, Y1_train, Y1_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, Y1_train)
    Y1_pred = dt.predict(test_scaled)
    print(accuracy_score(Y1_test, Y1_pred))

In [65]:
tree = DecisionTreeClassifier()
tree.fit(X1_train, Y1_train)
Y1_pred = tree.predict(X1_test)
print(accuracy_score(Y1_test, Y1_pred))

0.76


In [None]:
#MAX Depth Tuning

In [66]:
max_depth = []
for i in range(1, 30):
    print("Accuracy using Max Depth =", i, end = ': ')
    fit_predict(X1_train, X1_test, Y1_train, Y1_test, StandardScaler(), i)

#Print("Max Depth:",max(max_depth))

Accuracy using Max Depth = 1: 0.761
Accuracy using Max Depth = 2: 0.766
Accuracy using Max Depth = 3: 0.764
Accuracy using Max Depth = 4: 0.773
Accuracy using Max Depth = 5: 0.768
Accuracy using Max Depth = 6: 0.769
Accuracy using Max Depth = 7: 0.774
Accuracy using Max Depth = 8: 0.769
Accuracy using Max Depth = 9: 0.771
Accuracy using Max Depth = 10: 0.764
Accuracy using Max Depth = 11: 0.763
Accuracy using Max Depth = 12: 0.763
Accuracy using Max Depth = 13: 0.769
Accuracy using Max Depth = 14: 0.753
Accuracy using Max Depth = 15: 0.764
Accuracy using Max Depth = 16: 0.763
Accuracy using Max Depth = 17: 0.765
Accuracy using Max Depth = 18: 0.769
Accuracy using Max Depth = 19: 0.776
Accuracy using Max Depth = 20: 0.759
Accuracy using Max Depth = 21: 0.748
Accuracy using Max Depth = 22: 0.754
Accuracy using Max Depth = 23: 0.766
Accuracy using Max Depth = 24: 0.749
Accuracy using Max Depth = 25: 0.772
Accuracy using Max Depth = 26: 0.783
Accuracy using Max Depth = 27: 0.761
Accuracy u

In [None]:
#MAX Depth here is 26

In [None]:
#MAX Features split

In [67]:
for i in np.arange(0.1, 1.0, 0.1):
    print("Accuracy using Max Feature =", i, end = ': ')
    fit_predict(X1_train, X1_test, Y1_train, Y1_test, StandardScaler(), max_depth = 26, max_features= i)

Accuracy using Max Feature = 0.1: 0.774
Accuracy using Max Feature = 0.2: 0.786
Accuracy using Max Feature = 0.30000000000000004: 0.783
Accuracy using Max Feature = 0.4: 0.764
Accuracy using Max Feature = 0.5: 0.773
Accuracy using Max Feature = 0.6: 0.772
Accuracy using Max Feature = 0.7000000000000001: 0.768
Accuracy using Max Feature = 0.8: 0.756
Accuracy using Max Feature = 0.9: 0.763


In [None]:
#MAX feature here is 0.2

In [None]:
#MIN Sample spit

In [68]:
for i in range(2, 10):
    print("Accuracy using Min Sample Split =", i, end = ': ')
    fit_predict(X1_train, X1_test, Y1_train, Y1_test, StandardScaler(), max_depth = 26, min_samples_split= i)

Accuracy using Min Sample Split = 2: 0.78
Accuracy using Min Sample Split = 3: 0.756
Accuracy using Min Sample Split = 4: 0.783
Accuracy using Min Sample Split = 5: 0.759
Accuracy using Min Sample Split = 6: 0.758
Accuracy using Min Sample Split = 7: 0.732
Accuracy using Min Sample Split = 8: 0.783
Accuracy using Min Sample Split = 9: 0.774


In [None]:
#Criterion tuning

In [69]:
for i in ['gini', 'entropy']:
    print("Accuracy using Criterion =", i, end = ': ')
    fit_predict(X1_train, X1_test, Y1_train, Y1_test, StandardScaler(), max_depth = 26, 
                criterion = i)

Accuracy using Criterion = gini: 0.78
Accuracy using Criterion = entropy: 0.783


In [None]:
#Degree of polynomial

In [70]:
def create_poly(train, test, degree):
    poly = PolynomialFeatures(degree = degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly, test_poly

In [73]:
for degree in [1, 2, 3]:
    train_poly, test_poly = create_poly(X1_train, X1_test, degree)
    print('Polynomial Degree:', degree)
    fit_predict(train_poly, test_poly, Y1_train, Y1_test, StandardScaler(), max_depth = 26)
    print(10*'-')
    
train_poly, test_poly = create_poly(X1_train, X1_test, 2)

Polynomial Degree: 1
0.774
----------
Polynomial Degree: 2
0.76
----------
Polynomial Degree: 3
0.775
----------


In [74]:
train_poly, test_poly = create_poly(X1_train, X1_test, 2)

fit_predict(train_poly, test_poly, Y1_train, Y1_test, StandardScaler(), max_depth = 26)

0.76


In [None]:
#Random forest

In [75]:
from sklearn.ensemble import RandomForestClassifier

In [76]:
rf = RandomForestClassifier()
rf.fit(X1_train, Y1_train)
Y1_pred = rf.predict(X1_test)
print("RandomForest Accuracy:",accuracy_score(Y1_test, Y1_pred))

RandomForest Accuracy: 0.804


In [77]:
rf.base_estimator

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [78]:
from sklearn.model_selection import GridSearchCV

In [79]:
params = {'n_estimators': [200,500,700], 'max_depth': [6,12,18,20,26], 'min_samples_leaf': [2, 3, 5]}

In [80]:
gsv = GridSearchCV(rf, params, verbose = 3)

In [81]:
gsv.fit(X1_train, Y1_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] max_depth=6, min_samples_leaf=2, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=6, min_samples_leaf=2, n_estimators=200, score=0.843, total=   0.8s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  max_depth=6, min_samples_leaf=2, n_estimators=200, score=0.838, total=   0.7s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=200 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.4s remaining:    0.0s


[CV]  max_depth=6, min_samples_leaf=2, n_estimators=200, score=0.829, total=   0.7s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=200 ...............
[CV]  max_depth=6, min_samples_leaf=2, n_estimators=200, score=0.839, total=   0.7s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=200 ...............
[CV]  max_depth=6, min_samples_leaf=2, n_estimators=200, score=0.830, total=   0.7s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=6, min_samples_leaf=2, n_estimators=500, score=0.841, total=   1.7s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=6, min_samples_leaf=2, n_estimators=500, score=0.835, total=   1.7s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=6, min_samples_leaf=2, n_estimators=500, score=0.828, total=   1.9s
[CV] max_depth=6, min_samples_leaf=2, n_estimators=500 ...............
[CV]  max_depth=6, min_samples_leaf=2, n_estimators=500, score=0.844, 

[Parallel(n_jobs=1)]: Done 225 out of 225 | elapsed:  7.8min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [83]:
gsv.best_params_

{'max_depth': 26, 'min_samples_leaf': 3, 'n_estimators': 700}

In [84]:
gsv.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=26, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [85]:
rf1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                            max_depth=26, max_features='auto', max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=3, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=700,n_jobs=None,
                            oob_score=True, random_state=101, verbose=0, warm_start=False)

In [86]:
rf1.fit(X1_train,Y1_train)
Y1_pred = rf1.predict(X1_test)
print("RandomForest Accuracy:",accuracy_score(Y1_test, Y1_pred))

RandomForest Accuracy: 0.831


In [87]:
rf1.oob_score_

0.83725

In [88]:
FI = rf1.feature_importances_

In [89]:
FI

array([1.85587175e-01, 9.00541449e-02, 1.15383714e-01, 1.76548446e-01,
       4.52947346e-02, 9.40418782e-02, 6.43592178e-03, 5.90923273e-03,
       3.01331567e-03, 1.08226574e-02, 1.40180001e-02, 7.08886022e-03,
       3.03029635e-03, 0.00000000e+00, 6.38565427e-03, 0.00000000e+00,
       6.92092657e-03, 4.43672192e-02, 4.15785282e-03, 2.46786704e-03,
       3.31214130e-03, 2.93191611e-03, 9.72356404e-03, 4.34013346e-05,
       2.02128534e-02, 2.69147666e-03, 9.55707413e-03, 4.76737012e-03,
       4.27574425e-03, 2.02766449e-03, 1.51048288e-03, 2.42293758e-04,
       1.34533523e-04, 3.33012849e-04, 1.67130966e-03, 6.98280183e-04,
       2.95137565e-03, 3.18606002e-03, 1.70794153e-02, 4.67107669e-03,
       7.88533221e-03, 9.92620294e-03, 1.81938601e-03, 1.95381119e-06,
       1.14284100e-02, 6.61285411e-03, 2.23708019e-02, 2.57555414e-02,
       6.50574616e-04])

In [90]:
sorted(list(zip(FI, X1_train.columns)), reverse=True)

[(0.18558717453730714, 'age'),
 (0.1765484464317555, 'capital_gain'),
 (0.11538371376022502, 'education_num'),
 (0.0940418782070826, 'hours_per_week'),
 (0.09005414490361623, 'fnlwgt'),
 (0.04529473456740924, 'capital_loss'),
 (0.04436721921925841, 'occupation_Exec-managerial'),
 (0.025755541435098592, 'sex_Male'),
 (0.022370801888645, 'sex_Female'),
 (0.020212853377476622, 'occupation_Prof-specialty'),
 (0.01707941534267116, 'education_Bachelors'),
 (0.014018000083891478, 'workclass_Self-emp-inc'),
 (0.011428410001409792, 'education_Prof-school'),
 (0.010822657449931366, 'workclass_Private'),
 (0.009926202937502585, 'education_Masters'),
 (0.009723564036258191, 'occupation_Other-service'),
 (0.009557074128251642, 'occupation_Sales'),
 (0.00788533221393955, 'education_HS-grad'),
 (0.0070888602157223885, 'workclass_Self-emp-not-inc'),
 (0.006920926572311059, 'occupation_Craft-repair'),
 (0.006612854114995777, 'education_Some-college'),
 (0.006435921778218898, 'workclass_Federal-gov'),
 

In [None]:
#Confusion matrix & Classification Report

In [91]:
print(confusion_matrix(Y1_test, Y1_pred))
print(classification_report(Y1_test, Y1_pred))

[[720  35]
 [134 111]]
              precision    recall  f1-score   support

       <=50K       0.84      0.95      0.89       755
        >50K       0.76      0.45      0.57       245

    accuracy                           0.83      1000
   macro avg       0.80      0.70      0.73      1000
weighted avg       0.82      0.83      0.81      1000



In [None]:
#KNN Algorithm

In [93]:
scaler = StandardScaler()

In [94]:
scaler.fit(modelling_df.drop("income",axis=1))

StandardScaler(copy=True, with_mean=True, with_std=True)

In [95]:
scaled_features = scaler.transform(modelling_df.drop("income", axis=1))

In [96]:
X2_train, X2_test, Y2_train, Y2_test = train_test_split(scaled_features, df["income"], test_size=0.30, random_state=101)

In [97]:
from sklearn.neighbors import KNeighborsClassifier

In [98]:
knn = KNeighborsClassifier(n_neighbors = 1)

In [99]:
knn.fit(X2_train, Y2_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [100]:
pred = knn.predict(X2_test)

In [101]:
from sklearn.metrics import classification_report, confusion_matrix

In [102]:
print(classification_report(Y2_test, pred),confusion_matrix(Y2_test, pred))

              precision    recall  f1-score   support

       <=50K       0.82      0.82      0.82      1128
        >50K       0.46      0.46      0.46       372

    accuracy                           0.73      1500
   macro avg       0.64      0.64      0.64      1500
weighted avg       0.73      0.73      0.73      1500
 [[925 203]
 [200 172]]


In [103]:
from sklearn.model_selection import GridSearchCV

In [104]:
params = {'n_neighbors': range(1,40)}

In [106]:
gsv1 = GridSearchCV(knn, params, verbose = 3)

In [107]:
gsv1.fit(X2_train, Y2_train)

Fitting 5 folds for each of 39 candidates, totalling 195 fits
[CV] n_neighbors=1 ...................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....................... n_neighbors=1, score=0.744, total=   0.3s
[CV] n_neighbors=1 ...................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ....................... n_neighbors=1, score=0.747, total=   0.3s
[CV] n_neighbors=1 ...................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ....................... n_neighbors=1, score=0.764, total=   0.3s
[CV] n_neighbors=1 ...................................................
[CV] ....................... n_neighbors=1, score=0.739, total=   0.3s
[CV] n_neighbors=1 ...................................................
[CV] ....................... n_neighbors=1, score=0.726, total=   0.3s
[CV] n_neighbors=2 ...................................................
[CV] ....................... n_neighbors=2, score=0.799, total=   0.3s
[CV] n_neighbors=2 ...................................................
[CV] ....................... n_neighbors=2, score=0.780, total=   0.3s
[CV] n_neighbors=2 ...................................................
[CV] ....................... n_neighbors=2, score=0.780, total=   0.3s
[CV] n_neighbors=2 ...................................................
[CV] ....................... n_neighbors=2, score=0.796, total=   0.3s
[CV] n_neighbors=2 ...................................................
[CV] .

[Parallel(n_jobs=1)]: Done 195 out of 195 | elapsed:   56.1s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=1, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(1, 40)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=3)

In [108]:
gsv1.best_params_

{'n_neighbors': 29}

In [109]:
knn29 = KNeighborsClassifier(n_neighbors=29)

In [110]:
knn29.fit(X2_train,Y2_train)
pred29 = knn.predict(X2_test)

In [111]:
print(classification_report(Y2_test, pred29),confusion_matrix(Y2_test, pred29))

              precision    recall  f1-score   support

       <=50K       0.82      0.82      0.82      1128
        >50K       0.46      0.46      0.46       372

    accuracy                           0.73      1500
   macro avg       0.64      0.64      0.64      1500
weighted avg       0.73      0.73      0.73      1500
 [[925 203]
 [200 172]]
