In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV

# data understanding and cleaning

In [2]:
#read csv file and put it into df object
df = pd.read_csv('adult_dataset.csv')


In [3]:
#see the type of data
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [4]:
# select all categorical variables
df_categorical = df.select_dtypes(include=['object'])
# checking whether any other columns contain a "?"
df_categorical.apply(lambda x: x=="?", axis=0).sum()

workclass         1836
education            0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
native.country     583
income               0
dtype: int64

In [5]:
#removing the null values from the data(? resembles the null values in this dataset)
df_1 = df[df.workclass != '?']
df_1 = df_1[df_1.occupation != '?']
df_1 = df_1[df_1['native.country'] != '?']

In [6]:
#checking the shape to know how many rows are removed from the dataset
df_1.shape
df_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 1 to 32560
Data columns (total 15 columns):
age               30162 non-null int64
workclass         30162 non-null object
fnlwgt            30162 non-null int64
education         30162 non-null object
education.num     30162 non-null int64
marital.status    30162 non-null object
occupation        30162 non-null object
relationship      30162 non-null object
race              30162 non-null object
sex               30162 non-null object
capital.gain      30162 non-null int64
capital.loss      30162 non-null int64
hours.per.week    30162 non-null int64
native.country    30162 non-null object
income            30162 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
#one hot encoding
df_1_categorical = df_1.select_dtypes(include=['object'])
income_dummies = pd.get_dummies(df_1_categorical, drop_first=True)
income_dummies.head()

Unnamed: 0,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,education_1st-4th,education_5th-6th,...,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_>50K
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
#removing the original columns for which onehot encoding is done
df_1 = df_1.drop(list(df_1_categorical.columns), axis=1)
#concatinating df_1 and income dummies
df_1=pd.concat([df_1, income_dummies],axis=1)

In [9]:
#checking the shape to know how many columns have been added in the dataset after onehot encoding
df_1

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_>50K
1,82,132870,9,0,4356,18,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,54,140359,4,0,3900,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,41,264663,10,0,3900,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
5,34,216864,9,0,3770,45,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
6,38,150601,6,0,3770,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,310152,10,0,0,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
32557,27,257302,12,0,0,38,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
32558,40,154374,9,0,0,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
32559,58,151910,9,0,0,40,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


# Data Splitting

In [10]:
#dividing input and output columns (input festure variables to X and response/output to Y)
def inputOutput(df):
    X = df.drop('income_>50K', axis=1)
    Y = df['income_>50K']
    return X,Y

In [11]:
#splitting the data for testing and training
def split(X,Y):
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8,test_size = 0.2, random_state=100)
    return X_train, X_test, Y_train, Y_test

In [12]:
# while using some algorithms I have used only a part of dataset due to memory issues
def sampleSplit(df):
    x_sample= df.drop('income_>50K',axis=1)[27000:30000]
    y_sample=df['income_>50K'][27000:30000]
    return x_sample, y_sample

# Decision Tree with RFE to reduce dimentionality

In [13]:
#using decision tree with RFE to reduce the no.of columns from 97 to 50
D_tree = DecisionTreeClassifier()
rfe = RFE(D_tree,50)
X,Y = inputOutput(df_1)
rfe_1 = rfe.fit(X,Y)

In [14]:
y_original=df_1['income_>50K']

In [15]:
col = X.columns[rfe.support_]
df_2=pd.concat([X[col], y_original],axis=1)

In [16]:
df_2.shape

(30162, 51)

# Normalization

In [17]:
#used MinMaxScaler to scale all my column values to be between 0 and 1
scaler=MinMaxScaler(feature_range=(0,1))
df_3=scaler.fit_transform(df_2)
df_3=pd.DataFrame(df_3)
df_3.columns = df_2.columns
df_3.shape

(30162, 51)

In [18]:
df_3.head(10)

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native.country_Canada,native.country_Cuba,native.country_England,native.country_Germany,native.country_India,native.country_Italy,native.country_Philippines,native.country_South,native.country_United-States,income_>50K
0,0.890411,0.08097,0.533333,0.0,1.0,0.173469,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.506849,0.086061,0.2,0.0,0.895317,0.397959,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.328767,0.170568,0.6,0.0,0.895317,0.397959,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.232877,0.138072,0.533333,0.0,0.865473,0.44898,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.287671,0.093024,0.333333,0.0,0.865473,0.397959,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.780822,0.050899,1.0,0.0,0.8455,0.193878,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
6,0.69863,0.27754,0.533333,0.0,0.8455,0.397959,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.383562,0.107758,1.0,0.0,0.689624,0.346939,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
8,0.287671,0.102491,0.933333,0.0,0.648301,0.44898,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
9,0.479452,0.078459,0.8,0.0,0.648301,0.193878,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [19]:
#correlation matrix
df_3.corr(method ='pearson') 

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native.country_Canada,native.country_Cuba,native.country_England,native.country_Germany,native.country_India,native.country_Italy,native.country_Philippines,native.country_South,native.country_United-States,income_>50K
age,1.0,-0.076511,0.043526,0.080154,0.060165,0.101599,0.068256,-0.210491,0.111039,0.150429,...,0.013974,0.029878,0.011472,0.00625,-0.001528,0.029124,0.007944,0.001923,0.016259,0.241998
fnlwgt,-0.076511,1.0,-0.044992,0.000422,-0.00975,-0.022886,-0.00352,0.046589,-0.025496,-0.041453,...,-0.004815,0.030507,-0.00329,0.000673,-0.011603,-0.003065,-0.018957,-0.010598,-0.08339,-0.008957
education.num,0.043526,-0.044992,1.0,0.124416,0.079646,0.152522,0.097378,-0.165069,0.078843,0.010605,...,0.012907,-0.009233,0.021109,0.025296,0.052218,-0.025838,0.026299,0.017812,0.127207,0.335286
capital.gain,0.080154,0.000422,0.124416,1.0,-0.032229,0.080432,-0.009624,-0.048185,0.096482,0.033323,...,0.00421,-0.005324,-0.001351,-0.001256,0.019569,-0.002795,-0.000208,-0.002582,0.012375,0.221196
capital.loss,0.060165,-0.00975,0.079646,-0.032229,1.0,0.052417,0.014727,-0.036377,0.030956,0.020221,...,0.008643,-0.002195,-0.000234,-0.000793,0.006098,-0.006972,0.003104,0.005677,0.015119,0.150053
hours.per.week,0.101599,-0.022886,0.152522,0.080432,0.052417,1.0,0.001612,-0.095533,0.126254,0.087835,...,0.002716,-0.005554,0.006899,0.004504,0.002883,0.005116,-0.008794,0.014328,0.010673,0.22948
workclass_Local-gov,0.068256,-0.00352,0.097378,-0.009624,0.014727,0.001612,1.0,-0.456267,-0.05212,-0.081525,...,-0.002941,-0.005485,-0.002199,0.000461,-0.011078,-0.00736,-0.004808,-0.013175,0.032937,0.028673
workclass_Private,-0.210491,0.046589,-0.165069,-0.048185,-0.036377,-0.095533,-0.456267,1.0,-0.323228,-0.505588,...,-7.6e-05,0.001401,-0.000769,0.002814,-0.005105,-0.005162,0.015432,-0.014735,-0.049809,-0.117218
workclass_Self-emp-inc,0.111039,-0.025496,0.078843,0.096482,0.030956,0.126254,-0.05212,-0.323228,1.0,-0.057754,...,0.00659,0.018571,-0.003564,0.003969,0.010704,0.002183,-0.010671,0.01651,0.00735,0.137646
workclass_Self-emp-not-inc,0.150429,-0.041453,0.010605,0.033323,0.020221,0.087835,-0.081525,-0.505588,-0.057754,1.0,...,0.004318,0.000823,0.004229,-0.00852,-0.002689,0.01868,-0.016163,0.035038,0.01452,0.025575


# Decision Tree (entropy)along with GridSearchCV

In [20]:
#gridSearchCV is used to find the best value for the hyper parameters (min_samples_leaf, min_samples_split)
param_grid = {
    'min_samples_leaf': range(0, 5),
    'min_samples_split': range(0, 5)
}

n_folds = 4

# Instantiate the grid search model
dtree = DecisionTreeClassifier(max_depth=10, criterion = "entropy", random_state=100)
grid_search = GridSearchCV(estimator = dtree, param_grid = param_grid,cv = n_folds,
                        verbose = 1)

# Fit the grid search to the data
grid_search.fit(X,Y)

Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got 0

ValueError: min_samples_leaf must be at least 1 or in (0, 0.5], got

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 0

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 0

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 0

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 0

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 0

ValueError: min_sam

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    7.1s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='entropy', max_depth=10,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=100,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'min_samples_leaf': range(0, 5),
                    

In [21]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.016827,0.00094,0.0,0.0,0,0,"{'min_samples_leaf': 0, 'min_samples_split': 0}",,,,,,,25
1,0.014099,0.001533,0.0,0.0,0,1,"{'min_samples_leaf': 0, 'min_samples_split': 1}",,,,,,,24
2,0.012872,0.000327,0.0,0.0,0,2,"{'min_samples_leaf': 0, 'min_samples_split': 2}",,,,,,,21
3,0.015004,0.001843,0.0,0.0,0,3,"{'min_samples_leaf': 0, 'min_samples_split': 3}",,,,,,,20
4,0.013206,0.001325,0.0,0.0,0,4,"{'min_samples_leaf': 0, 'min_samples_split': 4}",,,,,,,19
5,0.013732,0.000813,0.0,0.0,1,0,"{'min_samples_leaf': 1, 'min_samples_split': 0}",,,,,,,16
6,0.0131,0.000746,0.0,0.0,1,1,"{'min_samples_leaf': 1, 'min_samples_split': 1}",,,,,,,15
7,0.134102,0.006835,0.00356,0.000151,1,2,"{'min_samples_leaf': 1, 'min_samples_split': 2}",0.814879,0.809574,0.823607,0.807162,0.813806,0.00631,10
8,0.132417,0.003262,0.003447,8.4e-05,1,3,"{'min_samples_leaf': 1, 'min_samples_split': 3}",0.814216,0.809707,0.82374,0.806499,0.81354,0.006496,12
9,0.130757,0.00185,0.003394,2e-05,1,4,"{'min_samples_leaf': 1, 'min_samples_split': 4}",0.814879,0.80984,0.823607,0.806897,0.813806,0.006338,11


In [22]:
#finding out the accuracy
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)

best accuracy 0.81423656337398
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')


# used decision Tree with gini index

In [24]:
X_train, X_test, Y_train, Y_test=split(X,Y)
model_decision = DecisionTreeClassifier(max_depth=10, criterion = "gini", random_state=100)
model_decision.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=100, splitter='best')

In [25]:
#accuracy
y_pred = model_decision.predict(X_test)
print(metrics.accuracy_score(Y_test,y_pred))

0.8489971821647605


# linear Regression

In [33]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
X,Y= inputOutput(df_3)
X_train, X_test, Y_train, Y_test= split(X,Y)
lm.fit(X_train,Y_train)
Y_pred = lm.predict(X_test)

In [34]:
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(Y_test, Y_pred)
r_squared = r2_score(Y_test, Y_pred)
print(mse, r_squared)

0.1209606554684492 0.35366060820464895


# SVM

In [35]:
from sklearn.svm import SVC
model = SVC(C = 1)

# fit
model.fit(X_train, Y_train)

# predict
Y_pred = model.predict(X_test)

In [36]:
#accuracy
print("accuracy", metrics.accuracy_score(Y_test, Y_pred))

accuracy 0.8286093154317918


# GridSearchCV along with SVM

In [37]:
folds = 4
params = {"C": [0.1, 1, 10, 100, 1000]}
model_cv = GridSearchCV(estimator = model, param_grid = params, 
                        scoring= 'accuracy', 
                        cv = folds, 
                        verbose = 1,
                       return_train_score=True)   
x_sample,y_sample= sampleSplit(df_3)
model_cv.fit(x_sample, y_sample)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   11.1s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=SVC(C=1, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=True, scoring='accuracy',
             verbose=1)

In [38]:
#best accuracy
best_score = model_cv.best_score_
best_C = model_cv.best_params_['C']

print(" The highest test accuracy is {0} at C = {1}".format(best_score, best_C))

 The highest test accuracy is 0.8300000000000001 at C = 1


In [39]:
Y_cv_pred = model_cv.predict(X_test)

In [40]:
#accuracy
print(metrics.accuracy_score(Y_test,Y_cv_pred))

0.8105420188960716
