# Comparitive Study of an Adult Dataset

In [1]:
import pandas as pd

## Data Preprocessing

### Reading the dataset

In [2]:
df = pd.read_csv('adult.data',header=None,names=["Age", "Workclass", "fnlwgt", "Education", "Education num", "Marital Status", "Occupation","Relationship", "Race", "Sex", "Capital gain","Capital loss", "Hours per week", "Native country","Income level"])

In [3]:
df

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education num,Marital Status,Occupation,Relationship,Race,Sex,Capital gain,Capital loss,Hours per week,Native country,Income level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


### Label encoding the output values

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
label_encoder_y = LabelEncoder()

In [6]:
label = label_encoder_y.fit_transform(df[df.columns[-1]])

In [7]:
label

array([0, 0, 0, ..., 0, 0, 1])

### Identifying the columns with missing values in the dataset

In [8]:
miss = []

In [9]:
for pos in df.columns:
    for row in range(0,df.shape[0]):
        if(df.at[row,pos]==' ?' and pos not in miss):
            miss.append(pos)

In [10]:
miss

['Workclass', 'Occupation', 'Native country']

### Replace all missing values with the mode of the column it belongs to

In [11]:
for pos in miss:
    df[pos] = df[pos].replace(' ?', str(df[pos].mode()))

In [12]:
df

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education num,Marital Status,Occupation,Relationship,Race,Sex,Capital gain,Capital loss,Hours per week,Native country,Income level
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


### Importing all the algorithms and libraries

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
from sklearn.naive_bayes import CategoricalNB

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
from sklearn.pipeline import Pipeline

### Label Encoding all the qualitative data

In [20]:
y_val = df.columns[-1]

In [21]:
nqual = []

In [22]:
for pos in df.columns:
    if(df[pos].dtypes!='int64' and pos!=y_val):
        nqual.append(pos)

In [23]:
label_object = {}

In [24]:
for pos in nqual:
    labelencoder = LabelEncoder()
    labelencoder.fit(df[pos])
    df[pos+' Encode'] = labelencoder.fit_transform(df[pos])
    label_object[pos] = labelencoder

In [25]:
nqual.append(y_val)

In [26]:
df = df.drop(columns=nqual)

In [27]:
df

Unnamed: 0,Age,fnlwgt,Education num,Capital gain,Capital loss,Hours per week,Workclass Encode,Education Encode,Marital Status Encode,Occupation Encode,Relationship Encode,Race Encode,Sex Encode,Native country Encode
0,39,77516,13,2174,0,40,6,9,4,0,1,4,1,38
1,50,83311,13,0,0,13,5,9,2,3,0,4,1,38
2,38,215646,9,0,0,40,3,11,0,5,1,4,1,38
3,53,234721,7,0,0,40,3,1,2,5,0,2,1,38
4,28,338409,13,0,0,40,3,9,2,9,5,2,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,3,7,2,12,5,4,0,38
32557,40,154374,9,0,0,40,3,11,2,6,0,4,1,38
32558,58,151910,9,0,0,40,3,11,6,0,4,4,0,38
32559,22,201490,9,0,0,20,3,11,4,0,3,4,1,38


## Predicitng the best models

In [28]:
pipeline=Pipeline([('scale', StandardScaler()),
                   ('classifier',DecisionTreeClassifier())])

In [29]:
param_neighbors = []
for pos in range(29,51):
    param_neighbors.append(pos)

In [30]:
search_space = [{'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__min_samples_split': [2, 3, 4, 5],
                 'classifier__criterion': ['entropy'],
                 'scale': [None]},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__min_samples_split': [2, 3, 4 ,5],
                 'classifier__criterion': ['gini'],
                 'scale': [None]},
                {'classifier': [DecisionTreeClassifier()],
                 'classifier__max_depth': [10, 20, 40],
                 'classifier__min_samples_split': [2, 3, 4, 5],
                 'classifier__criterion': ['gini'],
                 'scale':[None]},
                {'classifier': [DecisionTreeClassifier()],
                 'classifier__max_depth': [10, 20, 40],
                 'classifier__min_samples_split': [2, 3, 4, 5],
                 'classifier__criterion': ['entropy'],
                 'scale':[None]},
                {'classifier': [CategoricalNB()],
                 'classifier__alpha': [0, 1, 2],
                 'scale':[None]},
                {'classifier': [KNeighborsClassifier()],
                 'classifier__n_neighbors': [param_neighbors],
                 'classifier__metric':['euclidean']},
                {'classifier': [KNeighborsClassifier()],
                 'classifier__n_neighbors': [param_neighbors],
                 'classifier__metric':['minkowski']},
                {'classifier': [KNeighborsClassifier()],
                 'classifier__n_neighbors': [param_neighbors],
                 'classifier__metric':['manhattan']}]

In [31]:
models = GridSearchCV(pipeline, search_space)

In [32]:
models.fit(df,label)

Traceback (most recent call last):
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 397, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/metaestimators.py", line 120, in <lambda>
    out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 622, in score
    return self.steps[-1][-1].score(Xt, y, **score_params)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 500, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklea

Traceback (most recent call last):
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 179, in fit
    return self._fit(X, y)
  File "/Users/vanshgadhia/opt/anaconda3/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 487, in _fit
    self.n_neighbors >= self._fit_X.shape[0] // 2)):
TypeError: '>=' not supported between instances of 'list' and 'int'

 0.85283009 0.85838895 0.85927958 0.85224651 0.85974026 0.86072297
 0.84874545 0.85544063 0.85759046 0.84972832 0.857959   0.85952525
 0.84988185 0.86004733 0.85958662 0.85175517 0.86013953 0.86072303
 0.85154027 0.851

GridSearchCV(estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('classifier',
                                        DecisionTreeClassifier())]),
             param_grid=[{'classifier': [RandomForestClassifier()],
                          'classifier__criterion': ['entropy'],
                          'classifier__min_samples_split': [2, 3, 4, 5],
                          'classifier__n_estimators': [10, 100, 1000],
                          'scale': [None]},
                         {'classifier': [RandomForestClassifier(min_samples_spli...
                                                       47, 48, 49, 50]]},
                         {'classifier': [KNeighborsClassifier()],
                          'classifier__metric': ['minkowski'],
                          'classifier__n_neighbors': [[29, 30, 31, 32, 33, 34,
                                                       35, 36, 37, 38, 39, 40,
                                            

In [33]:
df_grid = pd.DataFrame(data=models.cv_results_)

In [35]:
df_grid[df_grid['rank_test_score']<=5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__criterion,param_classifier__min_samples_split,param_classifier__n_estimators,param_scale,param_classifier__max_depth,...,param_classifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,1.884926,0.06116,0.090534,0.003865,RandomForestClassifier(),entropy,5,100,,,...,,"{'classifier': RandomForestClassifier(), 'clas...",0.857055,0.854269,0.860104,0.866093,0.861179,0.85974,0.003993,5
11,19.629254,0.573271,1.032455,0.102821,RandomForestClassifier(),entropy,5,1000,,,...,,"{'classifier': RandomForestClassifier(), 'clas...",0.859972,0.855958,0.859337,0.8664,0.861947,0.860723,0.003433,2
19,1.69358,0.016747,0.090105,0.002035,"RandomForestClassifier(min_samples_split=5, n_...",gini,4,100,,,...,,{'classifier': RandomForestClassifier(min_samp...,0.859051,0.853655,0.860412,0.86594,0.861179,0.860047,0.003949,4
22,1.645998,0.018063,0.08745,0.000239,"RandomForestClassifier(min_samples_split=5, n_...",gini,5,100,,,...,,{'classifier': RandomForestClassifier(min_samp...,0.856902,0.858722,0.858569,0.864711,0.861794,0.86014,0.002778,3
23,16.459997,0.174181,0.895809,0.048782,"RandomForestClassifier(min_samples_split=5, n_...",gini,5,1000,,,...,,{'classifier': RandomForestClassifier(min_samp...,0.858283,0.856112,0.858262,0.867015,0.863943,0.860723,0.00408,1
