# Data Gathering

In [1]:
from urllib.request import urlopen
from io import BytesIO

In [2]:
def parse_binary_data_uci_repo(URL):
  with urlopen(URL) as fptr:
    return BytesIO(fptr.read())

In [3]:
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
data = parse_binary_data_uci_repo(DATA_URL)

TEST_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
test_data = parse_binary_data_uci_repo(TEST_DATA_URL)

HEADER_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
data_info = parse_binary_data_uci_repo(HEADER_URL)
header_info = data_info.readlines()[-1:-15:-1]

In [4]:
columns = []
for header in header_info[::-1]:
  columns.append(header.decode().split(':')[0])
columns.append("Income")

# Data Ingestion

In [5]:
import pandas as pd
import numpy as np

In [6]:
data.seek(0)
df = pd.read_csv(data, skip_blank_lines=True, header=None, names=columns, sep=' *, *', na_values='?')

  return func(*args, **kwargs)


In [7]:
test_data.seek(0)
test_df = pd.read_csv(test_data, skip_blank_lines=True, header=None, skiprows=1, names=columns, sep=' *, *', na_values='?')

In [8]:
df = df.sample(frac=0.3)
test_df = test_df.sample(frac=0.2)

# EDA

In [9]:
print("Shape:\n1.", df.shape, "\n2.", test_df.shape)

Shape:
1. (9768, 15) 
2. (3256, 15)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9768 entries, 22039 to 13786
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             9768 non-null   int64 
 1   workclass       9205 non-null   object
 2   fnlwgt          9768 non-null   int64 
 3   education       9768 non-null   object
 4   education-num   9768 non-null   int64 
 5   marital-status  9768 non-null   object
 6   occupation      9203 non-null   object
 7   relationship    9768 non-null   object
 8   race            9768 non-null   object
 9   sex             9768 non-null   object
 10  capital-gain    9768 non-null   int64 
 11  capital-loss    9768 non-null   int64 
 12  hours-per-week  9768 non-null   int64 
 13  native-country  9574 non-null   object
 14  Income          9768 non-null   object
dtypes: int64(6), object(9)
memory usage: 1.2+ MB


In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3256 entries, 9719 to 4241
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             3256 non-null   int64 
 1   workclass       3045 non-null   object
 2   fnlwgt          3256 non-null   int64 
 3   education       3256 non-null   object
 4   education-num   3256 non-null   int64 
 5   marital-status  3256 non-null   object
 6   occupation      3045 non-null   object
 7   relationship    3256 non-null   object
 8   race            3256 non-null   object
 9   sex             3256 non-null   object
 10  capital-gain    3256 non-null   int64 
 11  capital-loss    3256 non-null   int64 
 12  hours-per-week  3256 non-null   int64 
 13  native-country  3202 non-null   object
 14  Income          3256 non-null   object
dtypes: int64(6), object(9)
memory usage: 407.0+ KB


In [12]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,9768.0,38.448301,13.553152,17.0,28.0,37.0,47.0,90.0
fnlwgt,9768.0,189764.671478,106194.057079,14878.0,117833.0,178564.0,238783.75,1484705.0
education-num,9768.0,10.083129,2.563866,1.0,9.0,10.0,12.0,16.0
capital-gain,9768.0,1027.671683,6999.231327,0.0,0.0,0.0,0.0,99999.0
capital-loss,9768.0,83.349304,394.060542,0.0,0.0,0.0,0.0,4356.0
hours-per-week,9768.0,40.585176,12.608498,1.0,40.0,40.0,45.0,99.0


In [13]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'Income'],
      dtype='object')

# Data Cleaning

In [14]:
print("Null values")
print("Train dataset:", df.isna().sum().sum())
print("Test dataset:", test_df.isna().sum().sum())

Null values
Train dataset: 1322
Test dataset: 476


In [15]:
df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [16]:
print("Null values")
print("Train dataset:", df.isna().sum().sum())
print("Test dataset:", test_df.isna().sum().sum())

Null values
Train dataset: 0
Test dataset: 0


In [17]:
print("Duplicate values")
print("Duplicate training:", df.duplicated().sum())
print("Duplicate testing:", test_df.duplicated().sum())

Duplicate values
Duplicate training: 1
Duplicate testing: 0


In [18]:
df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)

In [19]:
print("Duplicate values")
print("Duplicate training:", df.duplicated().sum())
print("Duplicate testing:", test_df.duplicated().sum())

Duplicate values
Duplicate training: 0
Duplicate testing: 0


In [20]:
print("Irrelavent and duplicate deletion")
df.drop(labels=['education', 'fnlwgt'], axis=1, inplace=True)
test_df.drop(labels=['education', 'fnlwgt'], axis=1, inplace=True)

Irrelavent and duplicate deletion


In [21]:
df.shape

(9016, 13)

In [22]:
test_df.shape

(2994, 13)

In [23]:
def unique_values_count(frame: pd.DataFrame) -> pd.DataFrame:
  columnsToEncode = list(frame.select_dtypes(include=['object']))
  for feature in columnsToEncode:
    print(feature, "\n")
    print(frame[[feature]].value_counts())

In [24]:
unique_values_count(df)

workclass 

workclass       
Private             6618
Self-emp-not-inc     756
Local-gov            640
State-gov            378
Self-emp-inc         343
Federal-gov          277
Without-pay            4
dtype: int64
marital-status 

marital-status       
Married-civ-spouse       4216
Never-married            2946
Divorced                 1231
Separated                 287
Widowed                   211
Married-spouse-absent     120
Married-AF-spouse           5
dtype: int64
occupation 

occupation       
Exec-managerial      1222
Prof-specialty       1190
Craft-repair         1181
Adm-clerical         1111
Sales                1072
Other-service         949
Machine-op-inspct     583
Transport-moving      476
Handlers-cleaners     419
Farming-fishing       306
Tech-support          274
Protective-serv       193
Priv-house-serv        36
Armed-Forces            4
dtype: int64
relationship 

relationship  
Husband           3757
Not-in-family     2333
Own-child         1340
Unmarried     

In [25]:
df['Income'] = df['Income'].apply(lambda x: 0 if x == "<=50K" else 1)
test_df['Income'] = test_df['Income'].apply(lambda x: 0 if x == "<=50K." else 1)

In [26]:
df[['Income']].value_counts()

Income
0         6790
1         2226
dtype: int64

In [27]:
test_df[['Income']].value_counts()

Income
0         2226
1          768
dtype: int64

# Imputation

In [28]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [29]:
object_columns = df.select_dtypes(include=['object']).columns
object_columns = [c for c in object_columns if c != 'Income']

In [30]:
object_columns

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [31]:
X_train = df.iloc[:, :-1]
X_test = test_df.iloc[:, :-1]

In [32]:
y_train = df.iloc[:, -1]
y_test = test_df.iloc[:, -1]

In [33]:
print("Train: ", X_train.shape, y_train.shape)
print("Test: ", X_test.shape, y_test.shape)

Train:  (9016, 12) (9016,)
Test:  (2994, 12) (2994,)


In [34]:
column_transform = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore", dtype=np.int64), object_columns),
    remainder='passthrough'
)

# Bagging Classifier

In [35]:
from sklearn.ensemble import BaggingClassifier

In [36]:
bg_c = BaggingClassifier()

In [37]:
pl_bg_c = make_pipeline(column_transform, bg_c)

In [38]:
pl_bg_c.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(dtype=<class 'numpy.int64'>,
                                                                handle_unknown='ignore'),
                                                  ['workclass',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country'])])),
                ('baggingclassifier', BaggingClassifier())])

In [39]:
pl_bg_c.score(X_train, y_train)

0.9739352262644189

In [40]:
y_predict_bg_c = pl_bg_c.predict(X_test)

In [41]:
accuracy_score(y_test, y_predict_bg_c)

0.8390113560454242

# Extra Trees Classifier

In [42]:
from sklearn.ensemble import ExtraTreesClassifier

In [43]:
et_c = ExtraTreesClassifier()

In [44]:
et_bg_c = make_pipeline(column_transform, et_c)

In [45]:
et_bg_c.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(dtype=<class 'numpy.int64'>,
                                                                handle_unknown='ignore'),
                                                  ['workclass',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country'])])),
                ('extratreesclassifier', ExtraTreesClassifier())])

In [46]:
et_bg_c.score(X_train, y_train)

0.9856921029281278

In [47]:
y_predict_et_c = et_bg_c.predict(X_test)

In [48]:
accuracy_score(y_test, y_predict_et_c)

0.8303273213092852

# Gradient Boosting Classifier

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

In [50]:
gb_c = GradientBoostingClassifier()

In [51]:
p_gb_c = make_pipeline(column_transform, gb_c)

In [52]:
p_gb_c.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(dtype=<class 'numpy.int64'>,
                                                                handle_unknown='ignore'),
                                                  ['workclass',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country'])])),
                ('gradientboostingclassifier', GradientBoostingClassifier())])

In [53]:
p_gb_c.score(X_train, y_train)

0.8661268855368234

In [54]:
y_predict_gb_c = p_gb_c.predict(X_test)

In [55]:
accuracy_score(y_test, y_predict_gb_c)

0.8557114228456913

# Random Forest Classifier

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
rf_c = RandomForestClassifier()

In [58]:
p_rf_c = make_pipeline(column_transform, rf_c)

In [59]:
p_rf_c.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(dtype=<class 'numpy.int64'>,
                                                                handle_unknown='ignore'),
                                                  ['workclass',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country'])])),
                ('randomforestclassifier', RandomForestClassifier())])

In [60]:
p_rf_c.score(X_train, y_train)

0.985581188997338

In [61]:
y_predict_rf_c = p_rf_c.predict(X_test)

In [62]:
accuracy_score(y_test, y_predict_gb_c)

0.8557114228456913

# Stacking Classifier

In [63]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [64]:
estimators = [
    ('rf', make_pipeline(column_transform, RandomForestClassifier(n_estimators=10))),
    ('svc', make_pipeline(column_transform, SVC()))
]

In [65]:
st_c = StackingClassifier(
    estimators = estimators,
    final_estimator=LogisticRegression()
  )

In [66]:
st_c.fit(X_train, y_train)

StackingClassifier(estimators=[('rf',
                                Pipeline(steps=[('columntransformer',
                                                 ColumnTransformer(remainder='passthrough',
                                                                   transformers=[('onehotencoder',
                                                                                  OneHotEncoder(dtype=<class 'numpy.int64'>,
                                                                                                handle_unknown='ignore'),
                                                                                  ['workclass',
                                                                                   'marital-status',
                                                                                   'occupation',
                                                                                   'relationship',
                                                             

In [67]:
st_c.score(X_train, y_train)

0.9615128660159716

In [68]:
st_c.classes_

array([0, 1])

In [69]:
st_c.decision_function(X_test)

array([-2.63046819, -2.63051471, -2.63050362, ..., -2.63049999,
       -0.92838937,  1.1992347 ])

In [70]:
y_predict_st_c = st_c.predict(X_test)

In [71]:
accuracy_score(y_test, y_predict_st_c)

0.8323313293253173

# Voting Classifier

In [72]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [73]:
voting_estimators = [
    ('lr', make_pipeline(column_transform, LogisticRegression(random_state=1))),
    ('rfc', make_pipeline(column_transform, RandomForestClassifier(n_estimators=10, random_state=1))),
    ('svc', make_pipeline(column_transform, SVC()))
]

In [74]:
v_c = VotingClassifier(estimators=voting_estimators, voting='hard')

In [75]:
v_c.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('lr',
                              Pipeline(steps=[('columntransformer',
                                               ColumnTransformer(remainder='passthrough',
                                                                 transformers=[('onehotencoder',
                                                                                OneHotEncoder(dtype=<class 'numpy.int64'>,
                                                                                              handle_unknown='ignore'),
                                                                                ['workclass',
                                                                                 'marital-status',
                                                                                 'occupation',
                                                                                 'relationship',
                                                                                 

In [76]:
v_c.score(X_train, y_train)

0.876109139307897

In [77]:
y_predict_v_c = v_c.predict(X_test)

In [78]:
accuracy_score(y_test, y_predict_v_c)

0.8229792919171677

**==========THE END==========**