## Census income dataset

In [1]:
# import all the necessary packages

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
from six.moves import urllib
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier

In [53]:
# Import the CSV Data as Pandas DataFrame

In [54]:
data=pd.read_csv(r"Adult.csv")

In [55]:
# Take the sample of 10000 rows from the data

In [56]:
data = data.sample(10000)

In [57]:
# Show Top 5 Records

In [58]:
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
5373,29,Private,148550,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
23738,26,Self-emp-not-inc,221626,HS-grad,9,Married-civ-spouse,Other-service,Wife,White,Female,0,1579,20,United-States,<=50K
11701,67,Self-emp-inc,411007,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,15831,0,40,United-States,>50K
29666,22,Private,279802,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K
5122,34,Private,45522,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [59]:
data.shape  # shape of the dataset

(10000, 15)

In [60]:
data.describe() # Display summary statistics for a dataframe

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,38.507,189448.0,10.0499,1107.7461,86.1277,40.3342
std,13.649102,107434.0,2.577765,7475.268122,400.105668,12.305188
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,27.0,117138.2,9.0,0.0,0.0,40.0
50%,37.0,177318.0,10.0,0.0,0.0,40.0
75%,48.0,237093.8,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [61]:
# rename the columns to remove special characters

In [62]:
data.rename(columns = {'marital-status':'marital_status', 'native-country':'native_country'}, inplace = True)

In [63]:
data['income'].replace(['<=50K', '>50K'],
                        [0, 1], inplace=True)

In [64]:
data["income"] = data["income"].astype("float")

In [79]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native_country',
       'income'],
      dtype='object')

In [65]:
X=data.drop("income",axis=1)

In [66]:
y=data['income']

In [67]:
# split data into train and test and import GridSearchCV

In [68]:
from sklearn.model_selection import train_test_split,GridSearchCV

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, test_size = 0.2) 

In [70]:
# devide the columns into category and numeric types group

In [71]:
category_var = [col for col in X.columns if X[col].dtypes == object]
category_var

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

In [72]:
numeric_var = [col for col in X.columns if X[col].dtypes != object]
numeric_var

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [73]:
# Feature Scaling

In [74]:
one_hot = ce.OneHotEncoder(cols = category_var, handle_unknown = 'ignore')

# Creating dataframe for categorical variables which converted to one hot encoded variables.
X_train_one_hot = pd.DataFrame(one_hot.fit_transform(X_train))
X_test_one_hot = pd.DataFrame(one_hot.transform(X_test))


X_train_one_hot.index = X_train.index
X_test_one_hot.index = X_test.index

num_X_train = X_train[numeric_var]
num_X_test = X_test[numeric_var]

# Joining numerical and one hot encoded variables to create our final X_train and X_test.
X_train_new = pd.concat([num_X_train, X_train_one_hot], axis = 1)
X_test_new = pd.concat([num_X_test, X_test_one_hot], axis = 1)

In [75]:
# Scaling our records into standard range of 0 and 1.
scaler = StandardScaler()

X_train_new = scaler.fit_transform(X_train_new)
X_test_new = scaler.transform(X_test_new)

## Bagging classifier

In [97]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [99]:
model_bagging_svc = BaggingClassifier(base_estimator=SVC(),n_estimators=10, random_state=0).fit(X_train_new, y_train)

In [100]:
y_predict_bagging=model_bagging_svc.predict(X_test_new)

In [101]:
accuracy_score(y_test,y_predict_bagging)

0.845

## extra tree classifier

In [90]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification

In [91]:
model_ExtraTreeClassifier = ExtraTreesClassifier(n_estimators=100, random_state=0)

In [92]:
model_ExtraTreeClassifier.fit(X_train_new,y_train)

ExtraTreesClassifier(random_state=0)

In [93]:
ExtraTreesClassifier(random_state=0)

ExtraTreesClassifier(random_state=0)

In [94]:
y_predict_ETC = model_ExtraTreeClassifier.predict(X_test_new)

In [95]:
accuracy_score(y_test,y_predict_ETC)

0.8375

## voting classifer

In [102]:
estimator = []
estimator.append(('LR', LogisticRegression(solver ='lbfgs', multi_class ='multinomial', max_iter = 200)))
estimator.append(('SVC', SVC(gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier()))

In [105]:
Hard = VotingClassifier(estimators = estimator, voting ='hard')
Hard.fit(X_train_new, y_train)
y_pred = Hard.predict(X_test_new)

In [108]:
Accuracy_vc = accuracy_score(y_test, y_pred)

In [109]:
Accuracy_vc

0.8475

## Random forest classifier

In [86]:
from sklearn.ensemble import RandomForestClassifier
model_RF=RandomForestClassifier()

In [87]:
model_RF.fit(X_train_new,y_train)

RandomForestClassifier()

In [88]:
y_pred_rf=model_RF.predict(X_test_new)

In [89]:
accuracy_score(y_test,y_pred_rf)

0.8535