# Census Prediction

## Importing Libraries

In [336]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from catboost import CatBoostClassifier
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

In [238]:
df = pd.read_csv('Problem statement 1_dataset/adult.csv')

In [239]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [240]:
df.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [241]:
df.shape

(32561, 15)

In [242]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
country           0
salary            0
dtype: int64

In [243]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
country           object
salary            object
dtype: object

In [275]:
df_fin = df.drop(['marital-status', 'relationship', 'capital-loss'], axis = 1)

In [276]:
df_fin.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,occupation,race,sex,capital-gain,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Adm-clerical,White,Male,2174,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Exec-managerial,White,Male,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Handlers-cleaners,White,Male,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Handlers-cleaners,Black,Male,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Prof-specialty,Black,Female,0,40,Cuba,<=50K


In [277]:
col_name = list(df_fin.columns)
col_name

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education-num',
 'occupation',
 'race',
 'sex',
 'capital-gain',
 'hours-per-week',
 'country',
 'salary']

In [278]:
categorical_features = list(df_fin.select_dtypes(include=['object']).columns)
numerical_features = list(df_fin.select_dtypes(include=['int', 'float']).columns)

In [279]:
labelEncode = preprocessing.LabelEncoder()
for i in categorical_features:
    df_fin[i] = labelEncode.fit_transform(df_fin[i])
 
df_fin.head(20)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,occupation,race,sex,capital-gain,hours-per-week,country,salary
0,39,7,77516,9,13,1,4,1,2174,40,39,0
1,50,6,83311,9,13,4,4,1,0,13,39,0
2,38,4,215646,11,9,6,4,1,0,40,39,0
3,53,4,234721,1,7,6,2,1,0,40,39,0
4,28,4,338409,9,13,10,2,0,0,40,5,0
5,37,4,284582,12,14,4,4,0,0,40,39,0
6,49,4,160187,6,5,8,2,0,0,16,23,0
7,52,6,209642,11,9,4,4,1,0,45,39,1
8,31,4,45781,12,14,10,4,0,14084,50,39,1
9,42,4,159449,9,13,4,4,1,5178,40,39,1


In [288]:
# target_col = df_fin["salary"]
X = df_fin.iloc[:,0:11]
y = df_fin.iloc[:,-1]

In [289]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=60)

## Splitting the Data

In [290]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,occupation,race,sex,capital-gain,hours-per-week,country
15492,44,6,38876,11,9,5,4,1,0,40,39
26742,49,2,275074,9,13,10,4,1,0,60,39
6010,35,4,189916,9,13,12,4,0,0,30,39
2320,40,4,234633,15,10,3,4,1,0,40,39
15794,24,4,148320,15,10,1,2,1,0,40,39


In [291]:
features = list(X_train.columns)

In [312]:
cat_features = features

## CatBoost Classification

In [316]:
model_cb = CatBoostClassifier(task_type='GPU', iterations=500, random_state = 2022, eval_metric="F1")

In [317]:
model_cb.fit(X_train, y_train, cat_features= cat_features, plot=True, eval_set=(X_test, y_test))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.072734
0:	learn: 0.5625541	test: 0.5552163	best: 0.5552163 (0)	total: 36.5ms	remaining: 18.2s
1:	learn: 0.5594323	test: 0.5631015	best: 0.5631015 (1)	total: 2.26s	remaining: 9m 22s
2:	learn: 0.5758521	test: 0.5688769	best: 0.5688769 (2)	total: 2.3s	remaining: 6m 20s
3:	learn: 0.5751331	test: 0.5687924	best: 0.5688769 (2)	total: 2.3s	remaining: 6m 20s
4:	learn: 0.5757260	test: 0.5691807	best: 0.5691807 (4)	total: 4.51s	remaining: 9m 18s
5:	learn: 0.5796097	test: 0.5717385	best: 0.5717385 (5)	total: 4.55s	remaining: 7m 29s
6:	learn: 0.5698246	test: 0.5654450	best: 0.5717385 (5)	total: 4.59s	remaining: 6m 17s
7:	learn: 0.5684433	test: 0.5618969	best: 0.5717385 (5)	total: 4.62s	remaining: 5m 24s
8:	learn: 0.5739252	test: 0.5637363	best: 0.5717385 (5)	total: 4.65s	remaining: 4m 45s
9:	learn: 0.5760315	test: 0.5640044	best: 0.5717385 (5)	total: 4.69s	remaining: 4m 15s
10:	learn: 0.5794824	test: 0.5741393	best: 0.5741393 (10)	total: 4.72s	remaining: 3m 50s
11:	learn: 0.

98:	learn: 0.6236255	test: 0.6169561	best: 0.6191697 (84)	total: 13.9s	remaining: 58.8s
99:	learn: 0.6236255	test: 0.6169561	best: 0.6191697 (84)	total: 14s	remaining: 58.2s
100:	learn: 0.6234897	test: 0.6166411	best: 0.6191697 (84)	total: 14s	remaining: 57.5s
101:	learn: 0.6234501	test: 0.6166794	best: 0.6191697 (84)	total: 14s	remaining: 56.9s
102:	learn: 0.6235998	test: 0.6166794	best: 0.6191697 (84)	total: 14s	remaining: 56.3s
103:	learn: 0.6238313	test: 0.6175421	best: 0.6191697 (84)	total: 14.1s	remaining: 55.7s
104:	learn: 0.6237774	test: 0.6174223	best: 0.6191697 (84)	total: 14.1s	remaining: 55.2s
105:	learn: 0.6233428	test: 0.6182837	best: 0.6191697 (84)	total: 14.1s	remaining: 54.6s
106:	learn: 0.6233569	test: 0.6182837	best: 0.6191697 (84)	total: 14.1s	remaining: 54.6s
107:	learn: 0.6244030	test: 0.6179689	best: 0.6191697 (84)	total: 14.2s	remaining: 53.9s
108:	learn: 0.6241181	test: 0.6183575	best: 0.6191697 (84)	total: 14.2s	remaining: 53.4s
109:	learn: 0.6246337	test: 0.6

191:	learn: 0.6387874	test: 0.6261307	best: 0.6274805 (182)	total: 23.1s	remaining: 38.6s
192:	learn: 0.6392262	test: 0.6264758	best: 0.6274805 (182)	total: 23.1s	remaining: 38.6s
193:	learn: 0.6391575	test: 0.6264758	best: 0.6274805 (182)	total: 23.1s	remaining: 38.2s
194:	learn: 0.6386591	test: 0.6264758	best: 0.6274805 (182)	total: 25.3s	remaining: 41.5s
195:	learn: 0.6386591	test: 0.6268207	best: 0.6274805 (182)	total: 25.3s	remaining: 41.2s
196:	learn: 0.6386591	test: 0.6264758	best: 0.6274805 (182)	total: 25.4s	remaining: 40.9s
197:	learn: 0.6388740	test: 0.6266633	best: 0.6274805 (182)	total: 25.4s	remaining: 40.6s
198:	learn: 0.6390202	test: 0.6256591	best: 0.6274805 (182)	total: 25.4s	remaining: 40.3s
199:	learn: 0.6393812	test: 0.6256591	best: 0.6274805 (182)	total: 25.5s	remaining: 40s
200:	learn: 0.6398023	test: 0.6263184	best: 0.6274805 (182)	total: 25.5s	remaining: 39.7s
201:	learn: 0.6400945	test: 0.6263184	best: 0.6274805 (182)	total: 25.5s	remaining: 39.4s
202:	learn: 

288:	learn: 0.6456575	test: 0.6252194	best: 0.6274805 (182)	total: 39s	remaining: 29.9s
289:	learn: 0.6456575	test: 0.6252194	best: 0.6274805 (182)	total: 39s	remaining: 29.7s
290:	learn: 0.6459468	test: 0.6252194	best: 0.6274805 (182)	total: 39s	remaining: 29.4s
291:	learn: 0.6461604	test: 0.6252194	best: 0.6274805 (182)	total: 39.1s	remaining: 29.2s
292:	learn: 0.6462294	test: 0.6248746	best: 0.6274805 (182)	total: 39.1s	remaining: 29s
293:	learn: 0.6464431	test: 0.6249060	best: 0.6274805 (182)	total: 39.1s	remaining: 28.8s
294:	learn: 0.6464431	test: 0.6249060	best: 0.6274805 (182)	total: 39.1s	remaining: 28.6s
295:	learn: 0.6466567	test: 0.6249060	best: 0.6274805 (182)	total: 39.2s	remaining: 28.3s
296:	learn: 0.6465812	test: 0.6252505	best: 0.6274805 (182)	total: 39.2s	remaining: 28.1s
297:	learn: 0.6465812	test: 0.6252505	best: 0.6274805 (182)	total: 39.2s	remaining: 27.9s
298:	learn: 0.6465812	test: 0.6252505	best: 0.6274805 (182)	total: 39.3s	remaining: 27.7s
299:	learn: 0.6465

385:	learn: 0.6508901	test: 0.6235913	best: 0.6274805 (182)	total: 52.7s	remaining: 16.4s
386:	learn: 0.6508207	test: 0.6239359	best: 0.6274805 (182)	total: 52.8s	remaining: 16.2s
387:	learn: 0.6508901	test: 0.6239359	best: 0.6274805 (182)	total: 52.8s	remaining: 16s
388:	learn: 0.6509595	test: 0.6239359	best: 0.6274805 (182)	total: 52.8s	remaining: 15.8s
389:	learn: 0.6505382	test: 0.6234352	best: 0.6274805 (182)	total: 52.9s	remaining: 15.7s
390:	learn: 0.6511083	test: 0.6241241	best: 0.6274805 (182)	total: 52.9s	remaining: 15.5s
391:	learn: 0.6510389	test: 0.6246246	best: 0.6274805 (182)	total: 52.9s	remaining: 15.3s
392:	learn: 0.6518171	test: 0.6246246	best: 0.6274805 (182)	total: 53s	remaining: 15.2s
393:	learn: 0.6519655	test: 0.6237797	best: 0.6274805 (182)	total: 53s	remaining: 15s
394:	learn: 0.6519655	test: 0.6237797	best: 0.6274805 (182)	total: 53s	remaining: 14.8s
395:	learn: 0.6520396	test: 0.6237797	best: 0.6274805 (182)	total: 53s	remaining: 14.6s
396:	learn: 0.6521832	

483:	learn: 0.6558704	test: 0.6250000	best: 0.6274805 (182)	total: 1m	remaining: 2.08s
484:	learn: 0.6562300	test: 0.6250312	best: 0.6274805 (182)	total: 1m	remaining: 1.94s
485:	learn: 0.6560869	test: 0.6250312	best: 0.6274805 (182)	total: 1m	remaining: 1.81s
486:	learn: 0.6560869	test: 0.6250312	best: 0.6274805 (182)	total: 1m	remaining: 1.68s
487:	learn: 0.6560869	test: 0.6250312	best: 0.6274805 (182)	total: 1m	remaining: 1.55s
488:	learn: 0.6558670	test: 0.6239680	best: 0.6274805 (182)	total: 1m	remaining: 1.42s
489:	learn: 0.6558704	test: 0.6240922	best: 0.6274805 (182)	total: 1m	remaining: 1.28s
490:	learn: 0.6567959	test: 0.6245614	best: 0.6274805 (182)	total: 1m	remaining: 1.15s
491:	learn: 0.6568659	test: 0.6243731	best: 0.6274805 (182)	total: 1m	remaining: 1.02s
492:	learn: 0.6567991	test: 0.6244049	best: 0.6274805 (182)	total: 1m	remaining: 895ms
493:	learn: 0.6567991	test: 0.6244049	best: 0.6274805 (182)	total: 1m	remaining: 766ms
494:	learn: 0.6568721	test: 0.6244049	best:

<catboost.core.CatBoostClassifier at 0x7f8ffc8f3df0>

In [318]:
y_pred = model_cb.predict(X_test)

In [319]:
f1_score(y_test, y_pred)

0.6274805325295153

In [320]:
accuracy_score(y_test, y_pred)

0.8481932644078206

## Random Forest Classification

In [325]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.8242399426758112

## Logistic Regression 

In [337]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.7812468011055379