In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import Libraries**

In [None]:
import scipy as sp
import scipy.stats as stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
# Set color map to have light blue background
sns.set()
import statsmodels.formula.api as smf
import statsmodels.api as sm
%matplotlib inline

**Fetch Dataset**

In [None]:
#!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 
  
# Download the dataset registered under id 20 in the UCI repository.
census_income = fetch_ucirepo(id=20)

# Features and targets are exposed as pandas DataFrames.
X, y = census_income.data.features, census_income.data.targets

In [None]:
# Wrap the features in a DataFrame and discard every row with a missing value.
df = pd.DataFrame(X).dropna()

Target Variable - Income Encoding

In [None]:
income_labels = y
dfy = pd.DataFrame(income_labels, columns=['income'])

# The label is spelled both with and without a trailing period; both
# spellings of each class map to the same binary code.
income_codes = {"<=50K": 0, "<=50K.": 0, ">50K": 1, ">50K.": 1}
dfy['income_encoded'] = dfy['income'].map(income_codes)

print(dfy['income_encoded'].value_counts())

In [None]:
Remove Special Characters '?'

In [None]:
# Drop every row in which any cell is exactly the placeholder string '?'.
# DataFrame.eq compares all cells in one vectorised pass (numeric cells simply
# compare unequal), replacing a per-cell Python lambda through DataFrame.map,
# which is slower and only exists in recent pandas releases.
has_placeholder = df.eq('?').any(axis=1)
df = df[~has_placeholder]

# The original author recorded a shape of (45222, 14) at this point.
df.info()

Align the indices of y with those of X

In [None]:
# Keep, in df's order, only the row labels that also exist in dfy.
indices = [label for label in df.index if label in dfy.index]

# Restrict the targets to those labels; nothing happens when there is no overlap.
if indices:
    dfy = dfy.loc[indices]
    print(dfy.shape)

In [None]:
# Numeric feature columns only (64-bit floats and integers).
numeric_dtypes = ['float64', 'int64']
numerical_df = df.select_dtypes(include=numeric_dtypes)

# Numeric features side by side with the encoded target, for summaries and plots.
df1 = pd.concat((numerical_df, dfy['income_encoded']), axis=1)
df1.describe()

In [None]:
Univariate Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One box plot per column, drawn left to right on a shared figure.
fig, axes = plt.subplots(1, 3)
for panel, column in zip(axes, ("age", "fnlwgt", "education-num")):
    sns.boxplot(y=df[column], ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# One box plot per column, drawn left to right on a shared figure.
fig, axes = plt.subplots(1, 3)
for panel, column in zip(axes, ("capital-gain", "capital-loss", "hours-per-week")):
    sns.boxplot(y=df[column], ax=panel)
plt.tight_layout()
plt.show()

In [None]:
sns.set_theme(style="white")

# Wide canvas so the three histograms sit side by side.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# Histogram with a KDE overlay for each selected column of df1.
for panel, column in zip(axs, ("age", "education-num", "hours-per-week")):
    sns.histplot(data=df1, x=column, kde=True, color="skyblue", ax=panel)

plt.tight_layout()
plt.show()

In [None]:
# Display the column labels of the cleaned feature frame.
df.columns

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Bars are ordered from the least to the most frequent category.
rel_order = df['relationship'].value_counts().sort_values().index
sns.countplot(data=df, x='relationship', order=rel_order, color="skyblue", ax=axes[0])

ie_order = df1['income_encoded'].value_counts().sort_values().index
sns.countplot(data=df1, x='income_encoded', order=ie_order, color="skyblue", ax=axes[1])

# Tilt the tick labels so long category names do not overlap.
for panel in axes.flat:
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of df1.
# numeric_only=True keeps this cell re-runnable after the categorical
# 'age_enc' column is attached to df1 further down; without it a second run
# fails because the bin labels cannot be converted to numbers.
corr_matrix = df1.corr(numeric_only=True)
print(corr_matrix)

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Bucket age into equal-width bins, one bin per label below.
age_labels = ['Very Young', 'Young', 'Middle-aged', 'Senior', 'Elderly']
df1['age_enc'] = pd.cut(df1['age'], bins=len(age_labels), labels=age_labels)

df1['age_enc'].value_counts()

In [None]:
# Contingency table: age bucket (rows) against encoded income (columns).
age_inc = pd.crosstab(index=df1['age_enc'], columns=df1['income_encoded'])
print(age_inc)

Chi-Squared Test for Independence

In [None]:
from scipy.stats import chi2_contingency
# Chi-squared test of independence on the age-bucket / income table.
table = age_inc.to_numpy()
res = chi2_contingency(table)
for reported in (res.statistic, res.pvalue):
    print(reported)

In [None]:
# Chi-squared test of independence between education-num and encoded income.
edn_inc = pd.crosstab(index=df1['education-num'], columns=df1['income_encoded'])
table = edn_inc.to_numpy()
res = chi2_contingency(table)
for reported in (res.statistic, res.pvalue):
    print(reported)

In [None]:
# numerical_df was selected from df, which never holds the encoded target or
# the binned-age column, so a strict in-place drop raised KeyError here.
# errors='ignore' makes the clean-up safe whether or not the columns exist,
# and rebinding avoids mutating a selection in place.
numerical_df = numerical_df.drop(columns=['income_encoded', 'age_enc'], errors='ignore')
numerical_df.columns

Normalizing Numerical Columns

In [None]:
# Normalizing numerical columns
from sklearn.preprocessing import RobustScaler

# Fit the robust scaler on the numeric features, then transform the same frame.
scaler = RobustScaler()
scaler.fit(numerical_df)
rs_sc = scaler.transform(numerical_df)

In [None]:
# The scaler returns a bare array, so the row labels must be re-attached.
# Without them the frame gets a fresh 0..n-1 index, and the label-aligned
# inner concat that follows pairs scaled rows with the wrong categorical rows
# and silently drops the rest. Column names stay positional (0, 1, ...)
# because later cells refer to them by number.
rs_sc_df = pd.DataFrame(rs_sc, index=numerical_df.index)

Encoding Categorical Columns

In [None]:
# Categorical features kept for modelling.
ndf = df.loc[:, ['workclass', 'occupation', 'sex']]

# Column-wise join; only row labels present in both frames survive.
result_df = pd.concat((rs_sc_df, ndf), axis=1, join='inner')


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [None]:
enc = OneHotEncoder(handle_unknown='ignore')

# One-hot encode 'workclass'; the encoder returns a positional sparse matrix.
wc_encoded = enc.fit_transform(result_df[['workclass']])
categories = enc.categories_[0]

# Re-attach result_df's row labels. Without index=..., the new frame gets a
# fresh 0..n-1 index and the label-aligned inner concat below pairs indicator
# rows with the wrong samples and drops rows whose labels fall outside it.
workclass_df = pd.DataFrame(
    wc_encoded.toarray(),
    index=result_df.index,
    columns=[f'workclass_{cat}' for cat in categories]
)

# Swap the raw column for its indicator columns.
result_df = result_df.drop('workclass', axis=1)
result_df = pd.concat([result_df, workclass_df], join='inner', axis=1)

In [None]:
# One-hot encode 'occupation'; the encoder returns a positional sparse matrix.
oc_encoded = enc.fit_transform(result_df[['occupation']])

categories = enc.categories_[0]

# Re-attach result_df's row labels. Without index=..., the new frame gets a
# fresh 0..n-1 index and the label-aligned inner concat below pairs indicator
# rows with the wrong samples and drops rows whose labels fall outside it.
occupation_df = pd.DataFrame(
    oc_encoded.toarray(),
    index=result_df.index,
    columns=[f'occupation_{cat}' for cat in categories]
)

# Swap the raw column for its indicator columns.
result_df = result_df.drop('occupation', axis=1)
result_df = pd.concat([result_df, occupation_df], join='inner', axis=1)

In [None]:
# One-hot encode 'sex'; the encoder returns a positional sparse matrix.
gn_encoded = enc.fit_transform(result_df[['sex']])
categories = enc.categories_[0]

# Re-attach result_df's row labels. Without index=..., the new frame gets a
# fresh 0..n-1 index and the label-aligned inner concat below pairs indicator
# rows with the wrong samples and drops rows whose labels fall outside it.
gn_df = pd.DataFrame(
    gn_encoded.toarray(),
    index=result_df.index,
    columns=[f'gender_{cat}' for cat in categories]
)

# Swap the raw column for its indicator columns.
result_df = result_df.drop('sex', axis=1)
result_df = pd.concat([result_df, gn_df], join='inner', axis=1)

In [None]:
# Remove the positional columns 0-4 that came from the scaled numeric array.
# NOTE(review): any scaled column numbered 5 or higher is kept — confirm that
# dropping exactly these five is intended.
result_df = result_df.drop(columns=[0, 1, 2, 3, 4])

In [None]:
# Keep only the row labels shared by the features and the targets.
idx = result_df.index.intersection(dfy.index)
result_df, dfy = result_df.loc[idx], dfy.loc[idx]

In [None]:
# Make every column label a string (the remaining scaled columns are numbered).
result_df = result_df.rename(columns=str)

In [None]:
# Display the column labels of the target frame.
dfy.columns
# Left disabled by the author: would drop the raw 'income' text column in place.
#dfy.drop(columns=['income'], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split
random_state = 42

# Split 70/30. Only the encoded label is passed as the target: dfy also
# carries the raw 'income' text column, and handing the whole two-column frame
# to the classifier and metrics is not a valid 1-D target.
# The seed now comes from random_state instead of a repeated literal.
X_train, X_test, y_train, y_test = train_test_split(
    result_df, dfy['income_encoded'], test_size=0.3, random_state=random_state
)


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyper-parameters for the first AdaBoost model.
# NOTE(review): learning_rate=5 is well above the library default — confirm it is intended.
ada_params = dict(n_estimators=100, learning_rate=5, random_state=42)
clf = AdaBoostClassifier(**ada_params)
clf.fit(X_train, y_train)

In [None]:
# Predictions on the training split, then on the test split.
y_pred_tr, y_pred_ts = (clf.predict(part) for part in (X_train, X_test))

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy on the training split, then on the test split.
acc_tr, acc_ts = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_pred_tr), (y_test, y_pred_ts))
)
print(acc_tr, acc_ts)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the training split.
confusion_matrix(y_true=y_train, y_pred=y_pred_tr)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred_ts)

In [None]:
def misclassification_error(y_true, y_pred):
    """Return the misclassification rate, i.e. 1 - accuracy_score(y_true, y_pred)."""
    return 1 - accuracy_score(y_true, y_pred)

import matplotlib.pyplot as plt
import pandas as pd

# Error after every boosting stage, on the test and on the training split.
test_errors = [
    misclassification_error(y_test, staged_pred)
    for staged_pred in clf.staged_predict(X_test)
]
train_errors = [
    misclassification_error(y_train, staged_pred)
    for staged_pred in clf.staged_predict(X_train)
]

# Size the x-axis from the stages actually produced: boosting may terminate
# before the requested number of estimators, and a hard-coded count would then
# make the DataFrame columns differ in length.
n_estimators = len(test_errors)

boosting_errors = pd.DataFrame(
    {
        "Number of trees": range(1, n_estimators + 1),
        "Test": test_errors,
        "Train": train_errors,
    }
).set_index("Number of trees")
ax = boosting_errors.plot()
ax.set_ylabel("Misclassification error on test and train set")
ax.set_title("AdaBoost algorithm")

# Final error of the fitted ensemble on each split.
test_error = misclassification_error(y_test, clf.predict(X_test))
train_error = misclassification_error(y_train, clf.predict(X_train))

# Horizontal reference lines at the final errors, spanning the plotted stages.
x_span = [boosting_errors.index.min(), boosting_errors.index.max()]
plt.plot(
    x_span,
    [test_error, test_error],
    color="tab:orange",
    linestyle="dashed",
)
plt.plot(
    x_span,
    [train_error, train_error],
    color="c",
    linestyle="dotted",
)
# A stray character after this call made the original cell a syntax error.
plt.show()

Grid Search and Cross Validation

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier  # Import the base estimator
from sklearn.metrics import accuracy_score

# Candidate values for the two tuned hyper-parameters.
# NOTE(review): every learning rate here is >= 1 — confirm smaller values
# should not be searched as well.
param_grid = dict(
    n_estimators=[25, 50, 75, 100],
    learning_rate=[1, 2, 5, 10, 30],
)

# Exhaustive search with 5-fold cross-validation, scored by accuracy, using
# all available cores.
abc = AdaBoostClassifier(random_state=42)
grid_search = GridSearchCV(abc, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters : {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Score the best estimator found by the search on the held-out test split.
best_adaboost_model = grid_search.best_estimator_
y_pred = best_adaboost_model.predict(X_test)

print("\nTest set evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

In [None]:
def misclassification_error(y_true, y_pred):
    """Return the misclassification rate, i.e. 1 - accuracy_score(y_true, y_pred)."""
    return 1 - accuracy_score(y_true, y_pred)

import matplotlib.pyplot as plt
import pandas as pd

# Error after every boosting stage of the tuned model, on both splits.
test_errors = [
    misclassification_error(y_test, staged_pred)
    for staged_pred in best_adaboost_model.staged_predict(X_test)
]
train_errors = [
    misclassification_error(y_train, staged_pred)
    for staged_pred in best_adaboost_model.staged_predict(X_train)
]

# The grid search may select any of its candidate ensemble sizes, so the
# number of stages is taken from the model's own output. The previous
# hard-coded 25 only worked when the search happened to pick 25 estimators.
n_estimators = len(test_errors)

boosting_errors = pd.DataFrame(
    {
        "Number of trees": range(1, n_estimators + 1),
        "Test": test_errors,
        "Train": train_errors,
    }
).set_index("Number of trees")
ax = boosting_errors.plot()
ax.set_ylabel("Misclassification error plot of best_adaboost_model")
ax.set_title("AdaBoost algorithm")

# Final error of the tuned ensemble on each split.
test_error = misclassification_error(y_test, best_adaboost_model.predict(X_test))
train_error = misclassification_error(y_train, best_adaboost_model.predict(X_train))

# Horizontal reference lines at the final errors, spanning the plotted stages.
x_span = [boosting_errors.index.min(), boosting_errors.index.max()]
plt.plot(
    x_span,
    [test_error, test_error],
    color="tab:orange",
    linestyle="dashed",
)
plt.plot(
    x_span,
    [train_error, train_error],
    color="c",
    linestyle="dotted",
)

plt.show()

In [None]:
# Predictions of the tuned model on the training split, then on the test split.
y_pred_tr, y_pred_ts = (
    best_adaboost_model.predict(part) for part in (X_train, X_test)
)

In [None]:
# Accuracy on the training split, then on the test split.
acc_tr, acc_ts = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_pred_tr), (y_test, y_pred_ts))
)
print(acc_tr, acc_ts)

In [None]:
from sklearn.metrics import classification_report
# classification_report returns one multi-line string; print it so the table
# renders with real line breaks instead of an escaped repr.
print(classification_report(y_test, y_pred_ts))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the training split.
confusion_matrix(y_true=y_train, y_pred=y_pred_tr)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred_ts)

Balanced Data

In [None]:
# Row labels of each class: code 1 is '>50K', code 0 is '<=50K'.
indices_1 = dfy.loc[dfy['income_encoded'].eq(1)].index
indices_0 = dfy.loc[dfy['income_encoded'].eq(0)].index

In [None]:
# Trim class 0 to at most as many rows as class 1. The cap is derived from
# the data rather than the previously hard-coded 9089, which is only correct
# for one particular upstream row count.
# NOTE(review): this keeps the first rows in index order, not a random sample.
indices_0 = indices_0[:len(indices_1)]

In [None]:
# Concatenate the two label sets end to end. The previous side-by-side concat
# of two one-column frames padded the shorter class with NaN whenever the
# class sizes differed, which turned the labels into floats and fed NaN
# lookups to .loc.
balanced_labels = [*indices_1, *indices_0]

# Targets restricted to the balanced subset; rows with a missing value are dropped.
dfb = dfy.loc[balanced_labels]
dfb = dfb.dropna()

In [None]:
# Keep only the row labels shared by the balanced targets and the features.
idx = dfb.index.intersection(result_df.index)
result_df, dfb = result_df.loc[idx], dfb.loc[idx]

In [None]:
from sklearn.model_selection import train_test_split
random_state = 42

# Split 70/30. Only the encoded label is passed as the target: dfb also
# carries the raw 'income' text column, and handing the whole two-column frame
# to the classifier and metrics is not a valid 1-D target.
# The seed now comes from random_state instead of a repeated literal.
X_train, X_test, y_train, y_test = train_test_split(
    result_df, dfb['income_encoded'], test_size=0.3, random_state=random_state
)


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyper-parameters for the AdaBoost model trained on the balanced subset.
# NOTE(review): learning_rate=10 is well above the library default — confirm it is intended.
ada_params = dict(n_estimators=50, learning_rate=10, random_state=42)
clf = AdaBoostClassifier(**ada_params)
clf.fit(X_train, y_train)

In [None]:
# Predictions on the training split, then on the test split.
y_pred_tr, y_pred_ts = (clf.predict(part) for part in (X_train, X_test))

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy on the training split, then on the test split.
acc_tr, acc_ts = (
    accuracy_score(truth, guess)
    for truth, guess in ((y_train, y_pred_tr), (y_test, y_pred_ts))
)
print(acc_tr, acc_ts)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the training split.
confusion_matrix(y_true=y_train, y_pred=y_pred_tr)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred_ts)

In [None]:
from sklearn.metrics import classification_report
# classification_report returns one multi-line string; print it so the table
# renders with real line breaks instead of an escaped repr.
print(classification_report(y_test, y_pred_ts))

In [None]:
from sklearn.metrics import classification_report
# classification_report returns one multi-line string; print it so the table
# renders with real line breaks instead of an escaped repr.
# NOTE(review): this cell repeats the report of the cell before it.
print(classification_report(y_test, y_pred_ts))

In [None]:
def misclassification_error(y_true, y_pred):
    """Return the misclassification rate, i.e. 1 - accuracy_score(y_true, y_pred)."""
    return 1 - accuracy_score(y_true, y_pred)

import matplotlib.pyplot as plt
import pandas as pd

# Error after every boosting stage, on the test and on the training split.
test_errors = [
    misclassification_error(y_test, staged_pred)
    for staged_pred in clf.staged_predict(X_test)
]
train_errors = [
    misclassification_error(y_train, staged_pred)
    for staged_pred in clf.staged_predict(X_train)
]

# Size the x-axis from the stages actually produced: boosting may terminate
# before the requested number of estimators, and a hard-coded count would then
# make the DataFrame columns differ in length.
n_estimators = len(test_errors)

boosting_errors = pd.DataFrame(
    {
        "Number of trees": range(1, n_estimators + 1),
        "Test": test_errors,
        "Train": train_errors,
    }
).set_index("Number of trees")
ax = boosting_errors.plot()
ax.set_ylabel("Misclassification error on test and train set")
ax.set_title("AdaBoost algorithm")

# Final error of the fitted ensemble on each split.
test_error = misclassification_error(y_test, clf.predict(X_test))
train_error = misclassification_error(y_train, clf.predict(X_train))

# Horizontal reference lines at the final errors, spanning the plotted stages.
x_span = [boosting_errors.index.min(), boosting_errors.index.max()]
plt.plot(
    x_span,
    [test_error, test_error],
    color="tab:orange",
    linestyle="dashed",
)
plt.plot(
    x_span,
    [train_error, train_error],
    color="c",
    linestyle="dotted",
)
plt.show()