In [None]:
import pandas as pd
import numpy as np

# 1. Importing the dataset and checking for null values

In [None]:
# Read the census-income CSV from the Kaggle input directory into `df`.
CENSUS_CSV_PATH = "/kaggle/input/census-income-data/CENSUS_INCOME.csv"
df = pd.read_csv(CENSUS_CSV_PATH)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.isnull().sum()

# 2. Checking for the number of unique labels in the dataset.

In [None]:
# Print, for every column, how many distinct values it holds
# (a missing value counts as one distinct value).
for column_name in df.columns:
    distinct_count = df[column_name].nunique(dropna=False)
    print(column_name, ":", distinct_count, "Labels")

# 3. Working on Dataset

In [None]:
df.head()

In [None]:
df['OCCUPATION'].value_counts()

## 3.1 Removing the '?' value

In [None]:
# Turn every cell equal to " ?" (note the leading space) into a proper
# missing value so that isnull() / dropna() can see it.
df = df.replace(to_replace=" ?", value=pd.NA)

In [None]:
df.head()

## 3.2 Dropping all rows that contain null values

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
df.shape

## 3.3 Feature Engineering

In [None]:
df.head()

In [None]:
df['INCOME'].value_counts()

In [None]:
df.dtypes

#### Here we can apply categorical encoding to the "INCOME" column

In [None]:
# Mapping from the raw INCOME labels (each starts with a space) to a binary target:
# 0 for " <=50K", 1 for " >50K".
# NOTE(review): the name `dict` shadows Python's built-in `dict` for the rest of
# the notebook. It is left unchanged here because the next cell reads this name.
dict = {
    ' <=50K': 0,
    ' >50K': 1
}

In [None]:
# Replace the INCOME labels by their numeric codes; any label that is not a key
# of the mapping becomes NaN.
df = df.assign(INCOME=df['INCOME'].map(dict))

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#### We can apply label encoding to all the categorical features

In [None]:
# Label-encode every categorical feature into a new "<column>_encode" column.
#
# * One encoder is fitted per column and kept in `label_encoders`, so the
#   integer codes can be translated back with `inverse_transform` later
#   (a single shared encoder only remembers the last column it was fitted on).
# * The codes stay integers. Casting them to `object` makes XGBoost reject the
#   frame and forces a cast back to int before modelling.
categorical_columns = [
    'RACE',
    'WORKCLASS',
    'EDUCATION',
    'MARITAL-STATUS',
    'OCCUPATION',
    'RELATIONSHIP',
    'SEX',
    'NATIVE COUNTRY',
]

label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[f'{column}_encode'] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

# `le` used to end up fitted on the last encoded column; keep that state for
# any later cell that still refers to it.
le = label_encoders['NATIVE COUNTRY']


In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Modelling frame: the numeric features, the label-encoded categoricals and the
# INCOME target (the original string-valued categorical columns are left out).
model_columns = [
    'AGE', 'FNLWGT', 'EDUCATION-NUM',
    'CAPITAL-GAIN', 'CAPITAL-LOSS', 'HOURS-PER-WEEK',
    'RACE_encode', 'WORKCLASS_encode', 'EDUCATION_encode',
    'MARITAL-STATUS_encode', 'OCCUPATION_encode', 'RELATIONSHIP_encode',
    'SEX_encode', 'NATIVE COUNTRY_encode',
    'INCOME',
]

# .copy() makes df2 an independent frame, so the column assignment done in the
# scaling step writes to df2 itself and does not raise SettingWithCopyWarning.
df2 = df[model_columns].copy()

In [None]:
df2.head()

In [None]:
df.columns

In [None]:
# Narrow df back down to the original columns (i.e. without the "*_encode"
# helper columns) and preview the result.
original_columns = [
    'AGE', 'WORKCLASS', 'FNLWGT', 'EDUCATION', 'EDUCATION-NUM',
    'MARITAL-STATUS', 'OCCUPATION', 'RELATIONSHIP', 'RACE', 'SEX',
    'CAPITAL-GAIN', 'CAPITAL-LOSS', 'HOURS-PER-WEEK', 'NATIVE COUNTRY',
    'INCOME',
]
df = df.loc[:, original_columns]
df.head()

## 3.4 EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram of the AGE column using 100 bins.
fig, ax = plt.subplots()
ax.hist(df['AGE'], bins=100)
ax.set_xlabel("Age")
plt.show()

In [None]:
# Number of rows for every (AGE, OCCUPATION) pair, as a long table ...
pair_sizes = df.groupby(['AGE', 'OCCUPATION']).size()
df_count = pair_sizes.reset_index(name='COUNT')

# ... then reshaped to one row per age and one column per occupation.
df_pivot = df_count.pivot(index='AGE', columns='OCCUPATION', values='COUNT')

In [None]:
# Stacked bar chart: one bar per age, one coloured segment per occupation.
ax = df_pivot.plot(kind='bar', stacked=True, figsize=(13, 9))
ax.set(xlabel='Age', ylabel='Count', title="Bar plot of Age and Occupation")
plt.show()

In [None]:
# Two stacked panels of EDUCATION counts: split by SEX on top, by OCCUPATION below.
fig, (ax_by_sex, ax_by_occupation) = plt.subplots(2, 1, figsize=(20, 9))

sns.countplot(x='EDUCATION', data=df, palette='Set3', hue='SEX', ax=ax_by_sex)
ax_by_sex.set_title("Count of male and female educated")

sns.countplot(x='EDUCATION', data=df, palette='Set2', hue='OCCUPATION', ax=ax_by_occupation)
plt.show()

In [None]:
# Row count per marital status, drawn as a pie chart.
df_mat_stats = (
    df.groupby('MARITAL-STATUS')
      .size()
      .reset_index(name='COUNT')
)
plt.pie(df_mat_stats['COUNT'], labels=df_mat_stats['MARITAL-STATUS'])
plt.title("Pie chart of marital status in dataset")
plt.show()

In [None]:
# Count of rows per WORKCLASS, with separate bars for each SEX.
fig, ax = plt.subplots(figsize=(13, 5))
sns.countplot(data=df, x='WORKCLASS', hue='SEX', palette='Set2', ax=ax)
ax.set_title("Count of working male and female")
plt.show()

In [None]:
# Histogram of weekly working hours in 20 bins.
# NOTE(review): the title calls this a "Normal Distribution", but the cell only
# draws the empirical histogram - confirm the wording is intended.
fig, ax = plt.subplots()
ax.hist(df['HOURS-PER-WEEK'], bins=20)
ax.set_xlabel("Hours per week")
ax.set_ylabel("Count")
ax.set_title("Normal Distribution of working hours")
plt.show()

In [None]:
df.head()

In [None]:
# Line plot of the mean HOURS-PER-WEEK for each occupation.
mean_hours_by_occupation = df.groupby(['OCCUPATION'])['HOURS-PER-WEEK'].mean()

plt.figure(figsize=(20, 7))
plt.plot(mean_hours_by_occupation)
plt.xticks(fontsize=8)  # small tick labels: the x axis carries one label per occupation
plt.show()

## 4. Machine Learning

#### Here we are going to predict whether a person will have an income <=50K (0) or >50K (1)

In [None]:
df2.head()

### 4.1 Scaling the numerical features with a robust scaler, as there are a few outliers

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
scaler = RobustScaler()

In [None]:
# Fit the scaler on the numeric columns of df2 and overwrite them with the
# scaled values.
# NOTE(review): the scaler is fitted on all rows, before any train/test split.
numeric_columns = [
    'AGE', 'FNLWGT', 'EDUCATION-NUM',
    'CAPITAL-GAIN', 'CAPITAL-LOSS', 'HOURS-PER-WEEK',
]
scaled_values = scaler.fit_transform(df2[numeric_columns])
df2[numeric_columns] = scaled_values

In [None]:
df2.head()

### 4.2 Now let's begin finding the best hyperparameters for our models on this dataset

### 4.2.1 First, let's import the classification algorithms that are required

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier, DMatrix
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### 4.2.2 Create the dependent and independent variables, then split them into train and test sets

In [None]:
#We will use our df2 dataset which we have prepared for applying ML
df2.head()

In [None]:
# Split the modelling frame into features and target.
target_column = 'INCOME'
X = df2.drop(columns=[target_column])  # Independent variables
y = df2[target_column]  # Dependent variable

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 4.2.3 Now we will define the Optuna search space (objective function) for each of our ML models

In [None]:
def objective_rf(trials):
    """Optuna objective for the random forest.

    Draws one set of hyperparameters from `trials`, fits a
    RandomForestClassifier on the module-level X_train / y_train and returns
    its accuracy on the module-level X_test / y_test.

    NOTE(review): candidates are ranked on the test split itself, so that split
    no longer gives an unbiased estimate for the selected model.

    Args:
        trials: the Optuna trial object used to sample hyperparameters.

    Returns:
        Accuracy of the fitted forest on X_test / y_test.
    """
    # The suggestions are drawn in a fixed order, one per hyperparameter.
    n_estimators = trials.suggest_int('n_estimators', 50, 1000)
    criterion = trials.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])
    max_depth = trials.suggest_int('max_depth', 5, 10)
    min_samples_split = trials.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trials.suggest_int('min_samples_leaf', 2, 10)
    max_features = trials.suggest_categorical('max_features', ['sqrt', 'log2', None])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
    )
    forest.fit(X_train, y_train)
    return accuracy_score(y_test, forest.predict(X_test))

# Run 100 Optuna trials that maximise the accuracy returned by objective_rf,
# then keep the hyperparameters of the best trial.
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=100)
best_params_rf = study_rf.best_trial.params
print("Best Hyperparameter:", best_params_rf)

In [None]:
def objective_gf(trial):
    """Optuna objective for gradient boosting.

    Draws one set of hyperparameters from `trial`, fits a
    GradientBoostingClassifier on the module-level X_train / y_train and
    returns its accuracy on the module-level X_test / y_test.

    NOTE(review): candidates are ranked on the test split itself, so that split
    no longer gives an unbiased estimate for the selected model.

    Args:
        trial: the Optuna trial object used to sample hyperparameters.

    Returns:
        Accuracy of the fitted model on X_test / y_test.
    """
    params = {
        # 'log_loss' is the current name of the former 'deviance' loss.
        'loss': trial.suggest_categorical('loss', ['log_loss', 'exponential']),
        # A learning rate of exactly 0 would never update the model.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        # Given as floats these two are fractions of the training rows; 0 is
        # not a valid fraction, so the lower bound is strictly positive.
        'min_samples_split': trial.suggest_float('min_samples_split', 0.01, 1.0),
        # Both children of a split need at least this fraction of the rows, so
        # anything above 0.5 makes every tree unable to split at all.
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.01, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    }
    model = GradientBoostingClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Run 100 Optuna trials that maximise the accuracy returned by objective_gf,
# then keep the hyperparameters of the best trial.
study_gb = optuna.create_study(direction='maximize')
study_gb.optimize(objective_gf, n_trials=100)
best_params_gb = study_gb.best_params
print("Best Hyperparameter:", best_params_gb)

In [None]:
def objective_xgb(trial):
    """Optuna objective for XGBoost.

    Draws `eta`, `max_depth`, `subsample` and `colsample_bytree` from `trial`,
    fits an XGBClassifier on the module-level X_train / y_train and returns its
    accuracy on the module-level X_test / y_test.

    NOTE(review): candidates are ranked on the test split itself, so that split
    no longer gives an unbiased estimate for the selected model.

    Args:
        trial: the Optuna trial object used to sample hyperparameters.

    Returns:
        Accuracy of the fitted model on X_test / y_test.
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'eta': trial.suggest_float('eta', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1)
    }

    # The label-encoded columns may carry object dtype, which XGBoost refuses.
    # infer_objects() turns such columns back into integer columns and leaves
    # columns that are already numeric untouched.
    X_fit = X_train.infer_objects()
    X_eval = X_test.infer_objects()

    model = XGBClassifier(**params)
    # `enable_categorical` is a constructor option, not a fit() keyword, so it
    # is not passed here. The integer codes are used as plain numeric features,
    # which is also how the final ensemble consumes them.
    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)
    return accuracy_score(y_test, y_pred)

# Run 100 Optuna trials that maximise the accuracy returned by objective_xgb,
# then keep the hyperparameters of the best trial.
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=100)
best_params_xgb = study_xgb.best_params
print("Best Hyperparameter:", best_params_xgb)

In [None]:
# Build one (still unfitted) estimator per model family from the best
# hyperparameters found by the three Optuna studies.
rf, gb, xgb = (
    RandomForestClassifier(**best_params_rf),
    GradientBoostingClassifier(**best_params_gb),
    XGBClassifier(**best_params_xgb),
)

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the three tuned models into one soft-voting ensemble.
base_estimators = [('rf', rf), ('gb', gb), ('xgb', xgb)]
vote = VotingClassifier(estimators=base_estimators, voting='soft')

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

In [None]:
from sklearn.metrics import roc_auc_score

# Evaluate the voting ensemble on every split produced by `kf`.
auc_score = []               # validation ROC-AUC, one entry per fold
submission_predictions = []  # class-1 probabilities for every row of X, one array per fold

cat_columns = ['RACE_encode', 'WORKCLASS_encode', 'EDUCATION_encode', 'MARITAL-STATUS_encode', 'OCCUPATION_encode', 'RELATIONSHIP_encode', 'SEX_encode', 'NATIVE COUNTRY_encode']
# Make sure the label codes are plain integers before they reach the models.
X[cat_columns] = X[cat_columns].astype('int64')

for train_index, test_index in kf.split(X, y):
    X_train_kf, X_valid_kf = X.iloc[train_index], X.iloc[test_index]
    y_train_kf, y_valid_kf = y.iloc[train_index], y.iloc[test_index]

    vote.fit(X_train_kf, y_train_kf)

    # Column 1 of predict_proba holds the probability of class 1.
    y_pred_kf = vote.predict_proba(X_valid_kf)[:, 1]
    auc_val = roc_auc_score(y_valid_kf, y_pred_kf)
    auc_score.append(auc_val)

    # Predict on the full feature matrix directly instead of rebinding X_test
    # to X, so the hold-out split created earlier is not silently replaced.
    # Note that X includes the rows this fold was fitted on.
    y_pred_test = vote.predict_proba(X)[:, 1]
    submission_predictions.append(y_pred_test)
    

In [None]:
# Report the validation AUC of each fold, numbered from 0.
for fold_number, fold_auc in enumerate(auc_score):
    print(f"AUC for validation set {fold_number}: {fold_auc}")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
vote.fit(X_train, y_train)

In [None]:
# Class predictions for every row of X.
# NOTE(review): X contains the rows `vote` was fitted on (X_train), so these are
# not purely out-of-sample predictions.
y_pred = vote.predict(X)

In [None]:
# Report accuracy on the held-out split only: `vote` was fitted on X_train, so
# scoring all of X would mix in rows the model has already seen and overstate
# the result.
accuracy_score(y_test, vote.predict(X_test))