In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier


In [7]:
# Load and preprocess the dataset
df = pd.read_csv('adult_dataset.csv')
df.dropna(inplace=True)

# Encode categorical features
label_encoders = {}
categorical_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

df['income'] = LabelEncoder().fit_transform(df['income'])

In [8]:

X = df.drop('income', axis=1)
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Initialize and train AdaBoost
#ada_classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=42)
#ada_classifier.fit(X_train, y_train)
#y_pred_ada = ada_classifier.predict(X_test)

# Initialize and train AdaBoost
# Replace 'base_estimator' with 'estimator'
ada_classifier = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, random_state=42)
ada_classifier.fit(X_train, y_train)
y_pred_ada = ada_classifier.predict(X_test)



In [11]:
# Initialize and train Gradient Boosting
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_classifier.fit(X_train, y_train)
y_pred_gb = gb_classifier.predict(X_test)

In [12]:
# Initialize and train XGBoost
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
xgb_classifier.fit(X_train, y_train)
y_pred_xgb = xgb_classifier.predict(X_test)

In [13]:
# Initialize and train LightGBM
lgb_classifier = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
lgb_classifier.fit(X_train, y_train)
y_pred_lgb = lgb_classifier.predict(X_test)

[LightGBM] [Info] Number of positive: 6304, number of negative: 19744
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004833 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 691
[LightGBM] [Info] Number of data points in the train set: 26048, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242015 -> initscore=-1.141665
[LightGBM] [Info] Start training from score -1.141665


In [14]:
# Initialize and train CatBoost
catboost_classifier = CatBoostClassifier(n_estimators=100, learning_rate=0.1, depth=3, random_state=42, verbose=0)
catboost_classifier.fit(X_train, y_train)
y_pred_catboost = catboost_classifier.predict(X_test)

In [15]:
# Compare performance
def print_comparison(name, y_true, y_pred):
    print(f"{name}")
    print(f'Accuracy: {accuracy_score(y_true, y_pred):.2f}')
    print(classification_report(y_true, y_pred))
    print("-" * 50)

print_comparison("AdaBoost", y_test, y_pred_ada)
print_comparison("Gradient Boosting", y_test, y_pred_gb)
print_comparison("XGBoost", y_test, y_pred_xgb)
print_comparison("LightGBM", y_test, y_pred_lgb)
print_comparison("CatBoost", y_test, y_pred_catboost)

AdaBoost
Accuracy: 0.86
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      4976
           1       0.77      0.58      0.66      1537

    accuracy                           0.86      6513
   macro avg       0.82      0.76      0.78      6513
weighted avg       0.85      0.86      0.85      6513

--------------------------------------------------
Gradient Boosting
Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.88      0.95      0.92      4976
           1       0.80      0.58      0.67      1537

    accuracy                           0.87      6513
   macro avg       0.84      0.77      0.79      6513
weighted avg       0.86      0.87      0.86      6513

--------------------------------------------------
XGBoost
Accuracy: 0.86
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      4976
           1       0.79      0.57      0.66      1537

