I am exploring the Adult dataset to see which methods work well with it.

In [13]:
import pandas as pd
import numpy as np

# Column names for the dataset; the file is read with header=None, so the
# names are attached after loading.
columns = [
    "age",
    "work_class",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income",
]

# Load the raw text file. The separator is a comma with optional whitespace on
# either side, expressed as a regex, which is why the python engine is used.
data = pd.read_csv(
    "adult/adult.txt",
    sep=r"\s*,\s*",
    header=None,
    engine="python",
).set_axis(columns, axis=1)

# Save a copy with the header row as a regular CSV, then preview the first rows.
data.to_csv("adult_data.csv", index=False)
data.head()


Unnamed: 0,age,work_class,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


This is a binary classification task: predict which income bracket (<=50K or >50K) each individual falls in. Gradient-boosted trees (XGBoost) are used for this task.

In [14]:
# Data preprocessing - handling missing values.
# '?' is treated as the missing-value marker: turn it into NaN and drop every
# row that contains at least one missing entry.
data = data.replace('?', np.nan).dropna()
df = data.copy()

# Encode the target column as a binary label (this is a label mapping, not a
# one-hot encoding): '<=50K' -> 0, '>50K' -> 1.
income_labels = {'<=50K': 0, '>50K': 1}

# Series.map turns any value missing from the mapping into NaN without
# complaint; fail here with a clear message instead of later in the split or
# the model fit.
unexpected_labels = set(df['income'].unique()) - set(income_labels)
if unexpected_labels:
    raise ValueError(
        f"Unexpected income labels: {sorted(map(str, unexpected_labels))}"
    )

df['income'] = df['income'].map(income_labels)
df.tail()

Unnamed: 0,age,work_class,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,1


In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
# Separate the target variable from the features.
Y = df['income']
X = df.drop(columns='income')

# Feature columns grouped by the kind of preprocessing they receive.
numerical_cols = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]
categorical_cols = [
    'work_class',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

# Numerical features are standardized; categorical features are one-hot
# encoded with drop='first' and handle_unknown='ignore'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        (
            'cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            categorical_cols,
        ),
    ]
)

# Each entry is (train fraction, test fraction, display label).
partitions = [
    (0.2, 0.8, "20/80"),
    (0.5, 0.5, "50/50"),
    (0.8, 0.2, "80/20"),
]

In [17]:
#XG-boost classification code.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Hyperparameter grid for the grid search. It is the same for every partition
# and trial, so it is built once here rather than inside the loops.
parameter_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 4, 5],
    'classifier__learning_rate': [0.1, 0.2],
    'classifier__subsample': [0.8, 1.0],
}

results = []
# Only test_size is handed to train_test_split; train_size is unpacked from the
# partition tuple but not used below.
for train_size, test_size, partition_name in partitions:
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"PARTITION: {partition_name} (Train/Test)")
    print(f"{banner}")

    current_partition_results = []
    # Three trials per partition, each with its own seed.
    for trial_number in range(3):
        print(f"Trial number: {trial_number + 1}")
        seed = 42 + trial_number

        # Stratified train/test split for this trial.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=seed, stratify=Y
        )

        # Preprocessing and classifier are chained in one pipeline, so the grid
        # search fits them together.
        xgb_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', xgb.XGBClassifier(random_state=seed, eval_metric='logloss')),
        ])

        # Tune the hyperparameters with 5-fold cross-validation on accuracy.
        grid_search = GridSearchCV(
            xgb_pipeline,
            param_grid=parameter_grid,
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=0,
        )
        grid_search.fit(X_train, Y_train)
        best_model = grid_search.best_estimator_  # pipeline selected by the search

        # Score the selected model on both sides of the split.
        y_training_prediction = best_model.predict(X_train)
        y_testing_prediction = best_model.predict(X_test)
        training_accuracy = accuracy_score(Y_train, y_training_prediction)
        testing_accuracy = accuracy_score(Y_test, y_testing_prediction)

        print(f"  Best params: {grid_search.best_params_}")
        print(f"  Train Accuracy: {training_accuracy:.4f}")
        print(f"  Test Accuracy: {testing_accuracy:.4f}")
        print(f"  CV Score: {grid_search.best_score_:.4f}")

        current_partition_results.append({
            'partition': partition_name,
            'trial': trial_number + 1,
            'training_accuracy': training_accuracy,
            'testing_accuracy': testing_accuracy,
            'cv_score': grid_search.best_score_,
            'best_parameters': grid_search.best_params_,
        })

    # Average training, testing and cross-validation accuracy over the trials
    # of this partition.
    avg_train, avg_test, avg_cv = (
        np.mean([trial[metric] for trial in current_partition_results])
        for metric in ("training_accuracy", "testing_accuracy", "cv_score")
    )

    results.append({
        'partition': partition_name,
        'avg_train_accuracy': avg_train,
        'avg_test_accuracy': avg_test,
        'avg_cv_score': avg_cv,
        'trials': current_partition_results,
    })



PARTITION: 20/80 (Train/Test)
Trial number: 1
  Best params: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 4, 'classifier__n_estimators': 100, 'classifier__subsample': 1.0}
  Train Accuracy: 0.8987
  Test Accuracy: 0.8624
  CV Score: 0.8664
Trial number: 2
  Best params: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 1.0}
  Train Accuracy: 0.8856
  Test Accuracy: 0.8598
  CV Score: 0.8675
Trial number: 3
  Best params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 200, 'classifier__subsample': 1.0}
  Train Accuracy: 0.8858
  Test Accuracy: 0.8620
  CV Score: 0.8682

PARTITION: 50/50 (Train/Test)
Trial number: 1
  Best params: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 4, 'classifier__n_estimators': 100, 'classifier__subsample': 1.0}
  Train Accuracy: 0.8836
  Test Accuracy: 0.8649
  CV Score: 0.8694
Trial number: 2
  Best params: {'class

Printing the final summary of the XGBoost classifier results on dataset 1.

In [18]:
# Summary of results: one block of averaged metrics per partition.
rule = "=" * 60
print("\n" + rule)
print("Summary of final results for XG boost")
print(rule)

# (printed label, key in the per-partition summary dict), in display order.
metric_labels = [
    ("Average Training Accuracy", "avg_train_accuracy"),
    ("Average Test Accuracy", "avg_test_accuracy"),
    ("Average CV Score", "avg_cv_score"),
]

for partition_summary in results:
    print(f"\nPartition {partition_summary['partition']}:")
    for label, key in metric_labels:
        print(f"  {label}: {partition_summary[key]:.4f}")


Summary of final results for XG boost

Partition 20/80:
  Average Training Accuracy: 0.8900
  Average Test Accuracy: 0.8614
  Average CV Score: 0.8674

Partition 50/50:
  Average Training Accuracy: 0.8849
  Average Test Accuracy: 0.8649
  Average CV Score: 0.8701

Partition 80/20:
  Average Training Accuracy: 0.8845
  Average Test Accuracy: 0.8677
  Average CV Score: 0.8703


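Sanity check for overfitting: collect the per-trial results and save them so that training accuracy can be compared with test accuracy (a training accuracy far above the test accuracy indicates overfitting).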

In [19]:
# Sanity check: save the per-trial results and report the largest gap between
# training and test accuracy, which is the signal for overfitting.
print("\n" + "="*60)
print("Starting Sanity Check")
print("="*60)

# Flatten the per-partition summaries into one record per trial.
sanity_check_results = [
    trial
    for partition_summary in results
    for trial in partition_summary["trials"]
]

sanity_check_df = pd.DataFrame(sanity_check_results)
sanity_check_df.to_csv('xg_boost_results.csv', index=False)
print("results saved successfully")

# The actual check: find the trial whose training accuracy exceeds its test
# accuracy by the widest margin. Skipped when there are no trials to compare.
if sanity_check_results:
    worst_trial = max(
        sanity_check_results,
        key=lambda trial: trial['training_accuracy'] - trial['testing_accuracy'],
    )
    largest_gap = worst_trial['training_accuracy'] - worst_trial['testing_accuracy']
    print(
        f"largest train/test accuracy gap: {largest_gap:.4f} "
        f"(partition {worst_trial['partition']}, trial {worst_trial['trial']})"
    )


Starting Sanity Check
results saved successfully


End of the XGBoost code for the first dataset.