In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

data = pd.read_csv("adult_data.csv")
data.head()

Unnamed: 0,age,work_class,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data = data.replace('?', np.nan)
data = data.dropna()
df = data.copy()
#one-hot encoding the last column.
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
df.tail()

Unnamed: 0,age,work_class,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,1


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#seperating features from target variable.
X = df.drop('income', axis=1)
Y = df['income']

categorical_cols = ['work_class', 'education', 'marital_status', 'occupation',
                   'relationship', 'race', 'sex', 'native_country']
numerical_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain',
                 'capital_loss', 'hours_per_week']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols), #standardizing numerical features.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ])

partitions = [
    (0.2, 0.8, "20/80"),
    (0.5, 0.5, "50/50"),
    (0.8, 0.2, "80/20")
]

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

results = []
for train_size, test_size, partition_name in partitions:
    print(f"\n{'='*50}")
    print(f"PARTITION: {partition_name} (Train/Test)")
    print(f"{'='*50}")

    current_partition_results = []
    #running 3 trials.
    for trial_number in range(3):
        print(f"Trial number: {trial_number + 1}")

        #splitting training and testing data, and then creating a Random Forest pipeline.
        X_training, X_testing, Y_training, Y_testing = train_test_split(X, Y, test_size=test_size, random_state=42+trial_number, stratify=Y)
        random_forest_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(random_state=42 + trial_number))
        ])

        # turning Hyperparameters of the random forest classifier.
        parameter_grid = {
            'classifier__n_estimators': [100, 200],  # Number of trees in the forest
            'classifier__max_depth': [10, 20, None],  # Maximum depth of trees
            'classifier__min_samples_split': [2, 5],  # Minimum samples required to split a node
            'classifier__min_samples_leaf': [1, 2]   # Minimum samples required at a leaf node
        }

        #tuning hyperparameters
        grid_search = GridSearchCV(random_forest_pipeline, param_grid=parameter_grid, n_jobs=-1, verbose=0, cv=5, scoring='accuracy')
        grid_search.fit(X_training, Y_training)
        optimum_model = grid_search.best_estimator_ #finding best model.

        #getting predictions and calculating accuracy scores for training and testing data.
        Y_training_prediction = optimum_model.predict(X_training)
        Y_testing_prediction = optimum_model.predict(X_testing)
        training_accuracy = accuracy_score(Y_training, Y_training_prediction)
        testing_accuracy = accuracy_score(Y_testing, Y_testing_prediction)
        print(f"  Best params: {grid_search.best_params_}")
        print(f"  Train Accuracy: {training_accuracy:.4f}")
        print(f"  Test Accuracy: {testing_accuracy:.4f}")
        print(f"  CV Score: {grid_search.best_score_:.4f}")

        result = {
            'partition': partition_name,
            'trial': trial_number + 1,
            'training_accuracy': training_accuracy,
            'testing_accuracy': testing_accuracy,
            'cv_accuracy': grid_search.best_score_,
            'best_parameters': grid_search.best_params_,
        }
        current_partition_results.append(result)

   #calculating average accuracies across trials.
    avg_training_acc = np.mean([r["training_accuracy"] for r in current_partition_results])
    avg_test_acc = np.mean([r["testing_accuracy"] for r in current_partition_results])
    avg_cv_acc = np.mean([r["cv_accuracy"] for r in current_partition_results])
    avg_acc_over_trials = {
        'partition': partition_name,
        "avg_training_accuracy": avg_training_acc,
        "avg_testing_accuracy": avg_test_acc,
        "avg_cv_accuracy": avg_cv_acc,
        'partition_result': current_partition_results,
    }
    results.append(avg_acc_over_trials)


PARTITION: 20/80 (Train/Test)
Trial number: 1
  Best params: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
  Train Accuracy: 0.8956
  Test Accuracy: 0.8559
  CV Score: 0.8583
Trial number: 2
  Best params: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
  Train Accuracy: 0.8939
  Test Accuracy: 0.8519
  CV Score: 0.8631
Trial number: 3
  Best params: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
  Train Accuracy: 0.8990
  Test Accuracy: 0.8539
  CV Score: 0.8584

PARTITION: 50/50 (Train/Test)
Trial number: 1
  Best params: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
  Train Accuracy: 0.8885
  Test Accuracy: 0.8576
  CV Score: 0.8602
Trial 

In [9]:
#final summary of results
print("\n" + "="*60)
print("RANDOM FOREST - FINAL SUMMARY RESULTS")
print("="*60)

for result in results:
    print(f"\nPartition {result['partition']}:")
    print(f"  Average Training Accuracy: {result['avg_training_accuracy']:.4f}")
    print(f"  Average Test Accuracy: {result['avg_testing_accuracy']:.4f}")
    print(f"  Average CV Accuracy: {result['avg_cv_accuracy']:.4f}")

#saving random forest results to a csv file.
temp_results = []
for result in results:
    for partition_result in result['partition_result']:
        temp_results.append(partition_result)

rf_summary = pd.DataFrame(temp_results)
rf_summary.to_csv('random_forest_summary.csv', index=False)
print('results successfully saved to csv file!')


RANDOM FOREST - FINAL SUMMARY RESULTS

Partition 20/80:
  Average Training Accuracy: 0.8962
  Average Test Accuracy: 0.8539
  Average CV Accuracy: 0.8599

Partition 50/50:
  Average Training Accuracy: 0.8972
  Average Test Accuracy: 0.8565
  Average CV Accuracy: 0.8605

Partition 80/20:
  Average Training Accuracy: 0.9010
  Average Test Accuracy: 0.8587
  Average CV Accuracy: 0.8607
results successfully saved to csv file!
