In [None]:
# Importing necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import ADASYN
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [None]:
# Locations of the training and test CSV files (relative to the notebook).
train_path, test_path = 'train.csv', 'test.csv'

In [None]:
# Read both CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Preprocessing
# Normalise the column names of both frames: trim the ends and remove inner spaces.
train.columns, test.columns = (
    frame.columns.str.strip().str.replace(' ', '') for frame in (train, test)
)
# Trim whitespace around the target labels so they can be compared exactly later.
train["income"] = train["income"].str.strip()

# Treat '?' cells as missing. Rows with any missing value are dropped from the
# training data only; the test frame keeps all of its rows.
# NOTE(review): only cells equal to exactly '?' are matched; values carrying
# surrounding whitespace (e.g. ' ?') would not be — confirm against the raw CSV.
train = train.replace('?', np.nan).dropna()
test = test.replace('?', np.nan)

In [None]:
# Add a new feature 'usia' (age) computed from 'TahunKelahiran' (birth year),
# using 2024 as the reference year.
train, test = (
    frame.assign(usia=2024 - frame['TahunKelahiran']) for frame in (train, test)
)

# Categorical columns that get one-hot encoded.
cat_columns = [
    'KelasPekerjaan', 'Pendidikan', 'JenjangPendidikan', 'Status', 'Pekerjaan',
    'Hubungan', 'Etnis', 'sex', 'AsalNegara',
]
df_dumy_train = pd.get_dummies(train, columns=cat_columns)
# Encode the test frame the same way, then force it onto the training column
# layout: columns missing from the test encoding are filled with 0 and columns
# that do not exist in the training encoding are discarded.
df_dumy_test = pd.get_dummies(test, columns=cat_columns).reindex(
    columns=df_dumy_train.columns, fill_value=0
)

# Binary target: 1 for '>50K', 0 for everything else.
y = df_dumy_train["income"].eq('>50K').astype('int64')
# Feature matrix: every encoded column except the target.
X = df_dumy_train.drop(columns="income")

In [None]:
# Balancing dengan ADASYN
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)

In [None]:
# Membagi data
X_train, X_val, y_train, y_val = train_test_split(X_adasyn, y_adasyn, test_size=0.1, random_state=101)

In [None]:
# Feature scaling: the scaler is fitted on the training features only and then
# applied unchanged to the validation and test features.
scaler = StandardScaler().fit(X_train)
scaled_X_train, scaled_X_val = (scaler.transform(part) for part in (X_train, X_val))
# The aligned test frame may carry a placeholder 'income' column; drop it if present.
scaled_X_test = scaler.transform(df_dumy_test.drop(columns="income", errors='ignore'))

In [None]:
# Hyperparameter tuning with RandomizedSearchCV for RandomForestClassifier.
# Base estimator; class_weight='balanced' reweights classes by inverse frequency.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Distributions sampled by the randomized search.
# 'auto' is intentionally not listed for max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, so candidates using it fail to fit
# (silently here, because warnings are suppressed). In older versions it was
# just an alias of 'sqrt' for classifiers, i.e. a duplicate candidate.
param_dist_rf = {
    'n_estimators': randint(100, 1000),
    'max_features': ['sqrt', 'log2'],
    'max_depth': randint(10, 150),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False]
}

In [None]:
# Configure the randomized search: 200 sampled candidates x 10-fold CV.
# Progress is reported by scikit-learn itself through `verbose=1`.
# Candidates are ranked by F1 (positive class = 1) so that model selection
# optimises the same metric the notebook reports on the validation set,
# instead of the default accuracy.
# NOTE(review): the training data has already been oversampled before the CV
# folds are drawn, so per-fold scores may be optimistic.
random_search_rf = RandomizedSearchCV(
    rf, param_distributions=param_dist_rf,
    n_iter=200, cv=10, scoring='f1', random_state=42, n_jobs=-1, verbose=1
)


In [None]:
# Run the randomized search on the scaled training data.
# The previous tqdm wrapper could not observe the fit: it stayed at 0% for the
# whole search and jumped to 100% afterwards. The search's own `verbose`
# output is the real progress indicator, so the wrapper is dropped.
print("Training model dengan RandomizedSearchCV...")
random_search_rf.fit(scaled_X_train, y_train)

# Best configuration found, refitted on the full training data by the search.
best_rf = random_search_rf.best_estimator_


Training model dengan RandomizedSearchCV...


  0%|          | 0/200 [00:00<?, ?it/s]

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


In [None]:
# Further fine-tuning with GridSearchCV around the best randomized-search result.
def _grid_around(center, step, floor):
    """Return the sorted, unique values of {center - step, center, center + step} that are >= floor.

    Filtering by `floor` keeps invalid hyperparameter values (e.g. max_depth=0,
    min_samples_split<2, min_samples_leaf=0) out of the grid; such candidates
    would otherwise fail during fitting and only waste search time.
    """
    return sorted({value for value in (center - step, center, center + step) if value >= floor})


param_grid_rf = {
    'n_estimators': _grid_around(best_rf.n_estimators, 50, 1),
    'max_depth': _grid_around(best_rf.max_depth, 10, 1),
    # An integer min_samples_split must be at least 2.
    'min_samples_split': _grid_around(best_rf.min_samples_split, 2, 2),
    # An integer min_samples_leaf must be at least 1.
    'min_samples_leaf': _grid_around(best_rf.min_samples_leaf, 1, 1)
}

# Rank candidates by F1 (positive class = 1), the metric reported on the
# validation set, rather than the default accuracy.
grid_search_rf = GridSearchCV(
    best_rf, param_grid=param_grid_rf, cv=10, scoring='f1', n_jobs=-1, verbose=2
)
grid_search_rf.fit(scaled_X_train, y_train)

In [None]:
# Take the best model found by GridSearchCV; this rebinds `best_rf`, replacing
# the estimator that came out of the randomized search.
best_rf = grid_search_rf.best_estimator_

In [None]:
# Evaluate model performance on the validation set.
val_predictions = best_rf.predict(scaled_X_val)
print("F1 Score (Validation Set):", f1_score(y_val, val_predictions))
print(classification_report(y_val, val_predictions))

# Plot the confusion matrix from the predictions computed above, so the model
# does not have to predict the validation set a second time.
confusion_display = ConfusionMatrixDisplay.from_predictions(y_val, val_predictions)
confusion_display.ax_.set_title("Confusion matrix (validation set)")
plt.show()

In [None]:
# Predict on the test data.
# `predict` is a single call, so a tqdm bar wrapped around it can only jump
# from 0% to 100% once it returns; the wrapper is therefore omitted.
print("Melakukan prediksi pada data test...")
test_predictions = best_rf.predict(scaled_X_test)

In [None]:
# Build the submission DataFrame: one row per test ID with the predicted label.
# The label column is a uniform integer column (previously it mixed the string
# '1' with the integer 0); the CSV written to disk is unchanged by this.
submission = pd.DataFrame({
    'ID': test['ID'],
    'income': np.asarray(test_predictions).astype(int),
})

# Use one variable for the output path so the confirmation message always
# names the file that was actually written.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission file created: '{submission_path}'")