In [1]:
import pandas as pd
import numpy as np
from scipy.stats import uniform
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load the Training Dataset

train_features_df = pd.read_csv('train_features.csv')
train_labels_df = pd.read_csv('train_labels.csv')
test_ids = pd.read_csv('test_ids.csv')
test_feature_df = pd.read_csv('test_features.csv')

# Extract target variable
train_labels = train_labels_df['y']
#train_data = train_data.drop(columns=['id', 'y'])


In [3]:
# Model Training and Evaluation

# replace or remove NaNs and Inf values
train_features_df.replace([np.inf, -np.inf], np.nan, inplace=True)
#train_features_df.fillna(train_features_df.mean(), inplace=True)
test_feature_df.replace([np.inf, -np.inf], np.nan, inplace=True)
#test_feature_df.fillna(test_feature_df.mean(), inplace=True)
imp = IterativeImputer(n_nearest_features=15, max_iter=50)

train_features_df = imp.fit_transform(train_features_df)
test_feature_df = imp.transform(test_feature_df)


param_dist_xgb = {
    "n_estimators": [700],
    "learning_rate": uniform(0.01, 0.3),
    "max_depth": [3, 5, 7],
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
}

random_search_xgb = RandomizedSearchCV(
    estimator=XGBClassifier(
        random_state=42, n_jobs=-1, eval_metric="mlogloss"
    ),
    param_distributions=param_dist_xgb,
    n_iter=20,
    scoring="accuracy",
    cv=3,
    random_state=42,
    n_jobs=-1,
)

random_search_xgb.fit(train_features_df, train_labels)

print("Best Parameters for XGBClassifier:", random_search_xgb.best_params_)

best_xgb = random_search_xgb.best_estimator_
bagging_clf = BaggingClassifier(estimator=best_xgb, n_estimators=150, random_state=42)

bagging_clf.fit(train_features_df, train_labels)

y_pred = bagging_clf.predict(train_features_df)
print(classification_report(train_labels, y_pred))

Best Parameters for XGBClassifier: {'colsample_bytree': np.float64(0.9208787923016158), 'learning_rate': np.float64(0.032365193103931246), 'max_depth': 7, 'n_estimators': 700, 'subsample': np.float64(0.7693605922825478)}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3030
           1       1.00      1.00      1.00       443
           2       1.00      1.00      1.00      1474
           3       1.00      1.00      1.00       170

    accuracy                           1.00      5117
   macro avg       1.00      1.00      1.00      5117
weighted avg       1.00      1.00      1.00      5117



In [4]:
# Predictions on Test Set
test_predictions = bagging_clf.predict(test_feature_df)

submission = pd.DataFrame({
    'id': test_ids['id'], 
    'y': test_predictions
})

submission.to_csv('submission_f.csv', index=False)
print("Predictions saved to submission_f.csv")

Predictions saved to submission.csv
