In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Load dataset
train = pd.read_csv("C:/Users/Ayush Pandita/OneDrive/Desktop/YR 3 SEM 6/AOML/ASS3/data/train.csv")
test = pd.read_csv("C:/Users/Ayush Pandita/OneDrive/Desktop/YR 3 SEM 6/AOML/ASS3/data/test.csv")

# Target variable
y = train['target']
X = train.drop(columns=['id', 'target'])
test_ids = test['id']
test = test.drop(columns=['id'])

# Label Encoding for categorical variables
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined_data = pd.concat([X[col], test[col]], axis=0)
    le.fit(combined_data)
    X[col] = le.transform(X[col])
    test[col] = le.transform(test[col])

# One-Hot Encoding for low-cardinality categorical variables
X = pd.get_dummies(X, drop_first=True)
test = pd.get_dummies(test, drop_first=True)

# Align train and test features
test = test.reindex(columns=X.columns, fill_value=0)
# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test = pd.DataFrame(imputer.transform(test), columns=test.columns)

# Train-validation split
# Train-validation split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline Model (Random Forest)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict_proba(X_valid)[:, 1]
print("Random Forest AUC:", roc_auc_score(y_valid, rf_preds))

# Hyperparameter tuning (Randomized Search)
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_random = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_dist, n_iter=10, cv=3, scoring='roc_auc', random_state=42)
rf_random.fit(X_train, y_train)
print("Best parameters (RF):", rf_random.best_params_)

# Train XGBoost with best params
xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict_proba(X_valid)[:, 1]
print("XGBoost AUC:", roc_auc_score(y_valid, xgb_preds))

# Predict on test set
final_preds = xgb_model.predict_proba(test)[:, 1]
submission = pd.DataFrame({"id": test_ids, "target": final_preds})
submission.to_csv("submission.csv", index=False)
print("Submission file saved!")


Random Forest AUC: 0.7196611976912743
Best parameters (RF): {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 20}
XGBoost AUC: 0.7636845380588324
Submission file saved!
