In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,VotingClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from xgboost import XGBClassifier
# Blanket-suppress every warning raised in this process from here on.
# NOTE(review): this also hides library deprecation warnings during the
# model search; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the training table (path is relative to the notebook's working directory).
train_csv_path = '../train_test_files/processed_train.csv'
train_df = pd.read_csv(train_csv_path)

In [3]:
# Load the test table (path is relative to the notebook's working directory).
test_csv_path = '../train_test_files/processed_test.csv'
test_df = pd.read_csv(test_csv_path)

In [4]:
# Pull the identifier column out of the test frame: `pop` returns the column
# and removes it from `test_df` in place, so `x` keeps the ids while `test_df`
# no longer contains them.
# NOTE: like the in-place drop it replaces, this cell is not re-runnable
# without reloading `test_df` (the column is gone after the first run).
x = test_df.pop('response_id')

In [5]:
# Base learners for the ensemble; both are seeded with the same value so that
# repeated runs construct identical estimators.
# An AdaBoost member (`AdaBoostClassifier(random_state=42)` as `clf3`) is
# intentionally left disabled.
clf1, clf2 = (
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
)

In [6]:
# Named ensemble members. The names ('rf', 'xgb') are the prefixes used to
# address each member's hyper-parameters, e.g. 'rf__n_estimators'.
ensemble_members = [('rf', clf1), ('xgb', clf2)]

# 'hard' voting combines predicted class labels (majority vote);
# 'soft' would combine predicted probabilities instead.
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

In [7]:
# Hyper-parameter search space, keyed as '<member>__<parameter>'.
# Reminder on the scipy distributions used below:
#   randint(low, high)  -> integers in [low, high - 1]
#   uniform(loc, scale) -> floats in [loc, loc + scale]
# Plain lists are sampled as discrete choices.

# Random forest member ('rf')
rf_space = {
    'rf__n_estimators': randint(100, 500),       # trees in the forest: 100..499
    'rf__max_depth': [None, 10, 20, 30],         # tree depth cap (None = unlimited)
    'rf__min_samples_split': randint(2, 11),     # samples needed to split a node: 2..10
    'rf__min_samples_leaf': randint(1, 5),       # samples required at a leaf: 1..4
    'rf__max_features': ['sqrt', 'log2', None],  # features considered per split
}

# XGBoost member ('xgb')
xgb_space = {
    'xgb__n_estimators': randint(100, 500),      # boosting rounds: 100..499
    'xgb__learning_rate': uniform(0.01, 0.2),    # step size in [0.01, 0.21]
    'xgb__max_depth': randint(3, 10),            # tree depth: 3..9
    'xgb__subsample': uniform(0.6, 0.4),         # row subsample ratio in [0.6, 1.0]
    'xgb__colsample_bytree': uniform(0.6, 0.4),  # column subsample ratio in [0.6, 1.0]
    'xgb__gamma': uniform(0, 0.3),               # min loss reduction for a split in [0, 0.3]
}

# Single mapping handed to the search (random-forest keys first, then XGBoost).
param_grid = {**rf_space, **xgb_space}

In [8]:
# Settings for the randomised search over the voting ensemble.
search_settings = dict(
    n_iter=100,        # number of sampled parameter combinations
    cv=5,              # 5-fold cross-validation per combination
    scoring='f1',      # model selection metric
    n_jobs=-1,         # use all available cores
    random_state=42,   # reproducible parameter sampling
)

random_search = RandomizedSearchCV(voting_clf, param_grid, **search_settings)

In [9]:
# Separate the label column from the predictors, run the search, and report
# the winning parameter combination.
target_column = 'exit_status'
train_features = train_df.drop(columns=[target_column])
train_target = train_df[target_column]

random_search.fit(train_features, train_target)
print(random_search.best_params_)

{'rf__max_depth': 20, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 9, 'rf__n_estimators': 288, 'xgb__colsample_bytree': np.float64(0.8387400631785948), 'xgb__gamma': np.float64(0.13374982585607734), 'xgb__learning_rate': np.float64(0.02999498316360058), 'xgb__max_depth': 5, 'xgb__n_estimators': 187, 'xgb__subsample': np.float64(0.7334834444556088)}


In [10]:
# Predict class labels for the test rows with the fitted search object.
final = random_search.predict(X=test_df)

In [11]:
def _label_for(code):
    """Return "Stayed" when the predicted class equals 1, otherwise "Left"."""
    # NOTE(review): assumes class 1 encodes "Stayed" in the processed data; confirm.
    return "Stayed" if code == 1 else "Left"


# Human-readable label for every prediction, in the same order as `final`.
predictions = list(map(_label_for, final))

In [12]:
# Pair each response id with its predicted label; column order is
# response_id, then Predictions.
submission_columns = {'response_id': x, 'Predictions': predictions}
output_df = pd.DataFrame(data=submission_columns)

In [13]:
# Write the predictions to disk without the DataFrame index.
# `to_csv` does not create missing parent directories and raises OSError if
# the folder is absent, which would throw away the result of the
# 100-iteration, 5-fold search at the very last step. Create it first.
output_path = Path('../output/xgboost_random.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path, index=False)