In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import precision_recall_curve, auc, classification_report, roc_auc_score, average_precision_score
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Data Reading
def read_data(file_path, usecols=None):
    return pd.read_csv(file_path, usecols=usecols)

# Data Processing
def process_data(train_file, test_file, train_helper_file, test_helper_file):
    train_helper = read_data(train_helper_file)
    test_helper = read_data(test_helper_file)
    train = read_data(train_file)
    test = read_data(test_file)
    
    # Merge with helper data
    train = pd.merge(train, train_helper, on=['V1', 'V3'], how='left', suffixes=('', '_helper'))
    test = pd.merge(test, test_helper, on=['V1', 'V3'], how='left', suffixes=('', '_helper'))
    
    # Handle missing values
    train.fillna(-999, inplace=True)
    test.fillna(-999, inplace=True)
    
    return train, test

# Encoding Categorical Features
def encode_categorical(df):
    cat_cols = [col for col in df.columns if df[col].dtype == 'object']
    le = LabelEncoder()
    for col in cat_cols:
        df[col] = le.fit_transform(df[col].astype(str))
    return df

# Prepare Data
def prepare_data(train, test):
    features = [col for col in train.columns if col not in ['Target', 'V2']]
    X = train[features]
    y = train['Target']
    X_test = test[features]

    # Ensure all features are numeric
    X = X.apply(pd.to_numeric, errors='coerce').fillna(-999)
    X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(-999)
    
    return X, y, X_test

# Objective Function for Bayesian Optimization
def xgb_objective(params, X_train, y_train):
    params = {
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
        'min_child_weight': params['min_child_weight'],
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'gamma': params['gamma'],
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'tree_method': 'hist',
        'n_jobs': -1
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    cv_results = xgb.cv(params, dtrain, num_boost_round=1000, nfold=3, 
                        early_stopping_rounds=50, metrics='auc', as_pandas=True)
    return cv_results['test-auc-mean'].max()

# Main Function
def main():
    train_file = "C:/Users/Nikhil Sukthe/Downloads/train.csv"
    test_file = "C:/Users/Nikhil Sukthe/Downloads/test.csv"
    train_helper_file = "C:/Users/Nikhil Sukthe/Downloads/train_helper.csv"
    test_helper_file = "C:/Users/Nikhil Sukthe/Downloads/test_helper.csv"

    # Data Processing
    train, test = process_data(train_file, test_file, train_helper_file, test_helper_file)
    train = encode_categorical(train)
    test = encode_categorical(test)
    X, y, X_test = prepare_data(train, test)

    # Handling Imbalanced Data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    # Splitting Data for Validation
    X_train, X_valid, y_train, y_valid = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    # Hyperparameter Optimization
    pbounds = {
        'max_depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'min_child_weight': (1, 10),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
        'gamma': (0, 5)
    }

    def wrapped_xgb_objective(max_depth, learning_rate, min_child_weight, subsample, colsample_bytree, gamma):
        params = {
            'max_depth': max_depth,
            'learning_rate': learning_rate,
            'min_child_weight': min_child_weight,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'gamma': gamma
        }
        return xgb_objective(params, X_train, y_train)

    optimizer = BayesianOptimization(f=wrapped_xgb_objective, pbounds=pbounds, random_state=42)
    optimizer.maximize(init_points=5, n_iter=20)  # Increased iterations for better results

    best_params = optimizer.max['params']
    best_params['max_depth'] = int(best_params['max_depth'])

    # Train the Final Model
    model = xgb.XGBClassifier(**best_params, n_estimators=100, tree_method='hist', n_jobs=-1)
    model.fit(X_train, y_train)

    # Validation and Performance Evaluation
    y_valid_pred = model.predict_proba(X_valid)[:, 1]
    print(classification_report(y_valid, (y_valid_pred > 0.5).astype(int)))
    print(f"ROC AUC Score: {roc_auc_score(y_valid, y_valid_pred):.2f}")
    print(f"PR AUC Score: {average_precision_score(y_valid, y_valid_pred):.2f}")

    # Making Predictions on Test Data
    test_pred = model.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame({
        'V2': test['V2'],
        'Probability': test_pred,
        'Target': (test_pred > 0.5).astype(int)
    })

    

    submission.to_csv('submission.csv', index=False)

    precision, recall, thresholds = precision_recall_curve(y_valid, y_valid_pred)
    score = max(0, 100 * auc(recall, precision))
    print(f"AUC-PR Score: {score:.2f}")

if __name__ == "__main__":
    main()


|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.9995   [39m | [39m0.6873   [39m | [39m4.754    [39m | [39m0.2223   [39m | [39m7.191    [39m | [39m2.404    [39m | [39m0.578    [39m |
| [39m2        [39m | [39m0.9993   [39m | [39m0.529    [39m | [39m4.331    [39m | [39m0.1843   [39m | [39m7.957    [39m | [39m1.185    [39m | [39m0.985    [39m |
| [39m3        [39m | [39m0.9992   [39m | [39m0.9162   [39m | [39m1.062    [39m | [39m0.06273  [39m | [39m4.284    [39m | [39m3.738    [39m | [39m0.7624   [39m |
| [39m4        [39m | [39m0.9993   [39m | [39m0.716    [39m | [39m1.456    [39m | [39m0.1874   [39m | [39m3.976    [39m | [39m3.629    [39m | [39m0.6832   [39m |
| [39m5        [39m | [39m0.9994   [39m | [39m0.728    [39m | [39m3.926    [39m | [