# Training Model with XGBoost Model

The purpose of this notebook is to preprocess/clean the training data and then train the XGBoost model. 

## Data Preprocessing and Cleaning

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import warnings

warnings.filterwarnings("ignore")



In [2]:
# Load the dataset
train_df = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")
test_df = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")

# Identify categorical columns
categorical_cols = train_df.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
train_df = pd.get_dummies(train_df, columns=categorical_cols)
test_df = pd.get_dummies(test_df, columns=categorical_cols)

# Identify features and target variable
X = train_df.drop(['isFraud', 'TransactionID'], axis=1)
y = train_df['isFraud']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify missing values after the split
missing_cols = X_train.columns[X_train.isnull().any()]

# Impute missing values
imputer = SimpleImputer(strategy='mean')  # You can choose a different strategy
X_train[missing_cols] = imputer.fit_transform(X_train[missing_cols])
X_test[missing_cols] = imputer.transform(X_test[missing_cols])

# Data preprocessing (scaling numerical features)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

## Training Model

In [3]:
# Define desired hyperparameters
xgb_params = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.1,
    'max_depth': 20,
    'scale_pos_weight': 1,  # Set to 1 since oversampling has already balanced the classes
    'subsample': 0.9
}

# Create an instance of the XGBClassifier with specified parameters
model = XGBClassifier(**xgb_params, random_state=42)

# Train the model on the resampled data
model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC-ROC Score:", roc_auc_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99    113866
           1       0.89      0.58      0.70      4242

    accuracy                           0.98    118108
   macro avg       0.94      0.79      0.85    118108
weighted avg       0.98      0.98      0.98    118108

AUC-ROC Score: 0.7889323602812517


## Exporting Model

In [4]:
import joblib

# Assuming 'model' is your trained model
joblib.dump(model, 'trained_model.joblib')

['trained_model.joblib']