## Model Evaluation

In [1]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Carve a stratified validation split out of the training data for early stopping.
# Early-stopping against (X_test, y_test) would choose the number of boosting
# rounds using the same rows the metrics below are reported on, which biases
# those metrics upward.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Resample with partial oversampling. Only the fitting portion is oversampled,
# so duplicated minority rows can never appear in the validation split.
ros = RandomOverSampler(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_fit, y_fit)

# Adjust scale_pos_weight: half of the negative/positive ratio of the original
# (un-resampled) training labels.
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]) / 2  # Smaller adjustment

# Train XGBoost
model = XGBClassifier(
    max_depth=10,
    n_estimators=100,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    early_stopping_rounds=10
)

# Early stopping monitors the held-out validation split, never the test set.
model.fit(
    X_resampled, y_resampled,
    eval_set=[(X_val, y_val)],
    verbose=True
)

# Predictions and threshold tuning
y_pred_proba = model.predict_proba(X_test)[:, 1]
optimal_threshold = 0.6  # Example threshold based on analysis
y_pred = (y_pred_proba > optimal_threshold).astype(int)

# Evaluation (the test set is touched only here)
print("Classification Report:")
print(classification_report(y_test, y_pred))

print(f"AUC Score: {roc_auc_score(y_test, y_pred_proba)}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [2]:
from sklearn.metrics import precision_recall_curve

# Lighter oversampling than the previous experiment (sampling_strategy=0.3)
ros = RandomOverSampler(random_state=42, sampling_strategy=0.3)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# XGBoost with class re-weighting switched off (scale_pos_weight=1); the
# hyper-parameters are gathered in a mapping and unpacked into the constructor.
model = XGBClassifier(
    **{
        "max_depth": 10,
        "n_estimators": 100,
        "learning_rate": 0.1,
        "scale_pos_weight": 1,  # Adjusted weight
        "random_state": 42,
    }
)
model.fit(X_resampled, y_resampled)

# Take column 1 of predict_proba and binarise it with a fixed cut-off
optimal_threshold = 0.6  # Example threshold
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]
y_pred = (y_pred_proba > optimal_threshold).astype(int)

# Report: each header and its payload are printed on separate lines
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print(f"AUC Score: {roc_auc_score(y_test, y_pred_proba)}")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


In [4]:
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Analyze precision-recall trade-offs.
# The last precision/recall entry is dropped ([:-1]) so both curves line up
# with `thresholds` on the x-axis.
fig, ax = plt.subplots()
ax.plot(thresholds, precisions[:-1], label="Precision")
ax.plot(thresholds, recalls[:-1], label="Recall")
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.set_title("Precision and recall vs. decision threshold")
ax.legend()
plt.show()


In [5]:
from shutil import rmtree
from tempfile import mkdtemp

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline

# Resampling must happen INSIDE cross-validation. Fitting GridSearchCV on data
# that was resampled up front lets synthetic/cleaned samples derived from a
# validation fold leak into the training folds and inflates the CV roc_auc.
# An imblearn Pipeline applies the sampler to the training folds only.
params = {
    'clf__max_depth': [6, 10],
    'clf__learning_rate': [0.01, 0.1],
    'clf__n_estimators': [50, 100, 200],
    'clf__scale_pos_weight': [1, 2, 5],
}

# Cache the resampling step so SMOTEENN is recomputed once per CV fold rather
# than once per (fold, parameter combination).
cache_dir = mkdtemp(prefix="smoteenn_cache_")
try:
    search_pipeline = Pipeline(
        steps=[
            ("resample", SMOTEENN(random_state=42)),
            ("clf", XGBClassifier(random_state=42)),
        ],
        memory=cache_dir,
    )
    grid_search = GridSearchCV(search_pipeline, param_grid=params, scoring='roc_auc', cv=3)
    grid_search.fit(X_train, y_train)
finally:
    # The cache is only useful during the search; remove it afterwards.
    rmtree(cache_dir, ignore_errors=True)

# Strip the "clf__" step prefix so the printed keys are plain XGBClassifier
# argument names.
print({name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()})

Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour
0,15d2e414-8735-46fc-9e02-80b472b2580f,d1b87f62-51b2-493b-ad6a-77e0fe13e785,58.09,2024-02-20 05:58:41,bank transfer,electronics,1,17,Amandaborough,tablet,212.195.49.198,Unit 8934 Box 0058\nDPO AA 05437,Unit 8934 Box 0058\nDPO AA 05437,0,30,5
1,0bfee1a0-6d5e-40da-a446-d04e73b1b177,37de64d5-e901-4a56-9ea0-af0c24c069cf,389.96,2024-02-25 08:09:45,debit card,electronics,2,40,East Timothy,desktop,208.106.249.121,"634 May Keys\nPort Cherylview, NV 75063","634 May Keys\nPort Cherylview, NV 75063",0,72,8
2,e588eef4-b754-468e-9d90-d0e0abfc1af0,1bac88d6-4b22-409a-a06b-425119c57225,134.19,2024-03-18 03:42:55,PayPal,home & garden,2,22,Davismouth,tablet,76.63.88.212,"16282 Dana Falls Suite 790\nRothhaven, IL 15564","16282 Dana Falls Suite 790\nRothhaven, IL 15564",0,63,3
3,4de46e52-60c3-49d9-be39-636681009789,2357c76e-9253-4ceb-b44e-ef4b71cb7d4d,226.17,2024-03-16 20:41:31,bank transfer,clothing,5,31,Lynnberg,desktop,207.208.171.73,"828 Strong Loaf Apt. 646\nNew Joshua, UT 84798","828 Strong Loaf Apt. 646\nNew Joshua, UT 84798",0,124,20
4,074a76de-fe2d-443e-a00c-f044cdb68e21,45071bc5-9588-43ea-8093-023caec8ea1c,121.53,2024-01-15 05:08:17,bank transfer,clothing,2,51,South Nicole,tablet,190.172.14.169,"29799 Jason Hills Apt. 439\nWest Richardtown, ...","29799 Jason Hills Apt. 439\nWest Richardtown, ...",0,158,5
5,4e707452-7c8a-4cbd-b0c1-2aeaa35c5e88,29616b04-2d5c-4729-9c9d-8d71a6ad9dc1,166.41,2024-01-30 10:55:14,bank transfer,toys & games,2,34,Herreramouth,tablet,202.237.29.55,"5699 Brittany Villages Suite 903\nLake Tim, MD...","120 Kristi Dale\nPort Meganshire, GU 03060",0,38,10
6,7ed952fe-8ae1-4f11-8cc5-6607060240d8,fe21ae29-ba4c-424f-9d55-0095539c09fa,92.88,2024-02-04 19:59:10,PayPal,toys & games,2,14,Ramosfort,tablet,13.45.27.192,"727 Gibson Islands Apt. 279\nNew Davidbury, ME...","727 Gibson Islands Apt. 279\nNew Davidbury, ME...",0,119,19
7,0b2fb5aa-7171-472f-8269-371094608a07,024257c3-5671-4de8-a33c-98fc5cbe6f92,318.14,2024-02-20 13:30:29,credit card,health & beauty,4,42,Port Emily,desktop,131.141.230.185,"3914 Davis Union\nBrownchester, IN 07744","3914 Davis Union\nBrownchester, IN 07744",0,251,13
8,1f52366c-7f40-4397-885f-3856b6e6531c,f17640ca-49da-45d1-8461-c2a1cf9c1b61,47.92,2024-03-03 19:44:00,bank transfer,home & garden,4,38,Carneyfurt,desktop,210.148.17.240,"47893 Maldonado Stream Suite 443\nBrownshire, ...","47893 Maldonado Stream Suite 443\nBrownshire, ...",0,190,19
9,3f10dfde-9c4c-4085-9872-4f6b39502ffb,aab93e75-582f-4455-80b4-1fb35733a47c,121.78,2024-01-16 21:19:39,bank transfer,health & beauty,4,39,Brockburgh,mobile,174.32.252.238,"2334 Briana Centers Suite 576\nArchershire, NM...","2334 Briana Centers Suite 576\nArchershire, NM...",0,343,21


In [6]:
from sklearn.feature_selection import RFE

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472952 entries, 0 to 1472951
Data columns (total 16 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Transaction ID      1472952 non-null  object 
 1   Customer ID         1472952 non-null  object 
 2   Transaction Amount  1472952 non-null  float64
 3   Transaction Date    1472952 non-null  object 
 4   Payment Method      1472952 non-null  object 
 5   Product Category    1472952 non-null  object 
 6   Quantity            1472952 non-null  int64  
 7   Customer Age        1472952 non-null  int64  
 8   Customer Location   1472952 non-null  object 
 9   Device Used         1472952 non-null  object 
 10  IP Address          1472952 non-null  object 
 11  Shipping Address    1472952 non-null  object 
 12  Billing Address     1472952 non-null  object 
 13  Is Fraudulent       1472952 non-null  int64  
 14  Account Age Days    1472952 non-null  int64  
 15  Transaction Hou

In [7]:
# Base XGBoost classifier reused by the RFE / grid-search cells below.
# `use_label_encoder` is intentionally not passed: it is a deprecated no-op on
# the XGBoost versions that accept `early_stopping_rounds` in the constructor
# (as used earlier in this notebook) and only triggers a warning.
xgb_model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_estimators=100,
    max_depth=6,
    scale_pos_weight=1,  # Adjust as needed for class imbalance
)


In [None]:
# Recursive feature elimination wrapped around the XGBoost estimator,
# keeping the top 10 features.
rfe = RFE(
    estimator=xgb_model,
    n_features_to_select=10,
)

In [None]:
# Fit the selector on the oversampled data; fit() returns the fitted selector,
# whose boolean `support_` mask picks the retained column names.
# NOTE(review): the mask is applied to X_train.columns, which assumes X_train
# and X_resampled share the same column order — confirm.
selected_features = X_train.columns[rfe.fit(X_resampled, y_resampled).support_]
print("Selected Features:", selected_features)

In [None]:
# Restrict both the oversampled training data and the test data to the
# RFE-selected columns
X_train_resampled_rfe = X_resampled.loc[:, selected_features]
X_test_rfe = X_test.loc[:, selected_features]

# Refit the XGBoost model on the reduced feature set
xgb_model.fit(X_train_resampled_rfe, y_resampled)

# Score the test set: column 1 of predict_proba first, then hard labels
y_prob = xgb_model.predict_proba(X_test_rfe)[:, 1]
y_pred = xgb_model.predict(X_test_rfe)

# Report
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred),
)
print(
    "AUC Score:",
    roc_auc_score(y_test, y_prob),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred),
)

In [None]:
from imblearn.pipeline import Pipeline

# Oversampling must happen INSIDE cross-validation. Running GridSearchCV on
# already-oversampled data places exact duplicates of minority rows in both the
# training and validation folds, which inflates the CV roc_auc. The imblearn
# Pipeline resamples the training folds only.
params = {
    'clf__max_depth': [6, 10],
    'clf__learning_rate': [0.01, 0.1],
    'clf__n_estimators': [50, 100, 200],
    'clf__scale_pos_weight': [1, 2, 5],
}

search_pipeline = Pipeline(
    steps=[
        # NOTE(review): 0.3 mirrors the RandomOverSampler ratio used in the
        # In [2] cell, assumed to be the one that produced X_resampled — keep
        # the two in sync.
        ("oversample", RandomOverSampler(sampling_strategy=0.3, random_state=42)),
        ("clf", xgb_model),
    ]
)

grid_search = GridSearchCV(search_pipeline, param_grid=params, scoring='roc_auc', cv=3)
# Search on the un-resampled training data, restricted to the selected features.
grid_search.fit(X_train[selected_features], y_train)

# Strip the "clf__" step prefix so the printed keys are plain XGBClassifier
# argument names.
print({name.split("__", 1)[1]: value for name, value in grid_search.best_params_.items()})

In [None]:
# Final model with fixed hyper-parameters.
# NOTE(review): these values are presumably the grid-search winner — confirm
# against the printed best_params_.
# random_state is pinned to match every other model in this notebook.
final_model = XGBClassifier(
    learning_rate=0.1,
    max_depth=10,
    n_estimators=200,
    scale_pos_weight=5,
    random_state=42,
)
final_model.fit(X_train_resampled_rfe, y_resampled)

In [None]:
# Score the test set with the final model: column 1 of predict_proba feeds the
# AUC, hard labels feed the report and confusion matrix
y_pred_proba = final_model.predict_proba(X_test_rfe)[:, 1]
y_pred = final_model.predict(X_test_rfe)

# Report
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred),
)
print(
    "AUC Score:",
    roc_auc_score(y_test, y_pred_proba),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred),
)


In [None]:
# Fit on the oversampled training data, passing the test split as eval_set
# with verbose output enabled.
# NOTE(review): `xgb_mod` is not defined in the cells visible here (they define
# `xgb_model`) — confirm it is created earlier in the notebook.
xgb_mod.fit(
    X_resampled,
    y_resampled,
    eval_set=[(X_test, y_test)],
    verbose=True,
)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
# Positive-class probabilities and hard class labels on the test set
y_pred_proba = xgb_mod.predict_proba(X_test)[:, 1]
y_pred = xgb_mod.predict(X_test)

# Compute the scalar and matrix summaries up front so they stay available
# as notebook variables
auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

# Report: each header and its payload are printed on separate lines
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print(f"AUC Score: {auc}")
print("Confusion Matrix:", conf_matrix, sep="\n")