In [None]:
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the previously imputed dataset and preview its first five rows.
df = pd.read_csv('final_df_imputed.csv')
df.head()

In [None]:
# Partition the column labels by dtype: object columns are treated as
# categorical, 64-bit int/float columns as numerical.
categorical_cols = df.select_dtypes(include='object').columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [None]:
file = 'final_df_imputed.csv'
# file_name = os.path.join(file_path, 'final_df_imputed.csv')

# Single source of truth for the cache location: the existence check, the read
# and the write all use this path. On Colab, point it at file_name instead.
imputed_path = file

if os.path.exists(imputed_path):
    # Cached result available: skip the (slow) imputation entirely.
    final_df = pd.read_csv(imputed_path)
else:
    # Imported here because this is the only place the imputer is needed.
    from sklearn.impute import KNNImputer

    # NOTE(review): this branch expects `final_df` to already hold the
    # un-imputed frame from an upstream cell that is not visible here - confirm.

    # Apply KNN imputation for numerical features
    imputer = KNNImputer(n_neighbors=5)
    final_df[numerical_cols] = imputer.fit_transform(final_df[numerical_cols])

    # Apply mode imputation for categorical features
    for col in categorical_cols:
        col_modes = final_df[col].mode()
        # mode() is empty for an all-missing column; leave such a column as is
        # instead of failing on the [0] lookup.
        if not col_modes.empty:
            final_df[col] = final_df[col].fillna(col_modes.iloc[0])

    # Check if there are any remaining missing values
    print(final_df.isnull().sum().sum())

    # Persist the result so later runs take the fast path above.
    final_df.to_csv(imputed_path, index=False)

In [None]:
# Display pandas' default summary statistics for the imputed frame.
final_df.describe()

In [None]:
# Drop the TransactionID column before modelling. errors='ignore' keeps the
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
final_df = final_df.drop(columns='TransactionID', errors='ignore')

# Split the rows by the isFraud label for the class-level summaries below.
fraud = final_df[final_df["isFraud"] == 1]
notfraud = final_df[final_df["isFraud"] == 0]

In [None]:
# Fraction of all rows that fall in the `fraud` subset.
len(fraud['dist1']) / len(final_df)

In [None]:
# Fraction of all rows that fall in the `notfraud` subset.
len(notfraud['dist1']) / len(final_df)

In [None]:
# Cutoff for flagging a row as high-risk: one standard deviation above the
# mean `dist1` of the fraud subset.
fraud_dist1 = fraud['dist1']
threshold = fraud_dist1.mean() + fraud_dist1.std()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regression target is `dist1`; every other column is a candidate feature.
X = final_df.drop(['dist1'], axis=1)
y = final_df['dist1']

# `isFraud` is excluded from scaling so it stays a 0/1 label that can be
# averaged directly later on.
X_numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.drop('isFraud')
# Derived from X (not final_df) so the list can only name columns X actually has.
X_categorical_cols = X.select_dtypes(include=['object']).columns

# One-hot encode the categorical features, dropping the first level of each.
X = pd.get_dummies(X, columns=X_categorical_cols, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows; fitting on the full frame would leak test-set information into
# training. Explicit copies avoid writing into views of X.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = StandardScaler()
X_train[X_numerical_cols] = scaler.fit_transform(X_train[X_numerical_cols])
X_test[X_numerical_cols] = scaler.transform(X_test[X_numerical_cols])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline model: ordinary least squares on the full feature set.
lr_model = LinearRegression().fit(X_train, y_train)

# Score on the held-out split and report the root-mean-squared error.
y_pred = lr_model.predict(X_test)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Linear Regression RMSE: {rmse_lr}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # `sns` was used below without ever being imported

# Overlay the density of the actual targets (red) and of the linear-regression
# predictions (blue). kdeplot replaces the deprecated distplot(hist=False).
ax1 = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=y_pred, color="b", label="Fitted Values", ax=ax1)
ax1.set_title("Actual vs Fitted Value Distribution (Linear Regression)")
plt.legend(["Actual Value", "Fitted Values"], loc="upper right")
plt.show()

In [None]:
fig, ax1 = plt.subplots()

# Actual targets (blue squares) and predictions (red circles), both plotted
# against the row's position in the test split.
ax1.scatter(range(len(y_test)), y_test, s=10, c='b', marker="s", label='Actual Value')
ax1.scatter(range(len(y_pred)), y_pred, s=10, c='r', marker="o", label='Predicted Value')
ax1.set_title('Actual Values vs Predicted Values from Linear Regression')
ax1.legend(loc="upper right")
plt.show()

In [None]:
from sklearn.feature_selection import RFE

def rfe_evaluation(n, X_test, X_train, y_train, y_true=None):
    """Select `n` features with RFE and score a linear model on them.

    Parameters
    ----------
    n : int
        Number of features RFE should keep.
    X_test, X_train : DataFrame
        Feature matrices; the selected column labels are taken from
        `X_train.columns` and must also exist in `X_test`.
    y_train : array-like
        Training targets.
    y_true : array-like, optional
        Targets matching `X_test`. When omitted, the notebook-level `y_test`
        is used, which is what the original implementation always did.

    Returns
    -------
    list
        `[number of selected features, selected column labels, test RMSE]`.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook global.
        y_true = y_test

    model = LinearRegression()

    # Recursive feature elimination driven by the linear model's coefficients.
    rfe = RFE(model, n_features_to_select=n)
    fit = rfe.fit(X_train, y_train)

    # Refit a plain linear model on just the surviving columns and score it.
    selected_features = X_train.columns[fit.support_]
    model.fit(X_train[selected_features], y_train)
    y_pred = model.predict(X_test[selected_features])
    rmse_lr = np.sqrt(mean_squared_error(y_true, y_pred))
    return [fit.n_features_, selected_features, rmse_lr]

In [None]:
# Sweep the number of retained features in steps of 10 (10, 20, ... below the
# total column count) and collect one evaluation result per setting.
rfe_selected = [
    rfe_evaluation(n_keep, X_test, X_train, y_train)
    for n_keep in range(10, X.shape[1], 10)
]

In [None]:
print("   Evaluation from different numbers of selected features:")
print("----------------------------------------------------------------")

# Each entry of rfe_selected is [n_features, selected_columns, rmse]; pull out
# the feature counts and RMSEs as parallel lists for reporting and plotting.
x_rfe = [result[0] for result in rfe_selected]
y_rfe = [result[2] for result in rfe_selected]
for n_selected, rmse_selected in zip(x_rfe, y_rfe):
    print(f"Num of selected features:{n_selected}   Linear RMSE: {rmse_selected}")

In [None]:
# RMSE as a function of how many features RFE kept.
fig, ax = plt.subplots()
ax.scatter(x_rfe, y_rfe)
ax.set_xlabel("Number of Features Selected")
# Label fixed: the metric is RMSE (was misspelled "RSME").
ax.set_ylabel("RMSE")
ax.set_title("Number of Features Selected vs RMSE")
plt.show()

In [None]:
import xgboost as xgb

# XGBoost regressor with the library's default hyperparameters.
model_xgb = xgb.XGBRegressor()
model_xgb.fit(X_train, y_train)

# Held-out RMSE for comparison with the other models.
y_pred_xgb = model_xgb.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f'XGBoost RMSE: {rmse_xgb}')

In [None]:
fig, ax1 = plt.subplots()

# Actual targets (blue squares) and default-XGBoost predictions (red circles),
# both plotted against the row's position in the test split.
ax1.scatter(range(len(y_test)), y_test, s=10, c='b', marker="s", label='Actual Value')
ax1.scatter(range(len(y_pred_xgb)), y_pred_xgb, s=10, c='r', marker="o", label='Predicted Value')
ax1.set_title('Actual Values vs Predicted Values from XGB Regressor')
ax1.legend(loc="upper right")
plt.show()

In [None]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
def xgb_evaluate(max_depth, learning_rate, n_estimators, subsample, colsample_bytree):
    """Objective for Bayesian optimization: negative 3-fold CV RMSE.

    The optimizer proposes every hyperparameter as a float, so `max_depth`
    and `n_estimators` are truncated to int before building the model. The
    model is scored with 3-fold cross-validation on the notebook-level
    `X_train` / `y_train`.

    Returns
    -------
    float
        Minus the RMSE derived from the mean cross-validated MSE. The sign is
        negative because the optimizer maximizes this function; returning the
        positive RMSE (as before) made it search for the worst-fitting
        hyperparameters.
    """
    params = {
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'objective': 'reg:squarederror'
    }
    model = xgb.XGBRegressor(**params)
    # scoring returns negated MSE per fold; flip the sign back before the root.
    cv_result = cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-cv_result.mean())
    return -rmse

from sklearn.metrics import mean_squared_error

# Search space: (lower, upper) bound for each tuned hyperparameter.
param_bounds = dict(
    max_depth=(3, 10),
    learning_rate=(0.01, 0.3),
    n_estimators=(100, 1000),
    subsample=(0.6, 1.0),
    colsample_bytree=(0.6, 1.0),
)

# Seeded optimizer over xgb_evaluate, run with init_points=10 and n_iter=10.
optimizer = BayesianOptimization(f=xgb_evaluate, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=10, n_iter=10)

print("Best Parameters: ", optimizer.max)

# The optimizer reports floats; depth and tree count must be integers.
best_params = optimizer.max['params']
for int_param in ('max_depth', 'n_estimators'):
    best_params[int_param] = int(best_params[int_param])

# Refit on the whole training split with the selected hyperparameters.
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Held-out RMSE of the tuned model.
y_pred_xgbbo = final_model.predict(X_test)
rmse_xgbbo = np.sqrt(mean_squared_error(y_test, y_pred_xgbbo))
print(f'XGBoost(BayesianOptimization) RMSE: {rmse_xgbbo}')

In [None]:
fig, ax1 = plt.subplots()

# Actual targets (blue squares) and Bayesian-tuned XGBoost predictions (red
# circles), both plotted against the row's position in the test split.
ax1.scatter(range(len(y_test)), y_test, s=10, c='b', marker="s", label='Actual Value')
ax1.scatter(range(len(y_pred_xgbbo)), y_pred_xgbbo, s=10, c='r', marker="o", label='Predicted Value')
ax1.set_title('Actual Value vs Predicted Value from XGB Regressor (BayesianOptimization)')
ax1.legend(loc="upper right")
plt.show()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimator whose hyperparameters are sampled by the search.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Candidate values per hyperparameter.
param_dist = dict(
    n_estimators=[1000, 2000, 3000],
    learning_rate=[0.01, 0.1, 0.3],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)

# Seeded random search: 10 sampled settings, 3-fold CV, scored by negated MSE.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Estimator for the best-scoring setting, as exposed by the search object.
best_xgb_model = random_search.best_estimator_

# Held-out RMSE of the selected model.
y_pred_xgbr = best_xgb_model.predict(X_test)
rmse_xgbr = np.sqrt(mean_squared_error(y_test, y_pred_xgbr))
print(f'Final XGBoost(RandomizedSearchCV) RMSE: {rmse_xgbr}')

In [None]:
fig, ax1 = plt.subplots()

# Actual targets (blue squares) and random-search XGBoost predictions (red
# circles), both plotted against the row's position in the test split.
ax1.scatter(range(len(y_test)), y_test, s=10, c='b', marker="s", label='Actual Value')
ax1.scatter(range(len(y_pred_xgbr)), y_pred_xgbr, s=10, c='r', marker="o", label='Predicted Value')
ax1.set_title('Actual Values vs Predicted Values from XGB Regressor (RandomizedSearchCV)')
ax1.legend(loc="upper right")
plt.show()

In [None]:
def highrisk(X, y, cutoff=None):
    """Return the fraud rate among rows whose `dist1` exceeds a cutoff.

    Parameters
    ----------
    X : DataFrame
        Must contain an `isFraud` column.
    y : Series or DataFrame
        Supplies the `dist1` column (actual or predicted values); it is
        joined to `X` on the index.
    cutoff : float, optional
        Rows with `dist1 > cutoff` form the high-risk group. When omitted,
        the notebook-level `threshold` is used, as in the original version.

    Returns
    -------
    float
        Mean of `isFraud` over the high-risk group; NaN when no row exceeds
        the cutoff.
    """
    if cutoff is None:
        # Backward-compatible fallback to the notebook global.
        cutoff = threshold

    joined = X.join(y)
    # Vectorized comparison replaces the row-by-row apply(); NaN compares
    # False either way, so missing distances are never flagged.
    is_high_risk = joined['dist1'] > cutoff
    return joined.loc[is_high_risk, 'isFraud'].mean()


In [None]:
print('Proportion of fraudulent transactions in high-risk group:')
print("-----------------------------------------------------------------")

# Reference rates computed from the actual dist1 values.
print(f"Entire data set: {highrisk(X, y)}")
print(f"Test set: {highrisk(X_test, y_test)}")

# Same rate, but with each model's predicted dist1 deciding the high-risk group.
# (Label typo "XGBBoost" corrected to "XGBoost".)
predictions_by_model = {
    'Linear Regression': y_pred,
    'XGBoost': y_pred_xgb,
    'XGBoost(BayesianOptimization)': y_pred_xgbbo,
    'XGBoost(RandomizedSearchCV)': y_pred_xgbr,
}
for model_name, predicted in predictions_by_model.items():
    # Re-attach the test index so the predictions join to X_test row by row.
    predicted_dist1 = pd.DataFrame(predicted, index=X_test.index, columns=["dist1"])
    print(f'{model_name}: {highrisk(X_test, predicted_dist1)}')

In [None]:
print('RMSE Based on Model:')
print("-----------------------------------------------------------------")

# One line per model, in the order the models were trained above.
# (Label typo "XGBBoost" corrected to "XGBoost".)
rmse_by_model = {
    'Linear Regression': rmse_lr,
    'XGBoost': rmse_xgb,
    'XGBoost(BayesianOptimization)': rmse_xgbbo,
    'XGBoost(RandomizedSearchCV)': rmse_xgbr,
}
for model_name, model_rmse in rmse_by_model.items():
    print(f'{model_name}: {model_rmse}')