In [30]:
!pip install xgboost imblearn



In [31]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import joblib
from io import StringIO
from xgboost import XGBClassifier
from sagemaker import get_execution_role
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, precision_recall_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler


# Load the dataset from S3
s3 = boto3.client('s3')
bucket = 'nimbleai-whizlabs'
object_key = 'fraudTrain.csv'

csv_obj =s3.get_object(Bucket=bucket, Key=object_key)
csv_string = csv_obj['Body'].read().decode('utf-8')
fraud_df = pd.read_csv(StringIO(csv_string))



In [32]:
# Convert date columns
fraud_df['trans_date_trans_time'] = pd.to_datetime(fraud_df['trans_date_trans_time'])
fraud_df['dob'] = pd.to_datetime(fraud_df['dob'])

# Feature engineering
fraud_df['trans_hour'] = fraud_df['trans_date_trans_time'].dt.hour
fraud_df['trans_day_of_week'] = fraud_df['trans_date_trans_time'].dt.dayofweek + 1
fraud_df['age'] = ((fraud_df['trans_date_trans_time'] - fraud_df['dob']).dt.days / 365.25).astype(int)

# Age categories
custom_bins = [13, 19, 32, 42, 50, 62, float('inf')]
custom_labels = ['Teenagers', 'Young Adults', 'Adults', 'Middle-aged', 'Seniors', 'Retired']
fraud_df['age_category'] = pd.cut(fraud_df['age'], bins=custom_bins, labels=custom_labels, right=False)

# Distance to merchant
fraud_df['distance_to_merchant'] = np.sqrt(
    (fraud_df['lat'] - fraud_df['merch_lat'])**2 + (fraud_df['long'] - fraud_df['merch_long'])**2
)

# Transactions per capita
fraud_df['transactions_per_capita'] = fraud_df.groupby('city_pop')['trans_num'].transform('count') / fraud_df['city_pop']

# One-hot encoding categorical variables
categorical_columns = ['category', 'gender', 'age_category']
fraud_df = pd.get_dummies(fraud_df, columns=categorical_columns, drop_first=True)

fraud_df.head()

Unnamed: 0,Index,trans_date_trans_time,cc_num,merchant,amt,first,last,street,city,state,...,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,age_category_Young Adults,age_category_Adults,age_category_Middle-aged,age_category_Seniors,age_category_Retired
0,0,2019-01-01 00:00:00,2703190000000000.0,"fraud_Rippin, Kub and Mann",4.97,Jennifer,Banks,561 Perry Cove,Moravian Falls,NC,...,0,0,0,0,0,1,0,0,0,0
1,1,2019-01-01 00:00:00,630423000000.0,"fraud_Heller, Gutmann and Zieme",107.23,Stephanie,Gill,43039 Riley Greens Suite 393,Orient,WA,...,0,0,0,0,0,0,1,0,0,0
2,2,2019-01-01 00:00:00,38859500000000.0,fraud_Lind-Buckridge,220.11,Edward,Sanchez,594 White Dale Suite 530,Malad City,ID,...,0,0,0,0,1,0,0,0,1,0
3,3,2019-01-01 00:01:00,3534090000000000.0,"fraud_Kutch, Hermiston and Farrell",45.0,Jeremy,White,9443 Cynthia Court Apt. 038,Boulder,MT,...,0,0,0,0,1,0,0,0,1,0
4,4,2019-01-01 00:03:00,375534000000000.0,fraud_Keeling-Crist,41.96,Tyler,Garcia,408 Bradley Rest,Doe Hill,VA,...,0,0,0,0,1,0,1,0,0,0


In [33]:
# Select features
available_columns = fraud_df.columns
selected_features = [
    'amt', 'city_pop', 'trans_hour', 'trans_day_of_week', 'age',
    'distance_to_merchant', 'transactions_per_capita'
]
selected_features += [col for col in available_columns if col.startswith('category_')]
selected_features += [col for col in available_columns if col.startswith('gender_')]
selected_features += [col for col in available_columns if col.startswith('age_category_')]


print(selected_features)


['amt', 'city_pop', 'trans_hour', 'trans_day_of_week', 'age', 'distance_to_merchant', 'transactions_per_capita', 'category_food_dining', 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel', 'gender_M', 'age_category_Young Adults', 'age_category_Adults', 'age_category_Middle-aged', 'age_category_Seniors', 'age_category_Retired']


In [34]:
X = fraud_df[selected_features]
y = fraud_df['is_fraud']

# Scale numerical features
scaler = StandardScaler()
X[selected_features[:7]] = scaler.fit_transform(X[selected_features[:7]])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.2, random_state=42, k_neighbors=5)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Compute class weight
class_ratio = (y_res == 0).sum() / (y_res == 1).sum()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[selected_features[:7]] = scaler.fit_transform(X[selected_features[:7]])


In [35]:
from sklearn.datasets import dump_svmlight_file

# Save LIBSVM formatted training & validation data
dump_svmlight_file(X_res, y_res, "train_data.libsvm", zero_based=True)
dump_svmlight_file(X_test, y_test, "valid_data.libsvm", zero_based=True)

# Upload LIBSVM data to S3
s3_train_path = sagemaker_session.upload_data("train_data.libsvm", bucket=bucket, key_prefix=f"{prefix}/train")
s3_valid_path = sagemaker_session.upload_data("valid_data.libsvm", bucket=bucket, key_prefix=f"{prefix}/validation")


In [39]:
from sagemaker.estimator import Estimator

# Define SageMaker built-in XGBoost container
xgb_image = sagemaker.image_uris.retrieve("xgboost", region=sagemaker_session.boto_region_name, version="1.5-1")

# Define XGBoost estimator
xgb_estimator = Estimator(
    image_uri=xgb_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",  
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=sagemaker_session
)

print("Estimator initiated successfully!")

# Set hyperparameters
xgb_estimator.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="logloss",
    num_round=300,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.2,
    scale_pos_weight=class_ratio
)


print("hyperparameters are set successfully!")

# Train the model
xgb_estimator.fit({"train": s3_train_path, "validation": s3_valid_path})


print("Model Trained successfully!")


Estimator initiated successfully!
hyperparameters are set successfully!


2025-01-30 05:42:35 Starting - Starting the training job...
2025-01-30 05:42:50 Starting - Preparing the instances for training...
2025-01-30 05:43:15 Downloading - Downloading input data...
2025-01-30 05:44:06 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-01-30 05:44:57.293 ip-10-0-229-203.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-01-30 05:44:57.314 ip-10-0-229-203.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-01-30:05:44:57:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-01-30:05:44:57:INFO] Failed to parse hyperparameter eval_metric value logloss to Json.[0m
[34mReturning the value itself[0m
[34m[2025-01-30:05:44:57:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-01-30:05:44:57:INFO] No GPUs detected (normal if no gpus 

In [40]:

# Deploy the trained model
xgb_predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name="fraud-detection-model-endpoint"
)

print("Model deployed successfully!")

------!Model deployed successfully!


In [63]:
# Test Endpoint
import json
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

predictor = Predictor(
    endpoint_name='fraud-detection-model-endpoint',
    sagemaker_session=sagemaker_session,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)

# Sample Prediction
sample_data = [281.06, 885, 1, 3, 30, 0.699298, 1.819209, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1 ,0, 0, 0, 0]
print(sample_data)
response = predictor.predict(sample_data)
print(f"Prediction Response: {response}")

score = response['predictions'][0].get('score')
probability = 1 / (1 + np.exp(-score))

print(f"Probability of positive class (fraud): {probability}")

[281.06, 885, 1, 3, 30, 0.699298, 1.819209, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]
Prediction Response: {'predictions': [{'score': 0.005901785101741552}]}
Probability of positive class (fraud): 0.5014754419928362


In [None]:
['amt', 'city_pop', 'trans_hour', 'trans_day_of_week', 'age', 'distance_to_merchant', 'transactions_per_capita', 'category_food_dining', 'category_gas_transport', 'category_grocery_net', 'category_grocery_pos', 'category_health_fitness', 'category_home', 'category_kids_pets', 'category_misc_net', 'category_misc_pos', 'category_personal_care', 'category_shopping_net', 'category_shopping_pos', 'category_travel', 'gender_M', 'age_category_Young Adults', 'age_category_Adults', 'age_category_Middle-aged', 'age_category_Seniors', 'age_category_Retired']

In [62]:
pd.set_option('display.max_columns', None)

fraud_df[fraud_df['is_fraud'] == 1].head()


Unnamed: 0,Index,trans_date_trans_time,cc_num,merchant,amt,first,last,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_hour,trans_day_of_week,age,distance_to_merchant,transactions_per_capita,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,age_category_Young Adults,age_category_Adults,age_category_Middle-aged,age_category_Seniors,age_category_Retired
2449,2449,2019-01-02 01:06:00,4613310000000.0,fraud_Rutherford-Mertz,281.06,Jason,Murphy,542 Steve Curve Suite 011,Collettsville,NC,28611,35.9946,-81.7266,885,Soil scientist,1988-09-15,e8a81877ae9a0a7f883e15cb39dc4022,1325466397,36.430124,-81.179483,1,1,3,30,0.699298,1.819209,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2472,2472,2019-01-02 01:47:00,340187000000000.0,"fraud_Jenkins, Hauck and Friesen",11.52,Misty,Hart,27954 Hall Mill Suite 575,San Antonio,TX,78208,29.44,-98.459,1595797,Horticultural consultant,1960-10-28,bc7d41c41103877b03232f03f1f8d3f5,1325468849,29.819364,-99.142791,1,1,3,58,0.781976,0.002602,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2523,2523,2019-01-02 03:05:00,340187000000000.0,fraud_Goodwin-Nitzsche,276.31,Misty,Hart,27954 Hall Mill Suite 575,San Antonio,TX,78208,29.44,-98.459,1595797,Horticultural consultant,1960-10-28,b98f12f4168391b2203238813df5aa8c,1325473523,29.273085,-98.83636,1,3,3,58,0.412627,0.002602,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2546,2546,2019-01-02 03:38:00,4613310000000.0,fraud_Erdman-Kertzmann,7.03,Jason,Murphy,542 Steve Curve Suite 011,Collettsville,NC,28611,35.9946,-81.7266,885,Soil scientist,1988-09-15,397894a5c4c02e3c61c784001f0f14e4,1325475483,35.909292,-82.09101,1,3,3,30,0.374262,1.819209,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
2553,2553,2019-01-02 03:55:00,340187000000000.0,fraud_Koepp-Parker,275.73,Misty,Hart,27954 Hall Mill Suite 575,San Antonio,TX,78208,29.44,-98.459,1595797,Horticultural consultant,1960-10-28,7863235a750d73a244c07f1fb7f0185a,1325476547,29.786426,-98.68341,1,3,3,58,0.41276,0.002602,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
