In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import mlflow
import mlflow.sklearn

In [3]:
# Load datasets
fraud_data = pd.read_csv('../data/Fraud_Data.csv')
ip_data = pd.read_csv('../data/IpAddress_to_Country.csv')
credit_card_data = pd.read_csv('../data/creditcard.csv')

In [4]:
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [6]:
# Ensure all IP addresses are strings and handle missing values
fraud_data['ip_address'] = fraud_data['ip_address'].astype(str)

# Apply IP conversion safely
fraud_data['ip_int'] = fraud_data['ip_address'].apply(lambda x: int(x.split('.')[-1]) if '.' in x else np.nan)

# Drop rows with NaN values in ip_int if necessary
fraud_data.dropna(subset=['ip_int'], inplace=True)

# Convert ip_int to integer type
fraud_data['ip_int'] = fraud_data['ip_int'].astype(int)


In [7]:
# Feature Engineering
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])
fraud_data['time_diff'] = (fraud_data['purchase_time'] - fraud_data['signup_time']).dt.total_seconds()
fraud_data['hour_of_day'] = fraud_data['purchase_time'].dt.hour
fraud_data['day_of_week'] = fraud_data['purchase_time'].dt.dayofweek

In [8]:

# Encode categorical features
encoder = LabelEncoder()
fraud_data['source'] = encoder.fit_transform(fraud_data['source'])
fraud_data['browser'] = encoder.fit_transform(fraud_data['browser'])
fraud_data['sex'] = encoder.fit_transform(fraud_data['sex'])

In [10]:
# Drop unnecessary columns
fraud_data = fraud_data.drop(['user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address', 'ip_int'], axis=1)

# Prepare Credit Card Data
X_credit = credit_card_data.drop(columns=['Class'])
y_credit = credit_card_data['Class']
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)

In [11]:

# Prepare E-Commerce Data
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

In [12]:
# Standardize Data
scaler = StandardScaler()
X_credit_train = scaler.fit_transform(X_credit_train)
X_credit_test = scaler.transform(X_credit_test)
X_fraud_train = scaler.fit_transform(X_fraud_train)
X_fraud_test = scaler.transform(X_fraud_test)

In [13]:
# Model Training and Evaluation
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    with mlflow.start_run():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.sklearn.log_model(model, model_name)
        print(f"{model_name} - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}")
        return model

In [14]:
# Train Models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP Classifier": MLPClassifier(max_iter=300)
}


In [None]:


for name, model in models.items():
    print(f"Training {name} on credit card data...")
    train_and_evaluate(model, X_credit_train, X_credit_test, y_credit_train, y_credit_test, f"{name}_Credit")
    print(f"Training {name} on fraud data...")
    train_and_evaluate(model, X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, f"{name}_Fraud")


Training Logistic Regression on credit card data...




Logistic Regression_Credit - Accuracy: 0.9991, ROC AUC: 0.7805
Training Logistic Regression on fraud data...




Logistic Regression_Fraud - Accuracy: 0.9057, ROC AUC: 0.5000
Training Decision Tree on credit card data...




Decision Tree_Credit - Accuracy: 0.9991, ROC AUC: 0.9079
Training Decision Tree on fraud data...




Decision Tree_Fraud - Accuracy: 0.9075, ROC AUC: 0.7575
Training Random Forest on credit card data...
