In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import shutil

# Define paths
dataset_path = "/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv"
working_path = "/kaggle/working/PS_20174392719_1491204439457_log.csv"

# Copy dataset to working directory
shutil.copy(dataset_path, working_path)
print(f"✅ Dataset copied to: {working_path}")

# Load dataset
df = pd.read_csv(dataset_path)

# Display basic info
print("✅ Dataset Loaded Successfully!")
print(df.info())  # Check data types and missing values
print(df.head())  # Display first few rows


✅ Dataset copied to: /kaggle/working/PS_20174392719_1491204439457_log.csv
✅ Dataset Loaded Successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C84008

In [2]:
# Check for missing values
missing_values = df.isnull().sum()
print("📌 Missing Values:\n", missing_values)

# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"📌 Duplicate Rows: {duplicates}")

# Summary statistics
print("📌 Summary Statistics:\n", df.describe())


📌 Missing Values:
 step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
📌 Duplicate Rows: 0
📌 Summary Statistics:
                step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04   0.000000e+00    0.000000e+00   
50%    2.390000e+02  7.487194e+04   1.420800e+04    0.000000e+00   
75%    3.350000e+02  2.087215e+05   1.073152e+05    1.442584e+05   
max    7.430000e+02  9.244552e+07   5.958504e+07    4.958504e+07   

       oldbalanceDest  newbalanceDest       isFraud  isFlaggedFraud  
count    6.362620e+06 

In [3]:
# Encode 'type' column
df['type_encoded'] = df['type'].astype('category').cat.codes

# Create 'isMerchant' feature (1 if destination starts with 'M', else 0)
df['isMerchant'] = df['nameDest'].apply(lambda x: 1 if x.startswith('M') else 0)

# Create 'amount_ratio' (to capture abnormal transaction behavior)
df['amount_ratio'] = df['amount'] / (df['amount'].max() + 1)

# Drop unnecessary columns
df = df.drop(columns=['nameOrig', 'nameDest', 'type', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'])

print("✅ Feature Engineering Complete! Here's the new dataset structure:")
print(df.head())


✅ Feature Engineering Complete! Here's the new dataset structure:
   step    amount  isFraud  isFlaggedFraud  type_encoded  isMerchant  \
0     1   9839.64        0               0             3           1   
1     1   1864.28        0               0             3           1   
2     1    181.00        1               0             4           0   
3     1    181.00        1               0             1           0   
4     1  11668.14        0               0             3           1   

   amount_ratio  
0      0.000106  
1      0.000020  
2      0.000002  
3      0.000002  
4      0.000126  


In [4]:
cleaned_path = "/kaggle/working/cleaned_fraud_data.csv"
df.to_csv(cleaned_path, index=False)
print(f"✅ Cleaned dataset saved at: {cleaned_path}")


✅ Cleaned dataset saved at: /kaggle/working/cleaned_fraud_data.csv


In [5]:
from sklearn.model_selection import train_test_split

# Load the cleaned dataset
cleaned_path = "/kaggle/working/cleaned_fraud_data.csv"
df = pd.read_csv(cleaned_path)

# Define features & target
X = df.drop(columns=["isFraud"])  # Features
y = df["isFraud"]  # Target

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Save train/test sets for later use
X_train.to_csv("/kaggle/working/X_train.csv", index=False)
X_test.to_csv("/kaggle/working/X_test.csv", index=False)
y_train.to_csv("/kaggle/working/y_train.csv", index=False)
y_test.to_csv("/kaggle/working/y_test.csv", index=False)

print("✅ Data split complete! Training and testing sets saved.")


✅ Data split complete! Training and testing sets saved.


In [6]:
import xgboost as xgb
import joblib

# Initialize model
model = xgb.XGBClassifier(
    n_estimators=200, 
    max_depth=6, 
    learning_rate=0.1, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    use_label_encoder=False, 
    eval_metric="logloss"
)

# Train model
print("🚀 Training the XGBoost model...")
model.fit(X_train, y_train)
print("✅ Model training complete!")

# Save model
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"
joblib.dump(model, model_path)
print(f"✅ Model saved at: {model_path}")


🚀 Training the XGBoost model...
✅ Model training complete!
✅ Model saved at: /kaggle/working/fraud_detection_xgboost.pkl


In [7]:
from sklearn.metrics import classification_report, accuracy_score

# Load test data
X_test = pd.read_csv("/kaggle/working/X_test.csv")
y_test = pd.read_csv("/kaggle/working/y_test.csv")

# Make predictions
y_pred = model.predict(X_test)

# Print evaluation metrics
print("📌 Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))


📌 Model Performance:
Accuracy: 0.9989
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.89      0.18      0.30      1643

    accuracy                           1.00   1272524
   macro avg       0.94      0.59      0.65   1272524
weighted avg       1.00      1.00      1.00   1272524



In [8]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load cleaned data
df = pd.read_csv("/kaggle/working/cleaned_fraud_data.csv")

# Split features and labels
X = df.drop(columns=['isFraud'])  # Features
y = df['isFraud']  # Target

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Calculate scale_pos_weight
non_fraud = y_train.value_counts()[0]
fraud = y_train.value_counts()[1]
scale_pos_weight = non_fraud / fraud  # Balance fraud cases

# Train XGBoost with scale_pos_weight
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    scale_pos_weight=scale_pos_weight,  # Handles imbalance
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate Performance
print("📌 Model Performance with `scale_pos_weight`")
print(classification_report(y_test, y_pred))

# Save the updated model
import joblib
joblib.dump(model, "/kaggle/working/fraud_detection_xgboost.pkl")
print("✅ Updated model saved!")


📌 Model Performance with `scale_pos_weight`
              precision    recall  f1-score   support

           0       1.00      0.94      0.97   1270881
           1       0.02      0.88      0.04      1643

    accuracy                           0.94   1272524
   macro avg       0.51      0.91      0.50   1272524
weighted avg       1.00      0.94      0.97   1272524

✅ Updated model saved!


In [9]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Get model probabilities
y_probs = model.predict_proba(X_test)[:, 1]  # Get probability of fraud

# Find best threshold using Precision-Recall tradeoff
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

best_threshold = 0.80  # Adjust this manually

# Apply new threshold
y_pred_adjusted = (y_probs > best_threshold).astype(int)

# Evaluate model performance with adjusted threshold
print(f"📌 Model Performance with Adjusted Threshold ({best_threshold})")
print(classification_report(y_test, y_pred_adjusted))


📌 Model Performance with Adjusted Threshold (0.8)
              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1270881
           1       0.04      0.76      0.08      1643

    accuracy                           0.98   1272524
   macro avg       0.52      0.87      0.53   1272524
weighted avg       1.00      0.98      0.99   1272524



In [10]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Initialize SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # Increase fraud cases to 50% of normal cases

# Apply SMOTE
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Check new class distribution
print(f"✅ Class distribution after SMOTE: {Counter(y_train_sm)}")


✅ Class distribution after SMOTE: Counter({0: 5083526, 1: 2541763})


In [11]:
# Train a new XGBoost model on the SMOTE-balanced data
model_smote = XGBClassifier(scale_pos_weight=1, random_state=42)
model_smote.fit(X_train_sm, y_train_sm)

# Save the new model
joblib.dump(model_smote, "/kaggle/working/fraud_detection_xgboost_smote.pkl")
print("✅ SMOTE model saved successfully!")


✅ SMOTE model saved successfully!


In [12]:
from sklearn.metrics import classification_report

# Make predictions
y_pred_smote = model_smote.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred_smote))


              precision    recall  f1-score   support

           0       1.00      0.97      0.98   1270881
           1       0.03      0.78      0.06      1643

    accuracy                           0.97   1272524
   macro avg       0.52      0.87      0.52   1272524
weighted avg       1.00      0.97      0.98   1272524



In [13]:
# Train XGBoost with adjusted scale_pos_weight
model_smote_tuned = XGBClassifier(scale_pos_weight=0.5, random_state=42)
model_smote_tuned.fit(X_train_sm, y_train_sm)

# Save the improved model
joblib.dump(model_smote_tuned, "/kaggle/working/fraud_detection_xgboost_tuned.pkl")
print("✅ Tuned SMOTE model saved successfully!")


✅ Tuned SMOTE model saved successfully!


In [14]:
# Make predictions
y_pred_tuned = model_smote_tuned.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred_tuned))


              precision    recall  f1-score   support

           0       1.00      0.98      0.99   1270881
           1       0.05      0.71      0.10      1643

    accuracy                           0.98   1272524
   macro avg       0.53      0.85      0.54   1272524
weighted avg       1.00      0.98      0.99   1272524



In [15]:
# Train XGBoost with new scale_pos_weight
model_tuned = XGBClassifier(scale_pos_weight=0.3, random_state=42)
model_tuned.fit(X_train_sm, y_train_sm)

# Save the improved model
joblib.dump(model_tuned, "/kaggle/working/fraud_detection_xgboost_final.pkl")
print("✅ Further tuned model saved successfully!")


✅ Further tuned model saved successfully!


In [16]:
# Predict probabilities
y_probs = model_tuned.predict_proba(X_test)[:, 1]  # Get fraud probabilities

# Set a higher threshold (e.g., 0.7) for flagging fraud
y_pred_thresholded = (y_probs > 0.7).astype(int)

# Evaluate the new predictions
print(classification_report(y_test, y_pred_thresholded))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.17      0.56      0.26      1643

    accuracy                           1.00   1272524
   macro avg       0.58      0.78      0.63   1272524
weighted avg       1.00      1.00      1.00   1272524



In [17]:
# Predict probabilities
y_probs = model_tuned.predict_proba(X_test)[:, 1]  # Get fraud probabilities

# Set an even higher threshold for fraud detection
y_pred_thresholded = (y_probs > 0.85).astype(int)

# Evaluate again
print(classification_report(y_test, y_pred_thresholded))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.39      0.44      0.41      1643

    accuracy                           1.00   1272524
   macro avg       0.69      0.72      0.71   1272524
weighted avg       1.00      1.00      1.00   1272524



In [18]:
# Adjusting threshold to improve fraud precision
y_probs = model_tuned.predict_proba(X_test)[:, 1]  
y_pred_thresholded = (y_probs > 0.9).astype(int)  # Adjust threshold

# Evaluate the model again
print(classification_report(y_test, y_pred_thresholded))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.53      0.38      0.44      1643

    accuracy                           1.00   1272524
   macro avg       0.76      0.69      0.72   1272524
weighted avg       1.00      1.00      1.00   1272524



In [19]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the trained model
MODEL_PATH = "/kaggle/working/fraud_detection_xgboost.pkl"

try:
    model = joblib.load(MODEL_PATH)
    print("✅ Model Loaded Successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")

# Load test data
X_test_path = "/kaggle/working/X_test.csv"
y_test_path = "/kaggle/working/y_test.csv"

try:
    X_test = pd.read_csv(X_test_path)
    y_test = pd.read_csv(y_test_path)
    print("✅ Test Data Loaded Successfully!")
except Exception as e:
    print(f"❌ Error loading test data: {e}")

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"📌 Accuracy: {accuracy:.4f}")

print("\n📌 Classification Report:")
print(classification_report(y_test, y_pred))

print("\n📌 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


✅ Model Loaded Successfully!
✅ Test Data Loaded Successfully!
📌 Accuracy: 0.9407

📌 Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97   1270881
           1       0.02      0.88      0.04      1643

    accuracy                           0.94   1272524
   macro avg       0.51      0.91      0.50   1272524
weighted avg       1.00      0.94      0.97   1272524


📌 Confusion Matrix:
[[1195639   75242]
 [    203    1440]]


In [20]:
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Get fraud probability scores
threshold = 0.3  # Adjust this value between 0.2 - 0.4
y_pred = (y_pred_proba > threshold).astype(int)  # Apply new threshold

# Evaluate new performance
print(classification_report(y_test, y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


              precision    recall  f1-score   support

           0       1.00      0.89      0.94   1270881
           1       0.01      0.93      0.02      1643

    accuracy                           0.89   1272524
   macro avg       0.51      0.91      0.48   1272524
weighted avg       1.00      0.89      0.94   1272524

[[1134385  136496]
 [    118    1525]]


In [21]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(sampling_strategy=0.5, random_state=42)  # 50% fraud cases
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Check new class distribution
from collections import Counter
print("✅ Resampled class distribution:", Counter(y_resampled))


✅ Resampled class distribution: Counter({0: 4855876, 1: 2389405})


In [22]:
from xgboost import XGBClassifier

# Define the XGBoost model with balanced class weight
model = XGBClassifier(
    objective="binary:logistic",
    scale_pos_weight=len(y_resampled[y_resampled == 0]) / len(y_resampled[y_resampled == 1]),  # Adjust for new balance
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42
)

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

# Save the model
import joblib
joblib.dump(model, "/kaggle/working/fraud_detection_xgboost.pkl")

print("✅ Model retrained and saved successfully!")


✅ Model retrained and saved successfully!


In [23]:
# Load test data
y_pred = model.predict(X_test)

# Print model performance
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print(f"📌 Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\n📌 Classification Report:\n", classification_report(y_test, y_pred))

print("\n📌 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


📌 Accuracy: 0.9466

📌 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97   1270881
           1       0.02      0.83      0.04      1643

    accuracy                           0.95   1272524
   macro avg       0.51      0.89      0.51   1272524
weighted avg       1.00      0.95      0.97   1272524


📌 Confusion Matrix:
[[1203179   67702]
 [    273    1370]]


In [24]:
import pandas as pd

# Load the cleaned dataset
cleaned_data_path = "/kaggle/working/cleaned_fraud_data.csv"

try:
    df = pd.read_csv(cleaned_data_path)
    print("✅ Cleaned Dataset Loaded Successfully!")
except Exception as e:
    print(f"❌ Error Loading Cleaned Dataset: {e}")
    
# Show the first few rows
df.head()


✅ Cleaned Dataset Loaded Successfully!


Unnamed: 0,step,amount,isFraud,isFlaggedFraud,type_encoded,isMerchant,amount_ratio
0,1,9839.64,0,0,3,1,0.000106
1,1,1864.28,0,0,3,1,2e-05
2,1,181.0,1,0,4,0,2e-06
3,1,181.0,1,0,1,0,2e-06
4,1,11668.14,0,0,3,1,0.000126


In [25]:
# Required features for training
required_features = ['step', 'amount', 'isFlaggedFraud', 'isMerchant', 'amount_ratio', 'type_encoded', 'isFraud']

# Ensure dataset contains the correct features
df = df[required_features]
print("✅ Feature Engineering Verified - Correct Features Retained!")


✅ Feature Engineering Verified - Correct Features Retained!


In [26]:
from sklearn.model_selection import train_test_split

# Split into features (X) and target (y)
X = df.drop(columns=['isFraud'])
y = df['isFraud']

# Split into train-test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("✅ Data Split into Training and Test Sets Successfully!")


✅ Data Split into Training and Test Sets Successfully!


In [27]:
from imblearn.combine import SMOTEENN
from collections import Counter

# Apply hybrid resampling
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Print new class distribution
print("✅ Class distribution after Hybrid Resampling:", Counter(y_resampled))


✅ Class distribution after Hybrid Resampling: Counter({1: 4955428, 0: 4841492})


In [28]:
import joblib

model_path = "/kaggle/working/fraud_detection_xgboost.pkl"

joblib.dump(model, model_path)
print(f"✅ Model Saved at: {model_path}")


✅ Model Saved at: /kaggle/working/fraud_detection_xgboost.pkl


In [29]:
import joblib

# Load the trained model
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"

try:
    model = joblib.load(model_path)
    print("✅ Model Loaded Successfully!")
except Exception as e:
    print(f"❌ Error Loading Model: {e}")


✅ Model Loaded Successfully!


In [30]:
import pandas as pd

# Load the cleaned dataset
test_data_path = "/kaggle/working/cleaned_fraud_data.csv"

try:
    df = pd.read_csv(test_data_path)
    print("✅ Test Data Loaded Successfully!")
except Exception as e:
    print(f"❌ Error Loading Test Data: {e}")
    
# Show the first few rows
df.head()


✅ Test Data Loaded Successfully!


Unnamed: 0,step,amount,isFraud,isFlaggedFraud,type_encoded,isMerchant,amount_ratio
0,1,9839.64,0,0,3,1,0.000106
1,1,1864.28,0,0,3,1,2e-05
2,1,181.0,1,0,4,0,2e-06
3,1,181.0,1,0,1,0,2e-06
4,1,11668.14,0,0,3,1,0.000126


In [31]:
# Define feature columns (same as those used in training)
feature_columns = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

# Extract test features (X) and target labels (y)
X_test = df[feature_columns]
y_test = df["isFraud"]

print("✅ Test Data Prepared for Model Evaluation!")


✅ Test Data Prepared for Model Evaluation!


In [32]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Get predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"📌 Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", report)
print("\n📌 Confusion Matrix:\n", conf_matrix)


ValueError: feature_names mismatch: ['step', 'amount', 'isFlaggedFraud', 'type_encoded', 'isMerchant', 'amount_ratio'] ['step', 'amount', 'isFlaggedFraud', 'isMerchant', 'amount_ratio', 'type_encoded']

In [33]:
# Ensure feature columns are in the exact order used during training
expected_features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

# Reorder X_test columns to match model's training order
X_test = X_test[expected_features]

# Run model prediction again
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"📌 Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", report)
print("\n📌 Confusion Matrix:\n", conf_matrix)


ValueError: feature_names mismatch: ['step', 'amount', 'isFlaggedFraud', 'type_encoded', 'isMerchant', 'amount_ratio'] ['step', 'amount', 'isFlaggedFraud', 'isMerchant', 'amount_ratio', 'type_encoded']

In [34]:
# Get feature names from trained model
expected_features = model.feature_names_in_  # Extracts the feature order model expects

# Ensure X_test follows the same column order
X_test = X_test[expected_features]  

# Run model prediction
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"📌 Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", report)
print("\n📌 Confusion Matrix:\n", conf_matrix)


📌 Accuracy: 0.9468

📌 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97   6354407
           1       0.02      0.86      0.04      8213

    accuracy                           0.95   6362620
   macro avg       0.51      0.90      0.51   6362620
weighted avg       1.00      0.95      0.97   6362620


📌 Confusion Matrix:
 [[6016755  337652]
 [   1138    7075]]


In [35]:
threshold = 0.7  # Adjust this value based on experiments
y_probs = model.predict_proba(X_test)[:, 1]  # Get fraud probabilities
y_pred_thresholded = (y_probs >= threshold).astype(int)  # Apply threshold

# Recalculate performance
accuracy = accuracy_score(y_test, y_pred_thresholded)
report = classification_report(y_test, y_pred_thresholded)
conf_matrix = confusion_matrix(y_test, y_pred_thresholded)

print(f"📌 Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", report)
print("\n📌 Confusion Matrix:\n", conf_matrix)


📌 Accuracy: 0.9720

📌 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99   6354407
           1       0.04      0.80      0.07      8213

    accuracy                           0.97   6362620
   macro avg       0.52      0.88      0.53   6362620
weighted avg       1.00      0.97      0.98   6362620


📌 Confusion Matrix:
 [[6178020  176387]
 [   1668    6545]]


In [36]:
threshold = 0.75  # Adjust this threshold upwards
y_probs = model.predict_proba(X_test)[:, 1]  
y_pred_thresholded = (y_probs >= threshold).astype(int)  

# Recalculate performance
accuracy = accuracy_score(y_test, y_pred_thresholded)
report = classification_report(y_test, y_pred_thresholded)
conf_matrix = confusion_matrix(y_test, y_pred_thresholded)

print(f"📌 Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", report)
print("\n📌 Confusion Matrix:\n", conf_matrix)


📌 Accuracy: 0.9763

📌 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99   6354407
           1       0.04      0.78      0.08      8213

    accuracy                           0.98   6362620
   macro avg       0.52      0.88      0.53   6362620
weighted avg       1.00      0.98      0.99   6362620


📌 Confusion Matrix:
 [[6205434  148973]
 [   1790    6423]]


In [37]:
# Adjust scale_pos_weight dynamically
from xgboost import XGBClassifier

# Calculate the imbalance ratio
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)  # Ratio of non-fraud to fraud cases

# Train the model with class-weighted learning
model = XGBClassifier(
    scale_pos_weight=scale_pos_weight,  
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False
)

# Fit the model
model.fit(X_train, y_train)

# Predict with the new model
y_probs = model.predict_proba(X_test)[:, 1]  # Get probability scores
y_pred_thresholded = (y_probs >= 0.75).astype(int)  # Apply the threshold

# Recalculate performance
accuracy = accuracy_score(y_test, y_pred_thresholded)
report = classification_report(y_test, y_pred_thresholded)
conf_matrix = confusion_matrix(y_test, y_pred_thresholded)

print(f"📌 Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", report)
print("\n📌 Confusion Matrix:\n", conf_matrix)


ValueError: feature_names mismatch: ['step', 'amount', 'isFlaggedFraud', 'isMerchant', 'amount_ratio', 'type_encoded'] ['step', 'amount', 'isFlaggedFraud', 'type_encoded', 'isMerchant', 'amount_ratio']

In [38]:
# Ensure test set has the same column order as training set
X_test = X_test[X_train.columns]

# Predict with the new model
y_probs = model.predict_proba(X_test)[:, 1]  # Get probability scores
y_pred_thresholded = (y_probs >= 0.75).astype(int)  # Apply the threshold

# Recalculate performance
accuracy = accuracy_score(y_test, y_pred_thresholded)
report = classification_report(y_test, y_pred_thresholded)
conf_matrix = confusion_matrix(y_test, y_pred_thresholded)

print(f"📌 Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", report)
print("\n📌 Confusion Matrix:\n", conf_matrix)


📌 Accuracy: 0.9682

📌 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98   6354407
           1       0.03      0.80      0.06      8213

    accuracy                           0.97   6362620
   macro avg       0.52      0.89      0.52   6362620
weighted avg       1.00      0.97      0.98   6362620


📌 Confusion Matrix:
 [[6153875  200532]
 [   1608    6605]]


In [39]:
from sklearn.metrics import precision_recall_curve
import numpy as np

# Get probability scores
y_probs = model.predict_proba(X_test)[:, 1]

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Find the best threshold (maximize F1-score)
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]
print(f"✅ Best Probability Threshold: {best_threshold:.4f}")

# Apply new threshold
y_pred_thresholded = (y_probs >= best_threshold).astype(int)

# Evaluate model
print("\n📌 Updated Classification Report:")
print(classification_report(y_test, y_pred_thresholded))

# Confusion Matrix
print("\n📌 Updated Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_thresholded))


✅ Best Probability Threshold: 0.9833

📌 Updated Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   6354407
           1       0.42      0.27      0.33      8213

    accuracy                           1.00   6362620
   macro avg       0.71      0.63      0.66   6362620
weighted avg       1.00      1.00      1.00   6362620


📌 Updated Confusion Matrix:
[[6351391    3016]
 [   5993    2220]]


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Use XGBoost probability scores as new features
X_train_xgb = model.predict_proba(X_train)[:, 1].reshape(-1, 1)
X_test_xgb = model.predict_proba(X_test)[:, 1].reshape(-1, 1)

# Train Logistic Regression model on XGBoost's predictions
log_model = LogisticRegression()
log_model.fit(X_train_xgb, y_train)

# Predict using Logistic Regression
y_pred_log = log_model.predict(X_test_xgb)

# Evaluate the ensemble model
print("\n📌 Ensemble Model Performance:")
print(classification_report(y_test, y_pred_log))

# Confusion Matrix
print("\n📌 Ensemble Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log))



📌 Ensemble Model Performance:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   6354407
           1       0.00      0.00      0.00      8213

    accuracy                           1.00   6362620
   macro avg       0.50      0.50      0.50   6362620
weighted avg       1.00      1.00      1.00   6362620


📌 Ensemble Confusion Matrix:
[[6354407       0]
 [   8213       0]]


In [41]:
# Re-train Logistic Regression with class weights
log_model = LogisticRegression(class_weight="balanced")
log_model.fit(X_train_xgb, y_train)

# Predict using the new weighted model
y_pred_log_weighted = log_model.predict(X_test_xgb)

# Evaluate again
print("\n📌 Updated Ensemble Model Performance:")
print(classification_report(y_test, y_pred_log_weighted))

# Confusion Matrix
print("\n📌 Updated Ensemble Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_weighted))



📌 Updated Ensemble Model Performance:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95   6354407
           1       0.01      0.92      0.03      8213

    accuracy                           0.91   6362620
   macro avg       0.51      0.92      0.49   6362620
weighted avg       1.00      0.91      0.95   6362620


📌 Updated Ensemble Confusion Matrix:
[[5792117  562290]
 [    668    7545]]


In [42]:
# Get fraud probabilities
y_probs_log = log_model.predict_proba(X_test_xgb)[:, 1]

# Adjust probability threshold
best_threshold = 0.95  # Increase this to make fraud detection more precise
y_pred_adjusted = (y_probs_log >= best_threshold).astype(int)

# Evaluate the model again
print("\n📌 Final Optimized Ensemble Model Performance:")
print(classification_report(y_test, y_pred_adjusted))

# Confusion Matrix
print("\n📌 Final Optimized Ensemble Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_adjusted))



📌 Final Optimized Ensemble Model Performance:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99   6354407
           1       0.06      0.68      0.11      8213

    accuracy                           0.99   6362620
   macro avg       0.53      0.83      0.55   6362620
weighted avg       1.00      0.99      0.99   6362620


📌 Final Optimized Ensemble Confusion Matrix:
[[6266819   87588]
 [   2606    5607]]


In [43]:
import os
import glob

# Directory where models are saved
model_dir = "/kaggle/working/"
model_pattern = os.path.join(model_dir, "*.pkl")

# Get all model files
model_files = glob.glob(model_pattern)

# Sort files by last modified time
model_files.sort(key=os.path.getmtime, reverse=True)

# Keep the latest model, delete the rest
if len(model_files) > 1:
    latest_model = model_files[0]  # Most recent file
    for model in model_files[1:]:
        os.remove(model)  # Delete old models
        print(f"🗑️ Deleted: {model}")

    print(f"✅ Latest model kept: {latest_model}")
else:
    print("⚠️ Only one model found, no deletion needed.")


🗑️ Deleted: /kaggle/working/fraud_detection_xgboost_final.pkl
🗑️ Deleted: /kaggle/working/fraud_detection_xgboost_tuned.pkl
🗑️ Deleted: /kaggle/working/fraud_detection_xgboost_smote.pkl
✅ Latest model kept: /kaggle/working/fraud_detection_xgboost.pkl


In [44]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import pickle
import os
import datetime

# Initialize FastAPI App
app = FastAPI()

# Identify the latest trained model in /kaggle/working/
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"
if not os.path.exists(model_path):
    raise Exception("❌ Model file not found!")

# Load the model
try:
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    print("✅ Model Loaded Successfully!")
except Exception as e:
    raise HTTPException(status_code=500, detail=f"Model loading failed: {str(e)}")

# Features used in the model
FEATURES = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

# Initialize SQLite database
DB_PATH = "/kaggle/working/fraud_logs.db"

def init_db():
    """Initialize SQLite database and create table if not exists"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS fraud_predictions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp TEXT,
            input_data TEXT,
            prediction INTEGER
        )
    """)
    conn.commit()
    conn.close()

# Initialize database
init_db()

@app.get("/")
def home():
    return {"message": "🚀 Fraud Detection API is Running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Convert input to DataFrame
        df = pd.DataFrame([data])
        
        # Ensure correct feature columns
        df = df[FEATURES]

        # Make prediction
        prediction = model.predict(df)[0]

        # Log prediction to database
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO fraud_predictions (timestamp, input_data, prediction) VALUES (?, ?, ?)",
            (str(datetime.datetime.now()), str(data), int(prediction)),
        )
        conn.commit()
        conn.close()

        return {"fraud_prediction": int(prediction)}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Run API as a subprocess
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


ModuleNotFoundError: No module named 'fastapi'

In [45]:
!pip install fastapi uvicorn


Collecting fastapi
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.1-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.11-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading starlette-0.46.1-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, starlette, fastapi
Successfully installed fastapi-0.115.11 starlette-0.46.1 uvicorn-0.34.0


In [46]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import pickle
import os
import datetime

# Initialize FastAPI App
app = FastAPI()

# Identify the latest trained model in /kaggle/working/
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"
if not os.path.exists(model_path):
    raise Exception("❌ Model file not found!")

# Load the model
try:
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    print("✅ Model Loaded Successfully!")
except Exception as e:
    raise HTTPException(status_code=500, detail=f"Model loading failed: {str(e)}")

# Features used in the model
FEATURES = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

# Initialize SQLite database
DB_PATH = "/kaggle/working/fraud_logs.db"

def init_db():
    """Initialize SQLite database and create table if not exists"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS fraud_predictions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp TEXT,
            input_data TEXT,
            prediction INTEGER
        )
    """)
    conn.commit()
    conn.close()

# Initialize database
init_db()

@app.get("/")
def home():
    return {"message": "🚀 Fraud Detection API is Running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Convert input to DataFrame
        df = pd.DataFrame([data])
        
        # Ensure correct feature columns
        df = df[FEATURES]

        # Make prediction
        prediction = model.predict(df)[0]

        # Log prediction to database
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO fraud_predictions (timestamp, input_data, prediction) VALUES (?, ?, ?)",
            (str(datetime.datetime.now()), str(data), int(prediction)),
        )
        conn.commit()
        conn.close()

        return {"fraud_prediction": int(prediction)}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Run API as a subprocess
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


✅ Model Loaded Successfully!


RuntimeError: asyncio.run() cannot be called from a running event loop

In [47]:
import subprocess

# Start FastAPI in the background as a subprocess
subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"])
print("✅ API is running in the background on port 8000!")


✅ API is running in the background on port 8000!


In [48]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import pickle
import os
import datetime

# Initialize FastAPI App
app = FastAPI()

# Identify the latest trained model in /kaggle/working/
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"
if not os.path.exists(model_path):
    raise Exception("❌ Model file not found!")

# Load the model
try:
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    print("✅ Model Loaded Successfully!")
except Exception as e:
    raise HTTPException(status_code=500, detail=f"Model loading failed: {str(e)}")

# Features used in the model
FEATURES = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

# Initialize SQLite database
DB_PATH = "/kaggle/working/fraud_logs.db"

def init_db():
    """Initialize SQLite database and create table if not exists"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS fraud_predictions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp TEXT,
            input_data TEXT,
            prediction INTEGER
        )
    """)
    conn.commit()
    conn.close()

# Initialize database
init_db()

@app.get("/")
def home():
    return {"message": "🚀 Fraud Detection API is Running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Convert input to DataFrame
        df = pd.DataFrame([data])
        
        # Ensure correct feature columns
        df = df[FEATURES]

        # Make prediction
        prediction = model.predict(df)[0]

        # Log prediction to database
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO fraud_predictions (timestamp, input_data, prediction) VALUES (?, ?, ?)",
            (str(datetime.datetime.now()), str(data), int(prediction)),
        )
        conn.commit()
        conn.close()

        return {"fraud_prediction": int(prediction)}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Run API as a subprocess
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


✅ Model Loaded Successfully!


RuntimeError: asyncio.run() cannot be called from a running event loop

In [49]:
!pkill -9 uvicorn


In [50]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import pickle
import os
import datetime

# Initialize FastAPI App
app = FastAPI()

# Identify the latest trained model in /kaggle/working/
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"
if not os.path.exists(model_path):
    raise Exception("❌ Model file not found!")

# Load the model
try:
    with open(model_path, "rb") as file:
        model = pickle.load(file)
    print("✅ Model Loaded Successfully!")
except Exception as e:
    raise HTTPException(status_code=500, detail=f"Model loading failed: {str(e)}")

# Features used in the model
FEATURES = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

# Initialize SQLite database
DB_PATH = "/kaggle/working/fraud_logs.db"

def init_db():
    """Initialize SQLite database and create table if not exists"""
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS fraud_predictions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp TEXT,
            input_data TEXT,
            prediction INTEGER
        )
    """)
    conn.commit()
    conn.close()

# Initialize database
init_db()

@app.get("/")
def home():
    return {"message": "🚀 Fraud Detection API is Running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Convert input to DataFrame
        df = pd.DataFrame([data])
        
        # Ensure correct feature columns
        df = df[FEATURES]

        # Make prediction
        prediction = model.predict(df)[0]

        # Log prediction to database
        conn = sqlite3.connect(DB_PATH)
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO fraud_predictions (timestamp, input_data, prediction) VALUES (?, ?, ?)",
            (str(datetime.datetime.now()), str(data), int(prediction)),
        )
        conn.commit()
        conn.close()

        return {"fraud_prediction": int(prediction)}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Run API as a subprocess
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


✅ Model Loaded Successfully!


RuntimeError: asyncio.run() cannot be called from a running event loop

In [51]:
import subprocess

# Stop any previous API instances running
!pkill -9 uvicorn

# Start FastAPI in a subprocess
process = subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

print("✅ FastAPI is running in the background on port 8000!")


✅ FastAPI is running in the background on port 8000!


In [52]:
import requests
response = requests.get("http://0.0.0.0:8000/")
print(response.json())  # Should return: {'message': 'Fraud Detection API is running!'}


  __new__ = eval(code, namespace)


ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=8000): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ccc02e97850>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [53]:
import os
os.system("pkill -9 uvicorn")
print("✅ Killed any existing Uvicorn process.")


✅ Killed any existing Uvicorn process.


In [54]:
import subprocess
import time

# Start FastAPI as a subprocess
process = subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Wait a few seconds to ensure API starts properly
time.sleep(5)

print("✅ FastAPI is running in the background on port 8000!")


✅ FastAPI is running in the background on port 8000!


In [55]:
import requests

try:
    response = requests.get("http://0.0.0.0:8000/")
    print("✅ API Response:", response.json())
except requests.ConnectionError:
    print("❌ API is still not running!")


❌ API is still not running!


In [56]:
import os
import shutil

# Define paths
working_dir = "/kaggle/working/"
api_path = os.path.join(working_dir, "final_api.py")
model_path = os.path.join(working_dir, "fraud_detection_xgboost.pkl")

# Remove API & Model if they exist
if os.path.exists(api_path):
    os.remove(api_path)
    print("✅ Deleted old API file.")

if os.path.exists(model_path):
    os.remove(model_path)
    print("✅ Deleted old model file.")

# Remove old database files
db_path = os.path.join(working_dir, "test_logs.db")
if os.path.exists(db_path):
    os.remove(db_path)
    print("✅ Deleted old database file.")

print("🔥 Hard reset complete. Recreating everything from scratch.")


✅ Deleted old model file.
🔥 Hard reset complete. Recreating everything from scratch.


In [57]:
api_code = """from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import pickle
import os

# Load Model
MODEL_PATH = "/kaggle/working/fraud_detection_xgboost.pkl"
try:
    with open(MODEL_PATH, "rb") as file:
        model = pickle.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Initialize FastAPI App
app = FastAPI()

# Database Connection
DB_PATH = "/kaggle/working/test_logs.db"
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS logs (id INTEGER PRIMARY KEY, input TEXT, prediction INTEGER)''')
conn.commit()

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Save to database
        c.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        conn.commit()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
"""

with open("/kaggle/working/final_api.py", "w") as f:
    f.write(api_code)

print("✅ `final_api.py` saved successfully!")


✅ `final_api.py` saved successfully!


In [58]:
import shutil

# Define model paths
latest_model_path = "/kaggle/input/latest-model/fraud_detection_xgboost.pkl"  # Adjust the path if necessary
new_model_path = "/kaggle/working/fraud_detection_xgboost.pkl"

# Move the latest model
shutil.copy(latest_model_path, new_model_path)
print("✅ Model moved successfully to working directory.")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/latest-model/fraud_detection_xgboost.pkl'

In [59]:
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Load the cleaned dataset
data_path = "/kaggle/working/cleaned_fraud_data.csv"
df = pd.read_csv(data_path)

# Define features & target
features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]
target = "isFraud"

X = df[features]
y = df[target]

# Apply SMOTE for balancing the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", scale_pos_weight=5, n_estimators=200)
model.fit(X_train, y_train)

# Save the model
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"
joblib.dump(model, model_path)

print("✅ Model trained and saved successfully!")


✅ Model trained and saved successfully!


In [64]:
import shutil

# Define model paths
latest_model_path = "/kaggle/input/latest-model/fraud_detection_xgboost.pkl"
new_model_path = "/kaggle/working/fraud_detection_xgboost.pkl"

# Move the latest model
shutil.copy(latest_model_path, new_model_path)
print("✅ Model moved successfully to working directory.")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/latest-model/fraud_detection_xgboost.pkl'

In [65]:
api_code = """from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import joblib
import os

# Load Model
MODEL_PATH = "/kaggle/working/fraud_detection_xgboost.pkl"
try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Initialize FastAPI App
app = FastAPI()

# Database Connection
DB_PATH = "/kaggle/working/test_logs.db"
conn = sqlite3.connect(DB_PATH)
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS logs (id INTEGER PRIMARY KEY, input TEXT, prediction INTEGER)''')
conn.commit()

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Save to database
        c.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        conn.commit()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
"""

# Save API file
with open("/kaggle/working/final_api.py", "w") as f:
    f.write(api_code)

print("✅ `final_api.py` saved successfully!")


✅ `final_api.py` saved successfully!


In [66]:
import subprocess
import time

# Start FastAPI as a subprocess
process = subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Wait for API to initialize
time.sleep(5)

print("🚀 API is running in the background on port 8000!")


🚀 API is running in the background on port 8000!


In [67]:
import requests

try:
    response = requests.get("http://0.0.0.0:8000/")
    print("✅ API Response:", response.json())
except requests.ConnectionError:
    print("❌ API is still not running!")


✅ API Response: {'message': 'Fraud Detection API is running!'}


In [68]:
import requests

# Define a sample transaction
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send POST request
response = requests.post("http://0.0.0.0:8000/predict/", json=test_transaction)

# Print the response
print("Response:", response.json())


Response: {'detail': 'Prediction error: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 134714465132544 and this is thread id 134713273837120.'}


In [70]:
import sqlite3

@app.post("/predict/")
def predict(data: dict):
    try:
        # Create a SQLite connection inside the request function
        conn = sqlite3.connect("/path/to/logs.db")
        cursor = conn.cursor()
        
        # ... rest of your code that interacts with the database ...
        
        # Close the connection before returning
        conn.close()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")


In [71]:
import requests

# Define a sample transaction
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send POST request
response = requests.post("http://0.0.0.0:8000/predict/", json=test_transaction)

# Print the response
print("Response:", response.json())


Response: {'detail': 'Prediction error: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 134714465132544 and this is thread id 134713273837120.'}


In [72]:
import sqlite3

@app.post("/predict/")
def predict(data: dict):
    try:
        # Open a connection within the function
        conn = sqlite3.connect("/kaggle/working/logs.db")
        cursor = conn.cursor()

        # Perform database operations
        cursor.execute("INSERT INTO logs (column1, column2) VALUES (?, ?)", (value1, value2))
        conn.commit()

        # Close the connection before returning
        conn.close()

        return {"prediction": "some_prediction"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")


In [73]:
import sqlite3

# Run this once before starting the API
conn = sqlite3.connect("/kaggle/working/logs.db")
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS logs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        column1 TEXT,
        column2 TEXT
    )
""")
conn.commit()
conn.close()


In [74]:
value1 = data.get("key1")  # Replace "key1" with the appropriate JSON key
value2 = data.get("key2")  # Replace "key2" with the appropriate JSON key


NameError: name 'data' is not defined

In [75]:
@app.post("/predict/")
def predict(data: dict):
    try:
        # Now 'data' is coming from the request body
        value1 = data.get("key1")  # Make sure 'key1' matches a key in the incoming JSON
        value2 = data.get("key2")  # Similarly, 'key2' should match another key
        # Further code follows...


SyntaxError: incomplete input (<ipython-input-75-a91810a244a2>, line 7)

In [76]:
import sqlite3
from fastapi import FastAPI, HTTPException

app = FastAPI()

@app.post("/predict/")
def predict(data: dict):
    try:
        # Open a new SQLite connection for this request
        conn = sqlite3.connect("/kaggle/working/logs.db")
        cursor = conn.cursor()

        # Example database operation
        cursor.execute("INSERT INTO logs (column1, column2) VALUES (?, ?)", (data.get("step"), data.get("amount")))
        conn.commit()

        # Close the connection before returning a response
        conn.close()

        return {"prediction": "some_prediction"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")


In [77]:
import requests

try:
    response = requests.get("http://0.0.0.0:8000/")
    print("✅ API Response:", response.json())
except requests.ConnectionError:
    print("❌ API is still not running!")


✅ API Response: {'message': 'Fraud Detection API is running!'}


In [78]:
import requests

# Define a sample transaction
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send POST request
response = requests.post("http://0.0.0.0:8000/predict/", json=test_transaction)

# Print the response
print("Response:", response.json())


Response: {'detail': 'Prediction error: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 134714465132544 and this is thread id 134713273837120.'}


In [79]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import joblib
import os

# Load Model
MODEL_PATH = "/kaggle/working/fraud_detection_xgboost.pkl"
try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Initialize FastAPI App
app = FastAPI()

# Define the path to the database
DB_PATH = "/kaggle/working/test_logs.db"

@app.on_event("startup")
def initialize_database():
    # Create the database table at startup if it doesn't exist
    with sqlite3.connect(DB_PATH) as conn:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS logs (id INTEGER PRIMARY KEY, input TEXT, prediction INTEGER)''')
        conn.commit()

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Use a fresh connection for database operations
        with sqlite3.connect(DB_PATH) as conn:
            cursor = conn.cursor()
            cursor.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
            conn.commit()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)


        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")


✅ Model loaded successfully!


RuntimeError: asyncio.run() cannot be called from a running event loop

In [84]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import joblib
import os

# Define base directory
import os

# When __file__ is not available, use the current working directory
BASE_DIR = os.getcwd()


# Load Model
MODEL_PATH = os.path.join(BASE_DIR, "fraud_detection_xgboost.pkl")
try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Database Connection
DB_PATH = os.path.join(BASE_DIR, "test_logs.db")

# If the database does not exist, create it and the table
if not os.path.exists(DB_PATH):
    print("Creating new database...")
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS logs (id INTEGER PRIMARY KEY, input TEXT, prediction INTEGER)''')
    conn.commit()
    conn.close()

# Initialize FastAPI App
app = FastAPI()

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Connect to database and log the prediction
        conn = sqlite3.connect(DB_PATH)
        c = conn.cursor()
        c.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        conn.commit()
        conn.close()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

import subprocess

# Instead of using asyncio.run, run uvicorn as a subprocess
subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"])



✅ Model loaded successfully!


<Popen: returncode: None args: ['uvicorn', 'final_api:app', '--host', '0.0.0...>

In [82]:
import subprocess

# Instead of using asyncio.run, run uvicorn as a subprocess
subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"])


<Popen: returncode: None args: ['uvicorn', 'final_api:app', '--host', '0.0.0...>

In [2]:
# Save the updated API code into a file named `final_api.py`
api_code = """from fastapi import FastAPI, HTTPException
import subprocess
import pandas as pd
import sqlite3
import os
import joblib

# Define base directory (for Render or local)
BASE_DIR = os.getcwd()

# Load Model
MODEL_PATH = os.path.join(BASE_DIR, "fraud_detection_xgboost.pkl")
try:
    model = joblib.load(MODEL_PATH)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Initialize FastAPI App
app = FastAPI()

# Database Connection
DB_PATH = os.path.join(BASE_DIR, "test_logs.db")
try:
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS logs (id INTEGER PRIMARY KEY, input TEXT, prediction INTEGER)''')
    conn.commit()
    print("✅ Database connected and table initialized!")
except Exception as e:
    print(f"❌ Error setting up the database: {e}")

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Save to database
        c.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        conn.commit()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Start the API by running uvicorn as a subprocess
subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8001"])
"""

# Write the API code to `final_api.py`
with open("final_api.py", "w") as f:
    f.write(api_code)

print("✅ `final_api.py` saved successfully!")


✅ `final_api.py` saved successfully!


In [4]:
!lsof -i :8001


In [87]:
!kill -9 190

In [None]:
!kill -9 31 

In [5]:
lsof -i :8001


SyntaxError: illegal target for annotation (<ipython-input-5-c1819a628da9>, line 1)

In [7]:
import subprocess

result = subprocess.run(["lsof", "-i", ":8001"], capture_output=True, text=True)
print(result.stdout)





In [8]:
import requests

response = requests.get("http://0.0.0.0:8001/")  # Change the port if needed
print("Status Code:", response.status_code)
print("Response:", response.json())


ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=8001): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x78130114fdf0>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [9]:
import subprocess

# Stop any previous instances (optional if using another port)
subprocess.run(["pkill", "-f", "uvicorn"], check=False)

# Start API in the background
subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8001"])


<Popen: returncode: None args: ['uvicorn', 'final_api:app', '--host', '0.0.0...>

In [10]:
import requests

# Define the API base URL
API_URL = "http://0.0.0.0:8001/"

# Check API Status
try:
    response = requests.get(API_URL)
    print("✅ API Response:", response.json())
except requests.exceptions.ConnectionError:
    print("❌ API is NOT running! Please restart the API.")


✅ API Response: {'message': 'Fraud Detection API is running!'}


In [12]:
# Sample test transaction
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send POST request
try:
    response = requests.post(f"{API_URL}predict/", json=test_transaction)
    print("✅ Prediction Response:", response.json())
except requests.exceptions.ConnectionError:
    print("❌ Failed to connect to API! Check if it's running.")


✅ Prediction Response: {'fraud_prediction': 0}


In [13]:
import os

# Check which processes are using ports 8000 or 8001
os.system("lsof -i :8000")
os.system("lsof -i :8001")


0

In [14]:
!ps aux | grep uvicorn


root         190  0.1  0.0      0     0 ?        Z    15:39   0:11 [uvicorn] <defunct>
root         228  0.0  0.0      0     0 ?        Z    16:27   0:02 [uvicorn] <defunct>
root         240  0.0  0.0      0     0 ?        Z    16:30   0:02 [uvicorn] <defunct>
root         295  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         307  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         319  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         331  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         343  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         355  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         367  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         379  0.1  0.0      0     0 ?        Z    17:05   0:02 [uvicorn] <defunct>
root         391  0.1  0.0      0     0 ?  

In [16]:
import requests

try:
    response = requests.get("http://0.0.0.0:8001/")
    print("❌ API is still running:", response.json())
except requests.exceptions.ConnectionError:
    print("✅ API successfully stopped!")


❌ API is still running: {'message': 'Fraud Detection API is running!'}


In [17]:
!lsof -i :8001


COMMAND PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
python3 262 root   45u  IPv4 303466      0t0  TCP localhost:44322->localhost:8001 (CLOSE_WAIT)
uvicorn 283 root    7u  IPv4 243540      0t0  TCP *:8001 (LISTEN)


In [19]:
!kill -9 283


In [27]:
import requests

try:
    response = requests.get("http://0.0.0.0:8001/")
    print("❌ API is still running:", response.json())
except requests.exceptions.ConnectionError:
    print("✅ API successfully stopped!")


❌ API is still running: {'message': 'Fraud Detection API is running!'}


In [28]:
!lsof -i :8001


COMMAND   PID USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
python3   262 root   44u  IPv4 310205      0t0  TCP localhost:43382->localhost:8001 (CLOSE_WAIT)
uvicorn 12938 root    7u  IPv4 310823      0t0  TCP *:8001 (LISTEN)


In [26]:
!kill -9 12671

In [None]:
!lsof -t -i:8001 | xargs kill -9


In [2]:
!lsof -i:8001


In [3]:
import requests

try:
    response = requests.get("http://0.0.0.0:8001/")
    print("❌ API is still running:", response.json())
except requests.exceptions.ConnectionError:
    print("✅ API successfully stopped!")


✅ API successfully stopped!


In [5]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import joblib
import os
import subprocess

# Define base directory
BASE_DIR = os.getcwd()

# Load Model
MODEL_PATH = os.path.join(BASE_DIR, "fraud_detection_xgboost.pkl")
try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Initialize FastAPI App
app = FastAPI()

# Database Connection
DB_PATH = os.path.join(BASE_DIR, "logs.db")
def get_db_connection():
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    return conn

db_conn = get_db_connection()
c = db_conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS logs (id INTEGER PRIMARY KEY, input TEXT, prediction INTEGER)''')
db_conn.commit()

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]
        
        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Save to database
        db_conn = get_db_connection()
        cursor = db_conn.cursor()
        cursor.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        db_conn.commit()
        db_conn.close()

        return {"fraud_prediction": int(prediction[0])}
    
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Run the API as a subprocess
if __name__ == "__main__":
    subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8002"])


✅ Model loaded successfully!


In [6]:
import subprocess

# Check which process is using port 8000 or 8001
subprocess.run(["lsof", "-i", ":8000"])
subprocess.run(["lsof", "-i", ":8001"])


CompletedProcess(args=['lsof', '-i', ':8001'], returncode=0)

In [7]:
import os
import uvicorn
import sqlite3
import joblib
import pandas as pd
from fastapi import FastAPI, HTTPException

# Initialize FastAPI app
app = FastAPI()

# Define base directory for portability
BASE_DIR = os.getcwd()

# Load Model
MODEL_PATH = os.path.join(BASE_DIR, "fraud_detection_xgboost.pkl")

try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Database path
DB_PATH = os.path.join(BASE_DIR, "fraud_logs.db")

# Ensure database and table exist
def init_db():
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS logs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            input TEXT,
            prediction INTEGER
        )
    ''')
    conn.commit()
    conn.close()
    print("✅ Database connected and table initialized!")

init_db()  # Call database initialization

@app.get("/")
def home():
    return {"message": "🚀 Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Open a connection inside the function scope
        conn = sqlite3.connect(DB_PATH, check_same_thread=False)
        cursor = conn.cursor()
        
        # Log input and prediction
        cursor.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        conn.commit()
        conn.close()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Run API dynamically (PORT is assigned by Render, default to 8000)
if __name__ == "__main__":
    PORT = int(os.getenv("PORT", 8000))  # Get PORT from environment, default to 8000
    uvicorn.run(app, host="0.0.0.0", port=PORT)


✅ Model loaded successfully!
✅ Database connected and table initialized!


RuntimeError: asyncio.run() cannot be called from a running event loop

In [8]:
import os
import uvicorn
import sqlite3
import joblib
import pandas as pd
import subprocess
from fastapi import FastAPI, HTTPException

# Initialize FastAPI app
app = FastAPI()

# Define base directory
BASE_DIR = os.getcwd()

# Load Model
MODEL_PATH = os.path.join(BASE_DIR, "fraud_detection_xgboost.pkl")

try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Database path
DB_PATH = os.path.join(BASE_DIR, "fraud_logs.db")

# Ensure database and table exist
def init_db():
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS logs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            input TEXT,
            prediction INTEGER
        )
    ''')
    conn.commit()
    conn.close()
    print("✅ Database connected and table initialized!")

init_db()  # Call database initialization

@app.get("/")
def home():
    return {"message": "🚀 Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Open a connection inside the function scope
        conn = sqlite3.connect(DB_PATH, check_same_thread=False)
        cursor = conn.cursor()
        
        # Log input and prediction
        cursor.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        conn.commit()
        conn.close()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# ✅ Run API with a dynamically assigned port
if __name__ == "__main__":
    PORT = int(os.getenv("PORT", 8000))  # Get PORT from environment, default to 8000
    subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", str(PORT)])
    print(f"🚀 API is running on port {PORT}")


✅ Model loaded successfully!
✅ Database connected and table initialized!
🚀 API is running on port 8000


In [9]:
api_code = """<PASTE THE UPDATED CODE ABOVE HERE>"""

# Save API file
api_path = "/kaggle/working/final_api.py"
with open(api_path, "w") as f:
    f.write(api_code)

print(f"✅ `final_api.py` saved successfully at {api_path}!")


✅ `final_api.py` saved successfully at /kaggle/working/final_api.py!


In [10]:
import requests

url = "http://0.0.0.0:8000/"  # Update if using a different port

try:
    response = requests.get(url)
    print("✅ API Status:", response.json())
except requests.exceptions.RequestException as e:
    print("❌ API is NOT running!", e)


✅ API Status: {'message': 'Fraud Detection API is running!'}


In [11]:
import requests

# Define a sample transaction
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send POST request to API
url = "http://0.0.0.0:8000/predict/"  # Update port if necessary
try:
    response = requests.post(url, json=test_transaction)
    print("✅ Prediction Response:", response.json())
except requests.exceptions.RequestException as e:
    print("❌ API Prediction Failed!", e)


✅ Prediction Response: {'fraud_prediction': 0}


In [None]:
import os

# Find the process ID (PID) running on port 8000 (or 8001 if needed)
os.system("lsof -t -i:8000 | xargs kill -9")  # Use the correct port
os.system("lsof -t -i:8001 | xargs kill -9")  # If running on 8001


In [1]:
os.system("lsof -i :8000")
os.system("lsof -i :8001")


NameError: name 'os' is not defined

In [2]:
import os

# Check which process is using port 8000 or 8001
os.system("lsof -i :8000")
os.system("lsof -i :8001")


256

In [3]:
# Kill the process using port 8000 or 8001
os.system("lsof -t -i:8000 | xargs kill -9")
os.system("lsof -t -i:8001 | xargs kill -9")

print("✅ API stopped successfully!")


✅ API stopped successfully!


In [4]:
import requests

API_URL = "https://finanance-fraud-detection-ml.onrender.com"  # Replace with your actual URL
response = requests.get(API_URL)
print(response.json())  # Should return {"message": "Fraud Detection API is running!"}

{'message': 'Fraud Detection API is running!'}


In [6]:
import requests

# Define a sample transaction
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Replace with your actual Render API URL
API_URL = "https://finanance-fraud-detection-ml.onrender.com/"

response = requests.post(API_URL, json=test_transaction)
print(response.json())  # Should return fraud_prediction


{'detail': 'Method Not Allowed'}


In [7]:
import requests

API_URL = "https://your-render-api-url.com/"  # Replace with your actual URL
response = requests.get(API_URL)
print(response.json())  # Expected Output: {"message": "Fraud Detection API is running!"}


ConnectionError: HTTPSConnectionPool(host='your-render-api-url.com', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7cdba0e300a0>: Failed to resolve 'your-render-api-url.com' ([Errno -2] Name or service not known)"))

In [8]:
import requests

API_URL = "https://your-render-api-url.com/predict/"  # Replace with your actual URL

# Define a sample transaction for fraud detection
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send POST request
response = requests.post(API_URL, json=test_transaction)

# Print the response
print(response.json())  # Expected Output: {'fraud_prediction': 0 or 1}


ConnectionError: HTTPSConnectionPool(host='your-render-api-url.com', port=443): Max retries exceeded with url: /predict/ (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7cdba0e31300>: Failed to resolve 'your-render-api-url.com' ([Errno -2] Name or service not known)"))

In [9]:
import requests

# Use the correct API endpoint
API_URL = "https://finanance-fraud-detection-ml.onrender.com/predict/"

# Sample fraud transaction data
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send the POST request
response = requests.post(API_URL, json=test_transaction)

# Print the API response
print(response.json())  # Should return {"fraud_prediction": 0 or 1}


{'detail': "Prediction error: 'XGBClassifier' object has no attribute 'use_label_encoder'"}


In [11]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import joblib
import os
import xgboost as xgb

# Load Model
MODEL_PATH = "/kaggle/working/fraud_detection_xgboost.pkl"
try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Initialize FastAPI App
app = FastAPI()

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        # ✅ FIX: Convert data to DMatrix before prediction
        dmatrix = xgb.DMatrix(df)

        prediction = model.predict(dmatrix)  # Use DMatrix for prediction
        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

import subprocess

if __name__ == "__main__":
    subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"])



✅ Model loaded successfully!


In [12]:
from fastapi import FastAPI, HTTPException
import uvicorn
import pandas as pd
import sqlite3
import joblib
import os
import subprocess

# Load Model
MODEL_PATH = "fraud_detection_xgboost.pkl"
try:
    with open(MODEL_PATH, "rb") as file:
        model = joblib.load(file)
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    model = None

# Initialize FastAPI App
app = FastAPI()

# Database Connection
DB_PATH = "logs.db"
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS logs (id INTEGER PRIMARY KEY, input TEXT, prediction INTEGER)''')
conn.commit()

@app.get("/")
def home():
    return {"message": "Fraud Detection API is running!"}

@app.post("/predict/")
def predict(data: dict):
    try:
        # Define required features
        features = ["step", "amount", "isFlaggedFraud", "isMerchant", "amount_ratio", "type_encoded"]

        df = pd.DataFrame([data])
        df = df[features]  # Ensure correct feature order

        prediction = model.predict(df)

        # Save to database
        c.execute("INSERT INTO logs (input, prediction) VALUES (?, ?)", (str(data), int(prediction[0])))
        conn.commit()

        return {"fraud_prediction": int(prediction[0])}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

if __name__ == "__main__":
    subprocess.Popen(["uvicorn", "final_api:app", "--host", "0.0.0.0", "--port", "8000"])


✅ Model loaded successfully!


In [17]:
import requests

# Use the correct API endpoint
API_URL = "https://finanance-fraud-detection-ml.onrender.com/predict/"

# Sample fraud transaction data
test_transaction = {
    "step": 1,
    "amount": 1000,
    "isFlaggedFraud": 0,
    "isMerchant": 1,
    "amount_ratio": 0.001,
    "type_encoded": 2
}

# Send the POST request
response = requests.post(API_URL, json=test_transaction)

# Print the API response
print(response.json())  # Should return {"fraud_prediction": 0 or 1}


{'fraud_prediction': 0}


In [15]:
import joblib

# Load your current model
model_path = "/kaggle/working/fraud_detection_xgboost.pkl"
model = joblib.load(model_path)

# Check model attributes
print(dir(model))  # Ensure use_label_encoder is NOT present


['_Booster', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_is_fitted__', '__str__', '__subclasshook__', '__weakref__', '_can_use_inplace_predict', '_check_feature_names', '_check_n_features', '_configure_fit', '_create_dmatrix', '_estimator_type', '_get_iteration_range', '_get_param_names', '_get_tags', '_get_type', '_load_model_attributes', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_set_evaluation_result', '_validate_data', '_validate_params', 'apply', 'base_score', 'best_iteration', 'best_score', 'booster', 'callbacks', 'classes_', 'coef_', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'device', 'early_stopping_rounds', 'enable_categorical', 'eval_met

In [16]:
import xgboost
print(xgboost.__version__)  # Note the version


2.0.3


In [18]:
import sqlite3

DB_PATH = "/kaggle/working/test_logs.db"  # Ensure this matches your API path

# Connect to database
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Check if logs table exists
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='logs';")
table_exists = cursor.fetchone()
print("✅ Logs Table Exists" if table_exists else "❌ Logs Table Missing")

# Fetch last 10 records to verify logging
cursor.execute("SELECT * FROM logs ORDER BY id DESC LIMIT 10;")
logs = cursor.fetchall()
print("📌 Last Logged Predictions:", logs)

# Close connection
conn.close()


✅ Logs Table Exists
📌 Last Logged Predictions: [(3, "{'step': 1, 'amount': 1000, 'isFlaggedFraud': 0, 'isMerchant': 1, 'amount_ratio': 0.001, 'type_encoded': 2}", 0), (2, "{'step': 1, 'amount': 1000, 'isFlaggedFraud': 0, 'isMerchant': 1, 'amount_ratio': 0.001, 'type_encoded': 2}", 0), (1, "{'step': 1, 'amount': 1000, 'isFlaggedFraud': 0, 'isMerchant': 1, 'amount_ratio': 0.001, 'type_encoded': 2}", 0)]
