In [None]:
# Data-handling and plotting libraries used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): with no category argument this silences every warning for the
# rest of the session, including any raised by the modelling cells further down.
# Consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# 1. Data Collection
Load the dataset.

In [None]:
# Load the dataset. Only the first N_ROWS rows are read to avoid a MemoryError
# during development; set N_ROWS = None to read the whole file for full training
# on a machine with enough memory.
# NOTE(review): nrows takes rows from the top of the file, not a random sample --
# confirm the leading rows are representative of the full log.
DATA_PATH = '../data/PS_20174392719_1491204439457_log.csv'
N_ROWS = 100000

df = pd.read_csv(DATA_PATH, nrows=N_ROWS)
df.head()

# 2. Data Pre-processing
## 2.1 Removing unnecessary columns
## 2.2 Checking for null values

In [None]:
# Report the frame's shape and per-column null counts before any column is removed.
print("Shape before dropping:", df.shape)
print("Null values:\n", df.isnull().sum())

# Remove the columns that are not used by the later cells.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'], errors='ignore')

print("Shape after dropping:", df.shape)
df.head()

# 3. Visualizing and analyzing data

In [None]:
# 3.3 Descriptive Analysis
# Summary statistics for df; the quartiles are spelled out here but are the same
# ones pandas reports by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# 3.1 Univariate Analysis - Target Variable
# Bar chart of the number of rows per isFraud value, followed by the raw counts.
fig, ax = plt.subplots()
sns.countplot(data=df, x='isFraud', ax=ax)
ax.set_title('Distribution of Fraud Transactions')
plt.show()

fraud_counts = df['isFraud'].value_counts()
print(fraud_counts)

In [None]:
# 3.2 Bivariate Analysis - Type vs Fraud
# Row counts per transaction type, with the bars split by the isFraud value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='type', hue='isFraud', ax=ax)
ax.set_title('Transaction Type vs Fraud')
plt.show()

In [None]:
# Correlation Matrix (Numerical)
# Only number-typed columns are kept, then their pairwise correlations are drawn
# as an annotated heatmap.
numeric_df = df.select_dtypes(include=['number'])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# 4. Model Building
## 4.1 Handling categorical values
## 4.2 Dividing data into train and test sets
## 4.3 Import the model building libraries
## 4.4 Comparing the accuracy of various models
## 4.5 Hyperparameter tuning
## 4.6 Evaluating the performance
## 4.7 Save the model

In [None]:
# 4.1 Handling categorical values (One-Hot Encoding for 'type')
# Guarded so the cell can be re-run: after the first execution 'type' has been
# replaced by its dummy columns, and encoding it again would raise KeyError.
if 'type' in df.columns:
    df = pd.get_dummies(df, columns=['type'], drop_first=True)
df.head()

In [None]:
# 4.2 Dividing data into train and test sets
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is isFraud.
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# 80/20 split with a fixed seed. stratify=y keeps the isFraud ratio the same in
# both partitions; fraud labels are typically rare, so an unstratified split can
# leave the test set with too few positive rows to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# 4.3 Import libraries & 4.4 Compare Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Candidate models, lightest first. The tree-based models are seeded so the
# comparison is reproducible across runs (the final model below uses the same seed).
models = {
    # The default cap of 100 iterations may be too low on unscaled features, and
    # any ConvergenceWarning would be hidden by the notebook-wide warnings filter.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # Few estimators on purpose: demo / memory constraints.
    "Random Forest": RandomForestClassifier(n_estimators=10, random_state=42),
}

# Fit each candidate on the training split and report its test-set accuracy and
# per-class precision/recall.
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print("-" * 30)

In [None]:
# 4.5 Hyperparameter Tuning & Selection
# A small, seeded random forest is used as the final model (usually the best
# performer) while staying lightweight. No search is run here: the
# hyperparameters are fixed by hand.
# fit() returns the estimator itself, so construction and training are chained.
final_model = RandomForestClassifier(n_estimators=20, random_state=42).fit(X_train, y_train)
y_pred = final_model.predict(X_test)

In [None]:
# 4.6 Evaluating Performance
# Accuracy, confusion matrix and per-class report for the final model's
# predictions on the test split.
final_accuracy = accuracy_score(y_test, y_pred)
final_confusion = confusion_matrix(y_test, y_pred)
final_report = classification_report(y_test, y_pred)

print("Final Model Accuracy:", final_accuracy)
print(final_confusion)
print(final_report)

In [None]:
# 4.7 Save the model
import pickle

# Write the fitted model next to the notebook and into the Flask app directory.
# Each file is opened in a with-block so its handle is flushed and closed as soon
# as the dump finishes, instead of being left open until garbage collection.
for model_path in ('payments.pkl', '../flask/payments.pkl'):
    with open(model_path, 'wb') as model_file:
        pickle.dump(final_model, model_file)