In [None]:
import pandas as pd
import os
import zipfile
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

# Locations of the zipped transaction log, the extraction directory, and the
# CSV file that the archive is expected to contain.
zip_path = "/content/PS_20174392719_1491204439457_log.csv.zip"
extract_path = "/content"
csv_path = "/content/PS_20174392719_1491204439457_log.csv"

# Make sure the extraction directory exists; exist_ok makes this a no-op when
# it is already there, so no separate existence check is needed.
os.makedirs(extract_path, exist_ok=True)

# Unpack the archive; the context manager closes the zip handle afterwards.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

df = pd.read_csv(csv_path)
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() would additionally render a stray "None".
df.info()

# Bar chart of the number of transactions per type, most frequent type first.
type_order = df['type'].value_counts().index
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='type', order=type_order)
plt.xlabel('Transaction Type')
plt.ylabel('Count')
plt.title('Distribution of Transaction Types')
plt.show()

# Number of rows flagged isFraud == 1, broken down by transaction type.
fraud_types = df.loc[df['isFraud'] == 1, 'type'].value_counts()
print("\nTransaction types associated with fraud:")
print(fraud_types)

# Restrict the analysis to TRANSFER and CASH_OUT rows, working on an
# independent copy so the new columns never touch `df`.
is_kept_type = df['type'].isin(['TRANSFER', 'CASH_OUT'])
filtered_df = df.loc[is_kept_type].copy()

# Balance discrepancies: how far the recorded balances deviate from what the
# transaction amount alone would explain (0 means the books add up).
filtered_df['errorBalanceOrg'] = (
    filtered_df['oldbalanceOrg']
    .sub(filtered_df['amount'])
    .sub(filtered_df['newbalanceOrig'])
)
filtered_df['errorBalanceDest'] = (
    filtered_df['newbalanceDest']
    .sub(filtered_df['oldbalanceDest'])
    .sub(filtered_df['amount'])
)

# Position of the step within a 24-step cycle.
# NOTE(review): presumably `step` counts hours — confirm against the dataset docs.
filtered_df['hourOfDay'] = filtered_df['step'].mod(24)

# Separate the target column from the candidate feature columns.
X = filtered_df.drop('isFraud', axis=1)
y = filtered_df['isFraud']

# Columns fed to the models; every other column of X is dropped by the
# ColumnTransformer (remainder='drop').
categorical_features = ['type']
numerical_features = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
                      'oldbalanceDest', 'newbalanceDest', 'hourOfDay',
                      'errorBalanceOrg', 'errorBalanceDest']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], remainder='drop')

# Split the RAW rows first. Fitting the scaler/encoder before splitting would
# let test-set statistics (means, variances) leak into the training features.
# Same test_size / random_state / stratify as before, so the same rows end up
# in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn scaling parameters and categories from the training rows only.
preprocessor.fit(X_train_raw)
onehot_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_feature_names = np.concatenate([numerical_features, onehot_feature_names])


def _to_feature_frame(raw_frame):
    """Apply the fitted preprocessor and return a labelled DataFrame.

    The result keeps the row index of ``raw_frame`` and uses
    ``all_feature_names`` as column labels.
    """
    return pd.DataFrame(
        preprocessor.transform(raw_frame),
        columns=all_feature_names,
        index=raw_frame.index,
    )


X_train = _to_feature_frame(X_train_raw)
X_test = _to_feature_frame(X_test_raw)

# Processed view of every row (transformed with the train-fitted preprocessor),
# kept because later cells use X_processed_df for the correlation heatmap.
X_processed_df = _to_feature_frame(X)

# Oversample the minority class in the TRAINING partition only; the test
# partition keeps its original class balance.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Logistic regression trained on the SMOTE-resampled training data and
# evaluated on the untouched test partition.
lr_model = LogisticRegression(solver='liblinear', random_state=42)
lr_model.fit(X_train_smote, y_train_smote)

# Column 1 of predict_proba is the score used for the positive class.
y_proba_lr = lr_model.predict_proba(X_test)[:, 1]
y_pred_lr = lr_model.predict(X_test)

lr_auprc = average_precision_score(y_test, y_proba_lr)
print("\n--- Logistic Regression Evaluation ---")
print("AUPRC:", f"{lr_auprc:.4f}")
print(classification_report(y_test, y_pred_lr))

# Random forest (100 trees, all cores) trained on the SMOTE-resampled
# training data and evaluated on the untouched test partition.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_smote, y_train_smote)

# Column 1 of predict_proba is the score used for the positive class.
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_pred_rf = rf_model.predict(X_test)

rf_auprc = average_precision_score(y_test, y_proba_rf)
print("\n--- Random Forest Evaluation ---")
print("AUPRC:", f"{rf_auprc:.4f}")
print(classification_report(y_test, y_pred_rf))

# Gradient-boosted trees trained on the SMOTE-resampled training data and
# evaluated on the untouched test partition.
# `use_label_encoder` is intentionally not passed: it is deprecated and
# ignored by recent XGBoost releases, where supplying it only triggers an
# "unused parameter" warning. The labels here are already 0/1 integers.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
xgb_model.fit(X_train_smote, y_train_smote)

y_pred_xgb = xgb_model.predict(X_test)
# Column 1 of predict_proba is the score used for the positive class.
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

print("\n--- XGBoost Evaluation ---")
print("AUPRC:", f"{average_precision_score(y_test, y_proba_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb))

# Single decision tree trained on the SMOTE-resampled training data and
# evaluated on the untouched test partition.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_smote, y_train_smote)

# Column 1 of predict_proba is the score used for the positive class.
y_proba_dt = dt_model.predict_proba(X_test)[:, 1]
y_pred_dt = dt_model.predict(X_test)

dt_auprc = average_precision_score(y_test, y_proba_dt)
print("\n--- Decision Tree Evaluation ---")
print("AUPRC:", f"{dt_auprc:.4f}")
print(classification_report(y_test, y_pred_dt))

print("\n\n--- Anomaly Detection (Method 1: Outlier Detection) ---")
# Fraction of training labels equal to 1; reused as the expected outlier
# share for both anomaly detectors.
label_shares = y_train.value_counts(normalize=True)
contamination_rate = label_shares.loc[1]
print(f"Setting contamination/nu rate to: {contamination_rate:.4f}")

# Method 1: fit on the whole (unlabelled, unresampled) training partition.
iso_forest_1 = IsolationForest(contamination=contamination_rate, random_state=42, n_jobs=-1)
iso_forest_1.fit(X_train)

iso_scores_1 = iso_forest_1.decision_function(X_test)
y_pred_iso_1 = iso_forest_1.predict(X_test)
# Recode the detector's -1 predictions to 1 (flagged) and everything else to 0
# so they are comparable with the isFraud labels.
y_pred_iso_mapped_1 = (y_pred_iso_1 == -1).astype(int)

# The scores are negated so that a larger value means "more anomalous" when
# ranked by average_precision_score.
iso_auprc_1 = average_precision_score(y_test, -iso_scores_1)
print("\n--- Isolation Forest (Method 1) Evaluation ---")
print("AUPRC:", f"{iso_auprc_1:.4f}")
print(classification_report(y_test, y_pred_iso_mapped_1))

# Method 1, One-Class SVM: fitted on a reproducible 10% sample of the
# training partition (fraud rows included) rather than on all of it.
X_train_sample = X_train.sample(frac=0.1, random_state=42)
one_class_svm_1 = OneClassSVM(nu=contamination_rate, kernel='rbf', gamma='auto')
one_class_svm_1.fit(X_train_sample)

svm_scores_1 = one_class_svm_1.decision_function(X_test)
y_pred_svm_1 = one_class_svm_1.predict(X_test)
# Recode the detector's -1 predictions to 1 (flagged) and everything else to 0.
y_pred_svm_mapped_1 = (y_pred_svm_1 == -1).astype(int)

# Negated scores: larger value = more anomalous for the AUPRC ranking.
svm_auprc_1 = average_precision_score(y_test, -svm_scores_1)
print("\n--- One-Class SVM (Method 1) Evaluation ---")
print("AUPRC:", f"{svm_auprc_1:.4f}")
print(classification_report(y_test, y_pred_svm_mapped_1))


print("\n\n--- Anomaly Detection (Method 2: Novelty Detection) ---")
print(f"Setting contamination/nu rate to: {contamination_rate:.4f}")
# Method 2: the detectors only ever see training rows labelled 0.
X_train_normal = X_train.loc[y_train == 0]

iso_forest_2 = IsolationForest(contamination=contamination_rate, random_state=42, n_jobs=-1)
iso_forest_2.fit(X_train_normal)

iso_scores_2 = iso_forest_2.decision_function(X_test)
y_pred_iso_2 = iso_forest_2.predict(X_test)
# Recode the detector's -1 predictions to 1 (flagged) and everything else to 0.
y_pred_iso_mapped_2 = (y_pred_iso_2 == -1).astype(int)

# Negated scores: larger value = more anomalous for the AUPRC ranking.
iso_auprc_2 = average_precision_score(y_test, -iso_scores_2)
print("\n--- Isolation Forest (Method 2) Evaluation ---")
print("AUPRC:", f"{iso_auprc_2:.4f}")
print(classification_report(y_test, y_pred_iso_mapped_2))

# Method 2, One-Class SVM: fitted on a reproducible 10% sample of the
# label-0 training rows only.
X_train_normal_sample = X_train_normal.sample(frac=0.1, random_state=42)
one_class_svm_2 = OneClassSVM(nu=contamination_rate, kernel='rbf', gamma='auto')
one_class_svm_2.fit(X_train_normal_sample)

svm_scores_2 = one_class_svm_2.decision_function(X_test)
y_pred_svm_2 = one_class_svm_2.predict(X_test)
# Recode the detector's -1 predictions to 1 (flagged) and everything else to 0.
y_pred_svm_mapped_2 = (y_pred_svm_2 == -1).astype(int)

# Negated scores: larger value = more anomalous for the AUPRC ranking.
svm_auprc_2 = average_precision_score(y_test, -svm_scores_2)
print("\n--- One-Class SVM (Method 2) Evaluation ---")
print("AUPRC:", f"{svm_auprc_2:.4f}")
print(classification_report(y_test, y_pred_svm_mapped_2))



In [None]:
# Rows of the full dataset that are labelled as fraud.
fraud_df = df.loc[df['isFraud'] == 1]

# Bar chart of fraudulent transactions per type, most frequent type first.
fraud_type_order = fraud_df['type'].value_counts().index
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=fraud_df, x='type', order=fraud_type_order)
ax.set_title('Distribution of Fraudulent Transaction Types')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of transaction amounts on log-log axes.
# The bin edges are spaced logarithmically so that every bar has the same
# visual width on the log x-axis; equal-width linear bins would squeeze almost
# all of the data into the first bar. Non-positive amounts cannot be placed on
# a log axis, so only strictly positive amounts are binned.
positive_amounts = filtered_df.loc[filtered_df['amount'] > 0, 'amount']
amount_bin_edges = np.logspace(
    np.log10(positive_amounts.min()),
    np.log10(positive_amounts.max()),
    51,  # 51 edges -> 50 bins, matching the original bin count
)

plt.figure(figsize=(12, 6))
plt.hist(positive_amounts, bins=amount_bin_edges, log=True)
plt.xscale('log')
plt.title('Distribution of Transaction Amounts (Log Scale)')
plt.xlabel('Transaction Amount (Log Scale)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Box plot comparing transaction amounts between the two isFraud classes.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=filtered_df, x='isFraud', y='amount')
box_ax.set_title('Distribution of Transaction Amounts by Fraud Status')
box_ax.set_xlabel('Is Fraudulent (0: No, 1: Yes)')
box_ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Violin plot (with quartile lines) comparing transaction amounts between the
# two isFraud classes.
plt.figure(figsize=(10, 6))
violin_ax = sns.violinplot(data=filtered_df, x='isFraud', y='amount', inner='quartile')
violin_ax.set_title('Distribution of Transaction Amounts by Fraud Status (Violin Plot)')
violin_ax.set_xlabel('Is Fraudulent (0: No, 1: Yes)')
violin_ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Scatter plot of originator balance before vs. after the transaction on a
# reproducible random sample (at most 10,000 rows, so a smaller frame does not
# make sample() raise).
sample_df = filtered_df.sample(n=min(10000, len(filtered_df)), random_state=42)
plt.figure(figsize=(12, 8))
sns.scatterplot(data=sample_df, x='oldbalanceOrg', y='newbalanceOrig', hue='isFraud', alpha=0.6, s=10)
plt.title('Originator Balance: Old vs. New (with Fraud Highlight) - Sampled Data')
plt.xlabel('Old Balance (Originator)')
plt.ylabel('New Balance (Originator)')
# 'symlog' instead of 'log': a pure log axis cannot display zero, so every
# point with a zero old or new balance would silently vanish from the plot.
# symlog is linear around zero and logarithmic elsewhere, keeping those points.
plt.xscale('symlog')
plt.yscale('symlog')
plt.show()



In [None]:
# Hexbin density of originator balances (log10-transformed).
# `filtered_df_nonzero` was referenced here without ever being defined, which
# raised a NameError. log10 is only finite for strictly positive values, so the
# subset keeps the rows where both balances are greater than zero.
filtered_df_nonzero = filtered_df[
    (filtered_df['oldbalanceOrg'] > 0) & (filtered_df['newbalanceOrig'] > 0)
]

plt.figure(figsize=(12, 8))
plt.hexbin(np.log10(filtered_df_nonzero['oldbalanceOrg']), np.log10(filtered_df_nonzero['newbalanceOrig']), gridsize=100, cmap='Blues', norm='log')
plt.title('Originator Balance: Old vs. New (Hex Plot - Log10 Transformed)')
plt.xlabel('Log10(Old Balance (Originator))')
plt.ylabel('Log10(New Balance (Originator))')
plt.colorbar(label='Transaction Count (Log Scale)')
plt.show()

In [None]:
# Overlaid density histograms of errorBalanceDest for normal vs. fraudulent
# rows, zoomed to a narrow window around zero.
# The bins are laid out over the displayed window via `binrange`, so both
# classes share the same 50 edges. Without it the bins span each class's full
# value range and only a sliver of them survives the xlim crop.
# NOTE(review): with binrange, values outside the window are not binned, so the
# densities describe the rows inside the window — confirm this is the intended reading.
error_dest_window = (-10000, 10000)

plt.figure(figsize=(12, 6))
sns.histplot(data=filtered_df[filtered_df['isFraud'] == 0], x='errorBalanceDest', bins=50, binrange=error_dest_window, alpha=0.7, label='Normal', stat='density', common_norm=False)
sns.histplot(data=filtered_df[filtered_df['isFraud'] == 1], x='errorBalanceDest', bins=50, binrange=error_dest_window, alpha=0.7, label='Fraudulent', stat='density', common_norm=False)
plt.title('Distribution of errorBalanceDest by Fraud Status (Density Plot)')
plt.xlabel('Error in Destination Balance')
plt.ylabel('Density')
plt.xlim(error_dest_window)
plt.legend()
plt.show()

In [None]:
# Count histogram of errorBalanceDest restricted to the fraudulent rows.
fraud_rows = filtered_df.loc[filtered_df['isFraud'] == 1]

plt.figure(figsize=(12, 6))
fraud_hist_ax = sns.histplot(data=fraud_rows, x='errorBalanceDest', bins=50, alpha=0.7, label='Fraudulent', color='orangered')
fraud_hist_ax.set_title('Distribution of errorBalanceDest for Fraudulent Transactions')
fraud_hist_ax.set_xlabel('Error in Destination Balance')
fraud_hist_ax.set_ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Overlaid density histograms of errorBalanceDest for normal vs. fraudulent
# rows over a wide window (+/- 5 million).
# The bins are laid out over the displayed window via `binrange`, so both
# classes share the same 50 edges. Without it each class gets its own edges
# over its full value range and only a few coarse bars fall inside the xlim crop.
# NOTE(review): with binrange, values outside the window are not binned, so the
# densities describe the rows inside the window — confirm this is the intended reading.
error_dest_window = (-5e6, 5e6)

plt.figure(figsize=(12, 6))
sns.histplot(data=filtered_df[filtered_df['isFraud'] == 0], x='errorBalanceDest', bins=50, binrange=error_dest_window, alpha=0.7, label='Normal', stat='density', common_norm=False)
sns.histplot(data=filtered_df[filtered_df['isFraud'] == 1], x='errorBalanceDest', bins=50, binrange=error_dest_window, alpha=0.7, label='Fraudulent', stat='density', common_norm=False, color='orangered')
plt.title('Distribution of errorBalanceDest by Fraud Status (Density Plot)')
plt.xlabel('Error in Destination Balance')
plt.ylabel('Density')
plt.xlim(error_dest_window)
plt.legend()
plt.show()

## Visualize transaction count by hour of day

### Subtask:
Create a line plot showing the number of transactions per hour of the day.


**Reasoning**:
I need to group the filtered dataframe by 'hourOfDay' and count the transactions to create a line plot showing the number of transactions per hour.



In [None]:
# Number of (TRANSFER / CASH_OUT) transactions that fall into each hourOfDay value.
transactions_per_hour = filtered_df.groupby('hourOfDay').size()

hours = transactions_per_hour.index
counts = transactions_per_hour.values

plt.figure(figsize=(12, 6))
plt.plot(hours, counts)
plt.xlabel('Hour of Day')
plt.ylabel('Number of Transactions')
plt.title('Transaction Count by Hour of Day')
plt.grid(True)
# Put a tick on every hour from 0 to 23.
plt.xticks(range(0, 24))
plt.show()

## Visualize fraudulent transaction count by hour of day

### Subtask:
Create a line plot showing the number of fraudulent transactions per hour of the day.


**Reasoning**:
Filter the DataFrame for fraudulent transactions, group by hour, count, and then create the line plot according to the instructions.



In [None]:
# Number of fraudulent transactions per hourOfDay value.
# groupby().size() omits hours in which no fraud occurred; reindexing onto the
# full 0..23 range fills those hours with 0 so the line drops to zero there
# instead of being drawn straight across the gap.
fraud_by_hour = (
    filtered_df[filtered_df['isFraud'] == 1]
    .groupby('hourOfDay')
    .size()
    .reindex(range(24), fill_value=0)
)

plt.figure(figsize=(12, 6))
plt.plot(fraud_by_hour.index, fraud_by_hour.values)
plt.title('Fraudulent Transaction Count by Hour of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Fraudulent Transactions')
plt.xticks(range(0, 24)) # Ensure all hours from 0 to 23 are displayed on the x-axis
plt.grid(True)
plt.show()

## Visualize correlation matrix

### Subtask:
Generate a heatmap of the correlation matrix of the numerical features to understand the relationships between variables.


**Reasoning**:
Calculate the correlation matrix for the numerical features and create a heatmap to visualize the relationships.



In [None]:
# Pairwise correlations between the numerical features only.
# X_processed_df also carries the one-hot `type` columns; they are left out so
# the heatmap matches its title and the stated goal of this step.
correlation_matrix = X_processed_df[numerical_features].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Numerical Features')
plt.show()

## Summary:

### Data Analysis Key Findings

*   The distribution of transaction types is skewed, with certain types occurring more frequently than others.
*   Fraudulent transactions are concentrated within specific transaction types, namely CASH_OUT and TRANSFER.
*   Transaction amounts have a wide range, necessitating logarithmic scales for effective visualization of their distribution.
*   The distribution of transaction amounts differs between fraudulent and non-fraudulent transactions, as the box and violin plots of amount by fraud status show.
*   Visualizing the relationship between old and new originator balances, using a sampled scatter plot and a log-transformed hexbin density plot, reveals patterns in how balances change during transactions. (Destination balances are examined only through `errorBalanceDest`.)
*   The calculated error in originator and destination balances (`errorBalanceOrg` and `errorBalanceDest`) shows distinct distributions for fraudulent transactions compared to normal ones. `errorBalanceDest` for normal transactions is heavily concentrated at zero, while fraudulent transactions show a wider spread.
*   The volume of transactions varies by hour of day, showing peaks and troughs.
*   Fraudulent transactions also show a pattern related to the hour of the day, concentrating during specific hours.
*   The correlation matrix reveals the linear relationships between numerical features, highlighting potentially correlated variables.

### Insights or Next Steps

*   Focus fraud detection efforts on CASH\_OUT and TRANSFER transaction types, as these account for the majority of fraudulent activity.
*   Investigate the distinct patterns observed in transaction amounts and balance errors for fraudulent transactions further, as these features appear to be strong indicators of fraud.
