In [None]:
# Cell 1 [Imports and Setup]
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# This ensures plots display inline in Jupyter
%matplotlib inline

# Import the data loading function from project module
from src.data_collection import get_sample_df


In [None]:
# Cell 2 [Load Data]
# Pull the dataset in through the project's loader helper
df = get_sample_df()

# Preview the leading five records using the notebook's rich renderer
display(df.head(n=5))


In [None]:
# Cell 3 [Basic Dataset Information]
# Structural overview of the frame: column dtypes, non-null counts, and memory usage.
# info() prints its report directly, so it does not need to be the last expression.
df.info()

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Left as the final bare expression so the notebook renders it as the cell's output.
df.describe()


In [None]:
# Cell 4 [Class Distribution]
# Count rows per class once and reuse the result for both the printout and the chart.
label_counts = df['label'].value_counts()

print("Class Distribution:")
print(label_counts)

# value_counts() orders by frequency, so sort by label value before plotting:
# this keeps the bar order and the positional colours tied to the same class
# regardless of which class happens to be more common in the data.
fig, ax = plt.subplots()
label_counts.sort_index().plot(
    kind='bar',
    color=['skyblue', 'salmon'],
    ax=ax
)
ax.set_title("Class Distribution (0 = Legitimate, 1 = Suspicious)")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Cell 5 [Distribution of Transaction Amount]
# Histogram of the 'amount' column (20 bins), drawn on an explicitly created Axes
fig, ax = plt.subplots()
ax.hist(df['amount'], bins=20, color='steelblue', edgecolor='black')
ax.set_title("Distribution of Transaction Amounts")
ax.set_xlabel("Transaction Amount")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Cell 6 [Transaction Hour Analysis]
# Parse the timestamp column as datetimes. Re-parsing an already-datetime column
# is harmless, so this cell can be re-run safely.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Derive the hour-of-day for every transaction; the correlation cell reads this column.
df['txn_hour'] = df['timestamp'].dt.hour

# Count transactions per hour. Reindex over the full 0-23 range so that hours with
# no transactions appear as zero-height bars: a bar chart's x-axis is categorical,
# so missing hours would otherwise vanish and the remaining bars would close ranks.
hourly_counts = (
    df['txn_hour']
    .value_counts()
    .reindex(range(24), fill_value=0)
)

fig, ax = plt.subplots()
hourly_counts.plot(
    kind='bar',
    color='teal',
    ax=ax
)
ax.set_title("Number of Transactions by Hour of Day")
ax.set_xlabel("Hour of Day (0-23)")
ax.set_ylabel("Number of Transactions")
plt.show()


In [None]:
# Cell 7 [Correlation Heatmap]
# Pairwise correlation between the selected numeric columns.
# 'txn_hour' is the derived column added in the transaction-hour cell.
num_cols = ['amount', 'txn_hour']
corr_matrix = df[num_cols].corr()

# Pin the colour scale to the full correlation range [-1, 1] so the diverging
# 'coolwarm' map is centred on zero. With autoscaling the scale would stretch from
# the smallest observed coefficient up to the diagonal's 1.0, and a weak
# correlation would be painted as if it were strongly negative.
fig, ax = plt.subplots()
heatmap = ax.matshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(heatmap, ax=ax)

ax.set_xticks(range(len(num_cols)))
ax.set_xticklabels(num_cols, rotation=45)
ax.set_yticks(range(len(num_cols)))
ax.set_yticklabels(num_cols)

# Write each coefficient into its cell so values can be read without the colour bar.
for row_pos in range(len(num_cols)):
    for col_pos in range(len(num_cols)):
        ax.text(
            col_pos,
            row_pos,
            f"{corr_matrix.iloc[row_pos, col_pos]:.2f}",
            ha='center',
            va='center'
        )

ax.set_title("Correlation Heatmap", pad=20)
plt.show()


In [None]:
# Cell 8 [Top Senders and Receivers]
# Two side-by-side panels: most frequent senders (left) and receivers (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (column, bar colour, panel title, x-axis label) for each panel, left to right
panel_specs = [
    ('sender_id', 'steelblue', "Top 10 Senders", "Sender ID"),
    ('receiver_id', 'darkorange', "Top 10 Receivers", "Receiver ID"),
]

for panel, (id_col, bar_color, panel_title, x_label) in zip(axes, panel_specs):
    # value_counts() sorts by frequency, so head(10) keeps the ten most common IDs
    df[id_col].value_counts().head(10).plot(kind='bar', ax=panel, color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel("Frequency")

plt.tight_layout()
plt.show()


In [None]:
# Cell 9 [Transaction Amount by Label]
# One box per label value showing the spread of 'amount'
df.boxplot(column='amount', by='label', grid=False)

# Finish styling through the object interface, using the figure/axes pandas just drew on
plt.gcf().suptitle("")  # blank out the automatic 'Boxplot grouped by' title
ax = plt.gca()
ax.set_title("Transaction Amount Distribution by Label")
ax.set_xlabel("Label (0 = Legitimate, 1 = Suspicious)")
ax.set_ylabel("Transaction Amount")
plt.show()
