In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide plot defaults: seaborn's "whitegrid" theme plus a larger
# default figure size. `set_theme` is the preferred name for what `sns.set`
# does (`sns.set` is only kept as an alias) and accepts the same arguments.
sns.set_theme(style="whitegrid")
# Set after the theme call so this is the last change made to rcParams.
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:

# Hand-written sample of eight transactions used to demonstrate the analysis
# in the cells below (replace this cell with a loader for the real data).
mock_columns = {
    # Identifier columns: consecutive integers, one distinct value per row.
    'TransactionId': list(range(1, 9)),
    'BatchId': list(range(101, 109)),
    'AccountId': list(range(201, 209)),
    'SubscriptionId': list(range(301, 309)),
    'CustomerId': list(range(401, 409)),
    # Currency and country codes follow short repeating patterns.
    'CurrencyCode': ['USD', 'EUR', 'USD', 'GBP'] * 2,
    'CountryCode': [1, 44] * 4,
    'ProviderId': list(range(501, 509)),
    'ProductId': list(range(601, 609)),
    'ProductCategory': [
        'Electronics', 'Fashion', 'Electronics', 'Books',
        'Electronics', 'Fashion', 'Books', 'Electronics',
    ],
    'ChannelId': [
        'Web', 'Android', 'IOS', 'PayLater',
        'Checkout', 'Web', 'Android', 'IOS',
    ],
    # 'Amount' is signed; 'Value' carries the same magnitudes without the sign.
    'Amount': [200, -150, 300, -50, 500, -300, 100, -250],
    'Value': [200, 150, 300, 50, 500, 300, 100, 250],
    # One timestamp per row, parsed into pandas datetimes.
    'TransactionStartTime': pd.to_datetime([
        '2022-01-01 10:00',
        '2022-01-02 11:00',
        '2022-01-03 12:00',
        '2022-01-04 13:00',
        '2022-01-05 14:00',
        '2022-01-06 15:00',
        '2022-01-07 16:00',
        '2022-01-08 17:00',
    ]),
    'PricingStrategy': ['Discount', 'FullPrice'] * 4,
    # 1 means fraud detected, 0 means no fraud.
    'FraudResult': [0, 1, 0, 0, 0, 1, 0, 0],
}
data = pd.DataFrame(mock_columns)

In [None]:

# Display basic information about the dataset.
# `DataFrame.info()` prints its report itself, so the call is made purely for
# that printed output and nothing is assigned.
data.info()

In [None]:

# Display summary statistics of numerical features.
# NOTE: the identifier columns (TransactionId, BatchId, AccountId, ...),
# CountryCode and FraudResult are stored as integers in this dataset, so they
# are summarised here too; their mean/std are not meaningful quantities.
# Kept as the cell's last expression so the notebook renders the result.
data.describe()

In [None]:

# Distribution of the signed 'Amount' column: histogram with a KDE curve
# overlaid, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.histplot(data['Amount'], kde=True, color="blue", ax=ax)
ax.set_title('Distribution of Transaction Amount')
plt.show()

In [None]:
# Distribution of the 'Value' column: histogram with a KDE curve overlaid,
# drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.histplot(data['Value'], kde=True, color="green", ax=ax)
ax.set_title('Distribution of Transaction Value')
plt.show()

In [None]:

# Visualize categorical features: number of transactions per currency code.
# `hue` is assigned to the same column as `x` so every bar keeps its own
# "Set2" colour; seaborn 0.13 deprecated passing `palette` without `hue`.
# `dodge=False` keeps the bars full width on releases that would otherwise
# dodge them by hue.
ax = sns.countplot(data=data, x='CurrencyCode', hue='CurrencyCode', palette="Set2", dodge=False)
# A legend would only repeat the x-axis labels, so drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Distribution of Currency Code')
plt.show()

In [None]:

# Number of transactions per product category.
# `hue` is assigned to the same column as `x` so every bar keeps its own
# "Set2" colour; seaborn 0.13 deprecated passing `palette` without `hue`.
# `dodge=False` keeps the bars full width on releases that would otherwise
# dodge them by hue.
ax = sns.countplot(data=data, x='ProductCategory', hue='ProductCategory', palette="Set2", dodge=False)
# A legend would only repeat the x-axis labels, so drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Distribution of Product Categories')
plt.show()

In [None]:

# Number of transactions per channel.
# `hue` is assigned to the same column as `x` so every bar keeps its own
# "Set2" colour; seaborn 0.13 deprecated passing `palette` without `hue`.
# `dodge=False` keeps the bars full width on releases that would otherwise
# dodge them by hue.
ax = sns.countplot(data=data, x='ChannelId', hue='ChannelId', palette="Set2", dodge=False)
# A legend would only repeat the x-axis labels, so drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Distribution of Channels Used')
plt.show()

In [None]:

# Correlation between the two monetary columns, shown as an annotated heatmap
# (cell values formatted to two decimals).
corr_matrix = data.loc[:, ['Amount', 'Value']].corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix between Amount and Value')
plt.show()

In [None]:

# Tally the missing entries in every column and print the tally under a heading.
missing_values = data.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

In [None]:

# Side-by-side boxplots of 'Amount' and 'Value' to eyeball potential outliers,
# on a figure slightly shorter than the notebook default.
fig, ax = plt.subplots(figsize=(12, 6))
monetary_columns = data[['Amount', 'Value']]
sns.boxplot(data=monetary_columns, ax=ax)
ax.set_title('Boxplot of Amount and Value')
plt.show()

In [None]:

# Visualizing fraud cases: number of transactions per FraudResult label
# (1 = fraud detected, 0 = no fraud).
# `hue` is assigned to the same column as `x` so each bar keeps its own
# "Set1" colour; seaborn 0.13 deprecated passing `palette` without `hue`.
# `dodge=False` keeps the bars full width on releases that would otherwise
# dodge them by hue.
ax = sns.countplot(data=data, x='FraudResult', hue='FraudResult', palette="Set1", dodge=False)
# A legend would only repeat the x-axis labels, so drop it if one was drawn.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Distribution of Fraud Results')
plt.show()

In [None]:

# Example derived feature. Despite its name, 'TransactionDuration' holds the
# number of seconds between each transaction's start time and the earliest
# start time in the dataset, so the earliest row gets 0.0.
start_times = data['TransactionStartTime']
elapsed_since_first = start_times - start_times.min()
data['TransactionDuration'] = elapsed_since_first.dt.total_seconds()


In [None]:

# Distribution of the derived 'TransactionDuration' column: histogram with a
# KDE curve overlaid, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.histplot(data['TransactionDuration'], kde=True, color="purple", ax=ax)
ax.set_title('Distribution of Transaction Duration')
plt.show()

In [None]:

# Display the first few rows of the dataset to get a sense of the data.
# By this point `data` also carries the derived 'TransactionDuration' column.
# Kept as the cell's last expression so the notebook renders the returned rows.
data.head()