In [None]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Show every column when a DataFrame is displayed (None removes the cap).
pd.options.display.max_columns = None


In [None]:
# Read the raw transaction CSV (path is relative to the working directory)
# and print a structural summary of the resulting frame.
file_path = '../../data/raw/transaction_data.csv'
transaction_data = pd.read_csv(filepath_or_buffer=file_path)

print("Basic Info of Transaction Data:")
transaction_data.info()

In [None]:
# Preview the top of the frame under a heading; head() defaults to 5 rows.
print(
    "\nFirst 5 Rows of Transaction Data:",
    transaction_data.head(),
    sep="\n",
)

In [None]:
# Count the null cells in every column (isna() is the alias of isnull()).
print("\nMissing Values Count in Each Column:")
missing_per_column = transaction_data.isna().sum()
print(missing_per_column)

In [None]:
# Summary statistics as produced by DataFrame.describe() with its defaults.
print("\nDescriptive Statistics for Numeric Columns:")
numeric_summary = transaction_data.describe()
print(numeric_summary)

In [None]:
# Report how many distinct (non-null) values each object-dtype column holds.
print("\nUnique Values in Each Categorical Column:")
categorical_columns = transaction_data.select_dtypes(include='object').columns

# One vectorised nunique() over the selected columns, then print per column.
unique_counts = transaction_data[categorical_columns].nunique()
for col, n_unique in unique_counts.items():
    print(f"Unique values in {col}: {n_unique}")

In [None]:
# Count rows that are exact repeats of an earlier row.
print("\nChecking for Duplicates:")
duplicate_row_count = transaction_data.duplicated().sum()
print(f"Number of duplicated rows: {duplicate_row_count}")

In [None]:
# Impute missing numeric values with the column median, then drop rows that
# are exact duplicates of an earlier row.
#
# numeric_only=True is required: on pandas >= 2.0 DataFrame.median() raises
# TypeError when the frame contains non-numeric columns (this frame has
# string columns such as transaction_type). fillna() with the resulting
# Series only touches the columns it names, so non-numeric columns keep
# their missing values.
#
# The result is reassigned instead of mutated in place so the cell can be
# re-run safely and the steps read as a single pipeline.
print("\nHandling Missing Values:")
numeric_medians = transaction_data.median(numeric_only=True)
transaction_data = transaction_data.fillna(numeric_medians).drop_duplicates()
print(f"Number of duplicated rows after removal: {transaction_data.duplicated().sum()}")

In [None]:
# Heatmap of the boolean null mask: one row per record, one column per field.
fig, ax = plt.subplots(figsize=(10, 6))
null_mask = transaction_data.isnull()
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# Histogram (30 bins) of transaction_amount with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=transaction_data,
    x='transaction_amount',
    kde=True,
    color='blue',
    bins=30,
    ax=ax,
)
ax.set_title('Distribution of Transaction Amount')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Single horizontal boxplot of transaction_amount to expose outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=transaction_data, x='transaction_amount', ax=ax)
ax.set_title('Boxplot of Transaction Amount')
plt.show()

In [None]:
# One box per transaction_type, comparing the spread of transaction_amount.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=transaction_data,
    x='transaction_type',
    y='transaction_amount',
    ax=ax,
)
ax.set_title('Transaction Amount by Transaction Type')
# Tilt the category labels so longer names do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Parse transaction_date (strict YYYY-MM-DD) and derive calendar parts from it.
# assign() evaluates its keyword arguments in order, so the later lambdas
# already see the parsed datetime column.
transaction_data = transaction_data.assign(
    transaction_date=lambda frame: pd.to_datetime(
        frame['transaction_date'], format='%Y-%m-%d'
    ),
    year=lambda frame: frame['transaction_date'].dt.year,
    month=lambda frame: frame['transaction_date'].dt.month,
    day=lambda frame: frame['transaction_date'].dt.day,
)

In [None]:
# Row count per transaction_date, drawn as a line over time.
transactions_per_day = transaction_data.groupby('transaction_date').size()

fig, ax = plt.subplots(figsize=(12, 6))
transactions_per_day.plot(ax=ax)
ax.set_title('Number of Transactions Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Transactions')
ax.grid(True)
plt.show()

In [None]:
# Sum of transaction_amount per transaction_date, drawn as a line over time.
amount_by_date = transaction_data.groupby('transaction_date')['transaction_amount']
daily_transaction_amount = amount_by_date.sum()

fig, ax = plt.subplots(figsize=(12, 6))
daily_transaction_amount.plot(ax=ax)
ax.set_title('Daily Total Transaction Amount Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Transaction Amount')
ax.grid(True)
plt.show()

In [None]:
# The ten most common transaction_type values (value_counts sorts descending).
type_frequencies = transaction_data['transaction_type'].value_counts()
top_10_transaction_types = type_frequencies.head(10)

fig, ax = plt.subplots(figsize=(10, 6))
top_10_transaction_types.plot.bar(ax=ax)
ax.set_title('Top 10 Most Frequent Transaction Types')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
#
# numeric_only=True is required: on pandas >= 2.0 DataFrame.corr() raises
# ValueError when the frame contains non-numeric columns (this frame has
# string columns such as transaction_type). The keyword is available from
# pandas 1.5 onwards.
plt.figure(figsize=(10, 6))
correlation_matrix = transaction_data.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Optional plot: only drawn when the dataset carries a customer_segment column.
has_customer_segment = 'customer_segment' in transaction_data.columns
if has_customer_segment:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(
        data=transaction_data,
        x='customer_segment',
        y='transaction_amount',
        ax=ax,
    )
    ax.set_title('Transaction Amount by Customer Segment')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Total transaction_amount per (year, month), kept as flat columns so the
# frame can be passed straight to seaborn; one line per year.
monthly_transaction_amount = (
    transaction_data
    .groupby(['year', 'month'], as_index=False)['transaction_amount']
    .sum()
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=monthly_transaction_amount,
    x='month',
    y='transaction_amount',
    hue='year',
    marker='o',
    ax=ax,
)
ax.set_title('Monthly Transaction Amount Trends Across Years')
ax.set_xlabel('Month')
ax.set_ylabel('Transaction Amount')
plt.show()

In [None]:
# Write the current state of transaction_data to the processed-data folder,
# without the index column.
output_file_path = '../../data/processed/cleaned_transaction_data.csv'

# to_csv() does not create missing directories and fails with OSError if the
# target folder is absent, so make sure it exists first.
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

transaction_data.to_csv(output_file_path, index=False)
print("\nCleaned Transaction Data saved to:", output_file_path)