# Explore Raw Data
Load and analyze synthetic raw data sources for cross-border e-commerce risk control.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Read the three raw CSV extracts (transactions, users, merchants) from the
# raw-data directory, then report each table's (rows, columns) shape.
raw_path = Path('data/raw')
df_transactions = pd.read_csv(raw_path.joinpath('transactions.csv'))
df_users = pd.read_csv(raw_path.joinpath('users.csv'))
df_merchants = pd.read_csv(raw_path.joinpath('merchants.csv'))

# Quick sanity check that every file loaded and has the expected size.
print('Transactions shape:', df_transactions.shape)
print('Users shape:', df_users.shape)
print('Merchants shape:', df_merchants.shape)

In [None]:
# Summary statistics for the transactions table.
print(df_transactions.describe())

# Per-column count of missing values.
# The line breaks around the heading are written as "\n" escapes inside a
# one-line literal: a raw line break inside a single-quoted string is a
# SyntaxError and would stop this cell from running at all.
print('\nMissing values in transactions:\n', df_transactions.isnull().sum())

# Histogram of the 'amount' column, using 50 bins.
plt.figure(figsize=(10, 6))
sns.histplot(df_transactions['amount'], bins=50)
plt.title('Transaction Amount Distribution')
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Cross-border transactions: keep the rows whose 'is_cross_border' value
# compares equal to True.
# NOTE(review): the explicit `== True` is kept on purpose; the column's dtype
# is not visible here, and the comparison always yields a boolean mask.
cross_border = df_transactions[df_transactions['is_cross_border'] == True]  # noqa: E712

n_cross_border = len(cross_border)
n_transactions = len(df_transactions)
# Guard the share computation: with an empty transactions table the division
# would raise ZeroDivisionError, so report 0.0% instead.
cross_border_pct = n_cross_border / n_transactions * 100 if n_transactions else 0.0
print(f'Cross-border transactions: {n_cross_border} / {n_transactions} ({cross_border_pct:.1f}%)')

# User risk tier distribution: one bar per distinct 'user_risk_tier' value.
plt.figure(figsize=(8, 6))
sns.countplot(data=df_users, x='user_risk_tier')
plt.title('User Risk Tier Distribution')
plt.show()