# Load data

# Exploratory data analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
# The analysis below expects a DataFrame named 'data' to be defined before it runs.
# Adjust the file path and column names to match your data structure.

# Example code for loading data:
# data = pd.read_csv('your_data.csv')

# Perform EDA
# Headline dispute counts and rates.
# NOTE(review): assumes 'dispute' is a 0/1 flag on each row — confirm against the data.
is_disputed = data['dispute'] == 1
total_users = data['user_id'].nunique()

# Count of disputes and disputers
total_disputes = data['dispute'].sum()
# A disputer is a user with at least one disputed row. Counting every distinct
# user here would make dispute_rate_users always equal to 1.
total_disputers = data.loc[is_disputed, 'user_id'].nunique()
dispute_rate_payments = total_disputes / data['payment_id'].nunique()
dispute_rate_users = total_disputers / total_users

# Total amount of disputes in USD, and its share of the overall amount
total_dispute_amount_usd = data.loc[is_disputed, 'amount_usd'].sum()
total_amount_usd = data['amount_usd'].sum()
usd_ratio = total_dispute_amount_usd / total_amount_usd

# Bar chart of the mean of 'dispute' for every distinct value of 'day'.
# NOTE(review): no filtering to specific days (e.g. 7, 28, 60) happens here;
# every day present in the data is plotted.
dispute_rate_by_day = data.groupby('day', as_index=False)['dispute'].mean()
sns.barplot(data=dispute_rate_by_day, x='day', y='dispute').set(
    title='Dispute Rate by Day',
    xlabel='Day',
    ylabel='Dispute Rate',
)
plt.show()

In [None]:


# Evolution over time
# Assuming you have a column 'date' representing the date of each transaction
data['date'] = pd.to_datetime(data['date'])
data['date_month'] = data['date'].dt.to_period('M')

# Mean of 'dispute' per calendar month.
evolution_disputes_over_time = data.groupby('date_month')['dispute'].mean().reset_index()

# Plot against month-start timestamps rather than Period objects: Period values
# are not a numeric or datetime dtype, and passing them to the line plot can
# fail with a TypeError. The stored 'date_month' columns are left as Periods.
month_start = evolution_disputes_over_time['date_month'].dt.to_timestamp()

plt.figure(figsize=(10, 6))
sns.lineplot(x=month_start, y=evolution_disputes_over_time['dispute'])
plt.title('Evolution of Dispute Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Dispute Rate')
plt.xticks(rotation=45)
plt.show()

# Correlation analysis
# Assuming all variables are already converted into binary or broken down into quantiles
# Restrict the correlation to numeric and boolean columns. 'data' also holds
# non-numeric columns (e.g. the datetime 'date' and Period 'date_month'), and
# recent pandas versions raise on those instead of silently dropping them.
numeric_data = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Calculate dispute rate across each dummy variable
# Replace the placeholder names with the real dummy/quantile column names.
# Every entry must be a column of 'data'; a literal `...` in this list would be
# passed to groupby as a key and fail.
dummy_variables = ['dummy_var1', 'dummy_var2']  # List of dummy variables

# Maps each variable to a Series holding the mean of 'dispute' per level.
dispute_rate_by_dummy = {
    var: data.groupby(var)['dispute'].mean() for var in dummy_variables
}

# Rank dummy variables by dispute rate
# Each value is a Series (one rate per level) and Series cannot be ordered
# against each other directly, so rank by the highest rate any level reaches.
sorted_dummy_vars = sorted(
    dispute_rate_by_dummy.items(),
    key=lambda item: item[1].max(),
    reverse=True,
)
print("Ranking of dummy variables by dispute rate:")
for var, dispute_rate in sorted_dummy_vars:
    print(f"{var}: {dispute_rate}")

# Time between payments and instalments
# Calculate time differences between payment and instalment, in whole days.
# Parse both columns as datetimes first: columns that are still strings would
# make the subtraction fail. Already-parsed columns pass through unchanged.
instalment_dates = pd.to_datetime(data['instalment_date'])
payment_dates = pd.to_datetime(data['payment_date'])
data['time_diff'] = (instalment_dates - payment_dates).dt.days

# Summary statistics and histogram
print("Summary statistics of time differences:")
print(data['time_diff'].describe())
plt.figure(figsize=(10, 6))
sns.histplot(data['time_diff'], bins=20, kde=True)
plt.title('Distribution of Time Differences')
plt.xlabel('Time Difference (days)')
plt.ylabel('Frequency')
plt.show()
