In [None]:
# Imports & Setup

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_cleaner import clean_data
from src.feature_engineer import create_features

%matplotlib inline
sns.set(style="whitegrid")


In [None]:

# Build the analysis frame: run the project's clean_data on the sample CSV,
# then pass the result through create_features.
df = create_features(clean_data("data/sample_data.csv"))

# Preview the first rows of the resulting frame.
df.head()


In [None]:
# Summary statistics across every column of df (include="all" covers
# non-numeric columns as well as numeric ones).
df.describe(include="all")


In [None]:
# RFM distributions: one histogram with a KDE overlay per metric, side by side.
# Each entry is (column in df, bar colour, panel title).
rfm_panels = [
    ('Recency', 'skyblue', 'Recency Distribution'),
    ('Frequency', 'salmon', 'Frequency Distribution'),
    ('Monetary', 'lightgreen', 'Monetary (Revenue) Distribution'),
]

fig, axes = plt.subplots(nrows=1, ncols=len(rfm_panels), figsize=(18, 4))
for panel_ax, (metric, shade, heading) in zip(axes, rfm_panels):
    sns.histplot(df[metric], kde=True, ax=panel_ax, color=shade)
    panel_ax.set_title(heading)

# Tighten spacing between the three panels before rendering.
fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the numeric columns of df, drawn as an
# annotated heatmap (two-decimal cell labels).
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=corr_ax)
corr_ax.set_title("Feature Correlations")
plt.show()


In [None]:
# Spread of Monetary per category: one box-plot figure per grouping column.
# Each entry is (grouping column in df, figure title).
revenue_groupings = (
    ('Gender', "Revenue by Gender"),
    ('Country', "Revenue by Country"),
)

for group_col, heading in revenue_groupings:
    box_fig, box_ax = plt.subplots(figsize=(12, 5))
    sns.boxplot(data=df, x=group_col, y='Monetary', ax=box_ax)
    box_ax.set_title(heading)

# Render both figures.
plt.show()


In [None]:
# Retention insights: average Monetary per Recency bucket.

# Bucket edges and their labels. The labels suggest Recency is measured in
# days -- TODO confirm against create_features.
# The last edge is open-ended so no large Recency value falls outside the bins.
recency_edges = [0, 30, 90, 180, 365, float("inf")]
recency_labels = ['<1M', '1-3M', '3-6M', '6-12M', '>1Y']

# pd.cut builds right-closed intervals, so without include_lowest=True a
# Recency of exactly 0 would land in no bin, become NaN, and silently drop
# out of the averages below.
df['RecencyBucket'] = pd.cut(
    df['Recency'],
    bins=recency_edges,
    labels=recency_labels,
    include_lowest=True,
)

# observed=False is spelled out so every bucket appears on the chart even when
# it has no rows, and so the result does not depend on the pandas default
# (which newer pandas versions warn about and plan to change).
recency_stats = (
    df.groupby('RecencyBucket', observed=False)['Monetary']
    .mean()
    .reset_index()
)

sns.barplot(x='RecencyBucket', y='Monetary', data=recency_stats)
plt.title("Avg Revenue by Recency Bucket")
plt.show()
