# In [None]:

# ----------------------------
# Import libraries
# ----------------------------
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Set non-interactive backend before pyplot import
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

# ----------------------------
# 1. Data Collection (Simulated)
# ----------------------------

# Calendar of every day in 2023. Customer join dates and transaction dates
# are both sampled (with replacement) from this index.
dates = pd.date_range(start='2023-01-01', end='2023-12-31')

# Customer demographics: 100 synthetic customers, assembled column by column.
# The random draws run in the order Age, Gender, Location, Join_Date.
customers = pd.DataFrame({'Customer_ID': range(100)})
customers['Age'] = np.random.randint(18, 70, size=100)  # upper bound is exclusive
customers['Gender'] = np.random.choice(['M', 'F'], size=100)
customers['Location'] = np.random.choice(
    ['New York', 'Texas', 'California'], size=100
)
customers['Join_Date'] = np.random.choice(dates, size=100)

# Store master data: one record per store, written out row-wise.
stores = pd.DataFrame(
    [
        (1, 'New York', 2000, 'Electronics'),
        (2, 'Texas', 1500, 'Apparel'),
        (3, 'California', 1800, 'Electronics'),
        (4, 'Texas', 2200, 'Apparel'),
    ],
    columns=['Store_ID', 'Location', 'Size_sqft', 'Category'],
)

# Transactions: 1000 synthetic sales, each pointing at a random store and a
# random customer. The random draws run in the order Store_ID, Customer_ID,
# Product_ID, Date, Quantity, Price.
sales_data = pd.DataFrame({'Transaction_ID': range(1000)})
sales_data['Store_ID'] = np.random.choice(stores['Store_ID'], size=1000)
sales_data['Customer_ID'] = np.random.choice(customers['Customer_ID'], size=1000)
sales_data['Product_ID'] = np.random.choice(['A', 'B', 'C', 'D'], size=1000)
sales_data['Date'] = np.random.choice(dates, size=1000)
sales_data['Quantity'] = np.random.randint(1, 5, size=1000)  # 1 to 4 units
sales_data['Price'] = np.random.uniform(10, 100, size=1000)

# ----------------------------
# 2. Data Preprocessing
# ----------------------------

# Revenue of each transaction (Quantity * Price)
sales_data['Total_Sales'] = sales_data['Quantity'] * sales_data['Price']

# Merge sales, store, and customer data for combined analysis.
# Both `stores` and `customers` carry a 'Location' column; the suffixes on the
# second merge turn them into 'Location_store' and 'Location_customer'.
merged_data = (
    sales_data
    .merge(stores, on='Store_ID')
    .merge(customers, on='Customer_ID', suffixes=('_store', '_customer'))
)

# Holiday flag (1/0) for Christmas and Black Friday.
# The dates are parsed to timestamps before the membership test: matching a
# datetime column against raw strings with `isin` is deprecated in pandas and
# will stop matching, which would silently zero out the flag. Normalizing
# the transaction dates to midnight keeps the comparison a pure date match.
holiday_dates = pd.to_datetime([
    '2023-12-25', '2023-11-24'  # Christmas, Black Friday
])
merged_data['Is_Holiday'] = (
    pd.to_datetime(merged_data['Date'])
    .dt.normalize()
    .isin(holiday_dates)
    .astype(int)
)

# Simulate weather data (temperature and rainfall).
# NOTE: values are drawn independently for every transaction row, so two sales
# on the same day in the same store do not share the same weather.
merged_data['Temperature'] = np.random.normal(70, 15, len(merged_data))  # Mean=70°F, Std=15
merged_data['Rainfall'] = np.random.exponential(0.2, len(merged_data))   # Exponential distribution

# Per-customer metrics: mean transaction value and number of transactions.
# Only customers with at least one transaction appear here.
customer_metrics = merged_data.groupby('Customer_ID').agg(
    Avg_Spend=('Total_Sales', 'mean'),
    Purchase_Frequency=('Transaction_ID', 'count')
).reset_index()

# Sales per square foot, computed per transaction (transaction revenue divided
# by the floor area of the store it occurred in).
merged_data['Sales_per_sqft'] = merged_data['Total_Sales'] / merged_data['Size_sqft']

# ----------------------------
# 3. Exploratory Data Analysis
# ----------------------------

# Plot 1: Monthly sales trends with holiday impact.
# Transactions are grouped by calendar month on the x-axis and summed, with
# one line per value of the Is_Holiday flag.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=merged_data,
    x=merged_data['Date'].dt.month,
    y='Total_Sales',
    hue='Is_Holiday',
    estimator='sum',
    ax=ax,
)
ax.set_title('Monthly Sales Trends with Holiday Impact')
ax.set_xlabel('Month')
ax.set_ylabel('Total Sales ($)')
fig.savefig('sales_trends.png')  # Written to disk; the Agg backend cannot display
plt.close(fig)

# Explanation: This plot shows the total monthly sales, distinguishing holiday vs. non-holiday sales. 
# Insight: Helps identify months with peak sales and the impact of holidays on revenue.

# Plot 2: Customer age, gender, and spending relationship.
# One point per transaction: customer age against transaction value,
# coloured by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_data,
    x='Age',
    y='Total_Sales',
    hue='Gender',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Customer Age/Gender vs Spending')
ax.set_ylabel('Total Sales per Transaction ($)')
fig.savefig('customer_segmentation.png')
plt.close(fig)

# Explanation: This scatterplot analyzes the spending patterns across different age groups and genders.
# Insight: Useful for understanding customer demographics and tailoring marketing strategies.

# ----------------------------
# 4. Advanced Analysis
# ----------------------------

# Regression Analysis: Impact of holiday, store size, and temperature on sales.
# Ordinary least squares of per-transaction revenue on the three predictors.
X = merged_data[['Is_Holiday', 'Size_sqft', 'Temperature']].astype(float)
X = sm.add_constant(X)  # Add an intercept column ('const')
y = merged_data['Total_Sales'].astype(float)

# Drop rows with a missing predictor or response. X and y are joined first so
# the same rows are removed from both.
valid_data = X.join(y).dropna()
X_clean = valid_data[X.columns]
y_clean = valid_data['Total_Sales']

# Fit regression model
model = sm.OLS(y_clean, X_clean).fit()
# The heading starts with a newline escape; a literal line break inside the
# quotes is a syntax error.
print("\nRegression Results:")
print(model.summary())

# Explanation: The regression evaluates the effect of holidays, store size, and temperature on sales.
# Insight: Provides statistical significance and coefficients for each factor.

# Customer Clustering: group customers by average spend and purchase frequency.
# Both features are standardized first so that neither dominates the
# Euclidean distances used by k-means.
scaler = StandardScaler()
scaled_metrics = scaler.fit_transform(customer_metrics[['Avg_Spend', 'Purchase_Frequency']])

# n_init is pinned explicitly: its default differs between scikit-learn
# releases (and warns on some), so leaving it implicit makes the clustering
# vary by installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
customer_metrics['Cluster'] = kmeans.fit_predict(scaled_metrics)

# Plot 3: Customer segmentation clusters.
# One point per customer, positioned by average spend and purchase count and
# coloured by the assigned cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=customer_metrics,
    x='Avg_Spend',
    y='Purchase_Frequency',
    hue='Cluster',
    palette='Set1',
    s=100,
    ax=ax,
)
ax.set_title('Customer Segmentation Clusters')
ax.set_xlabel('Average Spend ($)')
ax.set_ylabel('Purchase Frequency (count)')
fig.savefig('customer_clusters.png')
plt.close(fig)

# Explanation: This plot visualizes customer segmentation into clusters based on spending and frequency.
# Insight: Helps identify high-value customers and tailor loyalty programs.

# ----------------------------
# 5. Actionable Insights
# ----------------------------

# Category Performance Analysis: total sales per store location and category.
# Location/category combinations with no sales are filled with 0 instead of
# NaN. In the simulated store table each location sells a single category,
# so without the fill every row has a NaN and the ratio below is NaN
# everywhere.
category_performance = (
    merged_data
    .groupby(['Location_store', 'Category'])['Total_Sales']
    .sum()
    .unstack(fill_value=0)
)

# Electronics-to-Apparel sales ratio per location. Where a location has no
# Apparel sales the ratio is undefined, so the zero denominator is masked to
# NaN rather than producing inf.
apparel_sales = category_performance['Apparel']
category_performance['Electronics_Apparel_Ratio'] = (
    category_performance['Electronics'] / apparel_sales.where(apparel_sales != 0)
)

# The heading starts with a newline escape; a literal line break inside the
# quotes is a syntax error.
print("\nCategory Performance Analysis:")
print(category_performance)

# Explanation: Displays sales performance by store location and category (Electronics vs. Apparel).
# Insight: Helps identify which categories perform better in different locations.

# Temperature Impact Analysis: mean transaction value per temperature band.
# The observed temperature range is split into 5 equal-width intervals.
merged_data['Temp_Bucket'] = pd.cut(merged_data['Temperature'], bins=5)

# Temp_Bucket is categorical. `observed=False` is stated explicitly so every
# bucket is reported (empty ones as NaN) and the result does not depend on
# the pandas version's default for categorical groupers.
weather_impact = merged_data.groupby('Temp_Bucket', observed=False)['Total_Sales'].mean()

# The heading starts with a newline escape; a literal line break inside the
# quotes is a syntax error.
print("\nTemperature Impact Analysis:")
print(weather_impact)

# Explanation: Analyzes sales trends under varying temperature conditions.
# Insight: Helps forecast sales during different weather conditions.
