# In [None]:
# ------------------------------------------------------------
# Sales Data Correlation Study
# ------------------------------------------------------------
# Steps:
#   1. Read bigmart_sales.csv
#   2. Show summary statistics
#   3. Create visualizations:
#        • Histogram of Item_MRP
#        • Histogram of Item_Visibility
#        • Scatter plot of Item_MRP vs Item_Visibility
#        • Spearman correlation heatmap
#   4. List top 10 most correlated variable pairs
#   5. Save each plot as a PNG file, the summary statistics (plus the plot
#      file names) to eda_summary.csv, and the top pairs to top10_correlations.csv
# ------------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the sales table into a DataFrame
filename = "bigmart_sales.csv"   # relative path: resolved against the current working directory
data = pd.read_csv(filepath_or_buffer=filename)

# Step 2: Describe every column (numeric and non-numeric alike) and echo the table
print("----- Summary Statistics -----\n")
summary = data.describe(include="all")
print(summary)

# Steps 3a / 3b: one 20-bin histogram per column of interest.
# Each spec is (column, bar colour, plot title, output file, message printed
# when the column is absent). Specs are processed in order: Item_MRP first,
# then Item_Visibility.
_histogram_specs = [
    (
        'Item_MRP',
        'skyblue',
        'Histogram of Item MRP',
        "histogram_item_mrp.png",
        "\n⚠ Column 'Item_MRP' not found in dataset!",
    ),
    (
        'Item_Visibility',
        'lightgreen',
        'Histogram of Item Visibility',
        "histogram_item_visibility.png",
        "\n⚠ Column 'Item_Visibility' not found in dataset!",
    ),
]

for _column, _bar_color, _title, _out_file, _missing_msg in _histogram_specs:
    if _column not in data.columns:
        # Nothing to draw for this column; report it and move on to the next spec.
        print(_missing_msg)
        continue

    plt.figure(figsize=(7, 5))
    # Missing values are dropped before binning.
    plt.hist(data[_column].dropna(), bins=20, color=_bar_color, edgecolor='black')
    plt.title(_title)
    plt.xlabel(_column)
    plt.ylabel('Frequency')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.savefig(_out_file)
    plt.close()  # release the figure so later plots start from a clean slate

# Step 3c: Item_MRP (x) against Item_Visibility (y) as a scatter plot.
# Both columns must be present; otherwise the plot is skipped with a warning.
_scatter_columns = {'Item_MRP', 'Item_Visibility'}
if not _scatter_columns.issubset(data.columns):
    print("\n⚠ Columns 'Item_MRP' or 'Item_Visibility' not found in dataset!")
else:
    _fig, _ax = plt.subplots(figsize=(7, 5))
    _ax.scatter(
        data['Item_MRP'],
        data['Item_Visibility'],
        alpha=0.6,
        color='tomato',
        edgecolor='black',
    )
    _ax.set_title('Scatter Plot: Item_MRP vs Item_Visibility')
    _ax.set_xlabel('Item_MRP')
    _ax.set_ylabel('Item_Visibility')
    _ax.grid(True, linestyle='--', alpha=0.6)
    _fig.tight_layout()
    _fig.savefig("scatter_mrp_visibility.png")
    plt.close(_fig)  # free the figure once it is on disk

# Step 3d: Spearman Correlation Heatmap
# Only numeric columns take part in the correlation; `corr_matrix` is kept at
# module level because the later steps read it.
numeric_data = data.select_dtypes(include='number')
corr_matrix = numeric_data.corr(method='spearman')

if corr_matrix.empty:
    # Without numeric columns there is nothing to plot; warn and carry on,
    # the same way the other plots handle missing columns.
    print("\n⚠ No numeric columns found in dataset; skipping correlation heatmap!")
else:
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
    plt.title("Spearman Correlation Heatmap")
    plt.tight_layout()
    plt.savefig("heatmap_spearman_correlation.png")
    plt.close()

# Step 4: List top 10 most correlated variable pairs
# "Most correlated" means the strongest relationship in either direction, so
# pairs are ranked by the absolute value of their Spearman coefficient. The
# signed coefficient is what gets reported, so a strong negative relationship
# shows up as a negative number near the top.
_pair_columns = ['Variable 1', 'Variable 2', 'Correlation']
if corr_matrix.empty:
    # No numeric columns -> no pairs; keep the expected columns for the CSV.
    corr_pairs = pd.DataFrame(columns=_pair_columns)
else:
    corr_pairs = corr_matrix.unstack().dropna().reset_index()
    corr_pairs.columns = _pair_columns
    # Keep one row per unordered pair: the strict '<' on the names drops both
    # the self-pairs (A, A) and the mirrored duplicates (B, A).
    corr_pairs = corr_pairs[corr_pairs['Variable 1'] < corr_pairs['Variable 2']]
    # Reorder rows by |correlation|, strongest first.
    _strongest_first = (
        corr_pairs['Correlation'].abs().sort_values(ascending=False).index
    )
    corr_pairs = corr_pairs.loc[_strongest_first]
top10_corr = corr_pairs.head(10)

print("\n----- Top 10 Most Related Variable Pairs -----\n")
print(top10_corr)

# Step 5: Save summary + correlations + plot info
# Only list plot files that this run actually produced. Each condition below
# mirrors the guard on the corresponding plotting step, so a plot skipped for
# a missing column is not reported as saved.
plot_files = []
if 'Item_MRP' in data.columns:
    plot_files.append("histogram_item_mrp.png")
if 'Item_Visibility' in data.columns:
    plot_files.append("histogram_item_visibility.png")
if {'Item_MRP', 'Item_Visibility'}.issubset(data.columns):
    plot_files.append("scatter_mrp_visibility.png")
if not corr_matrix.empty:
    plot_files.append("heatmap_spearman_correlation.png")

# The plot file names go into an extra 'Plot Files' row: the first column
# holds the comma-separated list, the remaining columns are left blank.
summary_out = summary.copy()
summary_out.loc['Plot Files'] = [", ".join(plot_files)] + [""] * (len(summary_out.columns) - 1)

summary_out.to_csv("eda_summary.csv", index=True)
top10_corr.to_csv("top10_correlations.csv", index=False)

# Step 6: Display completion message
print("\n✅ All plots and summary saved successfully!")
print(" - eda_summary.csv (includes summary + plot file names)")
print(" - top10_correlations.csv (Top correlated pairs)")
for plot_file in plot_files:
    print(f" - {plot_file}")