In [None]:
# ------------------------------------------------------------
# EDA and Visualization for Recipe (Beer) Data
# ------------------------------------------------------------
# Steps:
#   1. Read recipe_data.csv
#   2. Show summary statistics
#   3. Create visualizations:
#        • Histogram of ABV
#        • Box plot of IBU by Style
#        • Bar chart of top 10 Beer Styles
#   4. Save each plot as a PNG file, and write the summary plus the list of
#      plot file names to a single CSV (eda_summary.csv)
# ------------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Load the recipe dataset into a DataFrame
filename = "recipe_data.csv"  # relative path: resolved against the current working directory
data = pd.read_csv(filepath_or_buffer=filename)

# Step 2: Descriptive statistics
# include='all' makes describe() report on every column, numeric or not,
# so the table mixes count/mean/std rows with unique/top/freq rows.
print("----- Summary Statistics -----\n")
summary = data.describe(include='all')
print(summary)

# Persist the full statistics table (this creates or overwrites the file)
summary.to_csv(path_or_buf="eda_summary.csv")

# Step 3a: Histogram of ABV (Alcohol By Volume)
# Guarded like the other plots: a dataset without an 'ABV' column produces a
# warning instead of a KeyError that would abort all remaining steps.
if 'ABV' in data.columns:
    plt.figure(figsize=(7, 5))
    # Missing ABV values are dropped so they do not reach plt.hist.
    plt.hist(data['ABV'].dropna(), bins=20, color='lightblue', edgecolor='black')
    plt.title("Histogram of ABV (Alcohol By Volume)")
    plt.xlabel("ABV (%)")
    plt.ylabel("Frequency")
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.savefig("histogram_abv.png")
    plt.close()
else:
    print("\n⚠ Column 'ABV' not found in dataset!")

# Step 3b: Box Plot of IBU by Style
if 'IBU' in data.columns and 'Style' in data.columns:
    # DataFrame.boxplot(by=...) creates its own figure unless it is handed an
    # Axes. Creating the figure here and passing ax= ensures the requested
    # size is actually used and no empty, never-closed figure is left behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    data.boxplot(column='IBU', by='Style', grid=False, ax=ax)
    ax.set_title("Box Plot: IBU by Style")
    fig.suptitle("")  # remove the "Boxplot grouped by ..." title pandas adds
    ax.set_xlabel("Beer Style")
    ax.set_ylabel("IBU (Bitterness)")
    plt.xticks(rotation=45, ha='right')  # acts on ax, the current axes
    fig.tight_layout()
    fig.savefig("boxplot_ibu_style.png")
    plt.close(fig)
else:
    print("\n⚠ Columns 'IBU' or 'Style' not found in dataset!")

# Step 3c: Bar Chart of Top 10 Beer Styles
if 'Style' not in data.columns:
    print("\n⚠ Column 'Style' not found in dataset!")
else:
    # value_counts() sorts by descending frequency, so the first ten rows
    # are the ten most common styles.
    top_styles = data['Style'].value_counts().head(10)

    fig, ax = plt.subplots(figsize=(8, 5))
    top_styles.plot.bar(ax=ax, color='orange', edgecolor='black')
    ax.set_title("Top 10 Beer Styles")
    ax.set_xlabel("Style")
    ax.set_ylabel("Count")
    # Slant the style names so long labels do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    fig.savefig("barchart_top10_styles.png")
    plt.close(fig)

# Step 4: Log summary of plots into the same CSV
# A plot is only produced when the columns it needs are present, so report
# just those files instead of listing outputs that were never written.
plot_requirements = {
    "histogram_abv.png": ['ABV'],
    "boxplot_ibu_style.png": ['IBU', 'Style'],
    "barchart_top10_styles.png": ['Style'],
}
saved_plots = [
    plot_file
    for plot_file, required_columns in plot_requirements.items()
    if all(column in data.columns for column in required_columns)
]

# Append mode keeps the statistics table already written to this file.
# The encoding is given explicitly to match the UTF-8 default of DataFrame.to_csv.
with open("eda_summary.csv", "a", encoding="utf-8") as f:
    f.write("\n\n--- Visualization Files ---\n")
    f.writelines(f"{plot_file}\n" for plot_file in saved_plots)

print("\n✅ All outputs saved successfully:")
print(" - eda_summary.csv (includes summary + plot info)")
for plot_file in saved_plots:
    print(f" - {plot_file}")