# In [None]:
# ------------------------------------------------------------
# EDA and Visualization for Income Data
# ------------------------------------------------------------
# Steps:
#   1. Read adult_income.csv
#   2. Show summary statistics
#   3. Create visualizations:
#        • Histogram of Age
#        • Box plot of Hours-per-week by Income
#        • Bar chart of Occupation counts
#   4. Save all plots and summary statistics to files
# ------------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Load the dataset into a DataFrame
filename = "adult_income.csv"   # relative path: the CSV must be in the working directory
data = pd.read_csv(filepath_or_buffer=filename)

# Step 2: Compute and display descriptive statistics
# include="all" makes describe() report on every column, not only numeric ones.
print("----- Summary Statistics -----\n")
summary = data.describe(include="all")
print(summary)

# Persist the same statistics table to disk as CSV
summary.to_csv(path_or_buf="eda_summary.csv")
print("\n✅ Summary statistics saved to 'eda_summary.csv'")

# Step 3a: Histogram of Age
# Guard on the column the same way the box plot and bar chart steps do, so a
# dataset without 'age' yields a warning instead of a KeyError that would
# abort every remaining step of the script.
if 'age' in data.columns:
    # Keep explicit figure/axes handles so each call targets this plot,
    # regardless of which figure pyplot currently treats as active.
    fig, ax = plt.subplots(figsize=(7, 5))
    ax.hist(data['age'], bins=20, color='skyblue', edgecolor='black')
    ax.set_title('Histogram of Age')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout()
    fig.savefig("histogram_age.png")
    plt.close(fig)  # free the figure once it has been written to disk
else:
    print("\n⚠ Column 'age' not found in dataset!")

# Step 3b: Box Plot of Hours-per-week by Income
if 'hours-per-week' in data.columns and 'income' in data.columns:
    # DataFrame.boxplot(by=...) creates a figure of its own unless it is
    # handed an Axes. Passing ax explicitly makes the plot land on the 7x5
    # figure created here, instead of leaving that figure blank and unclosed
    # while the saved plot silently uses the default size.
    fig, ax = plt.subplots(figsize=(7, 5))
    data.boxplot(column='hours-per-week', by='income', grid=False, ax=ax)
    ax.set_title('Box Plot: Hours per Week by Income')
    fig.suptitle("")  # remove the automatic "Boxplot grouped by ..." heading
    ax.set_xlabel('Income Category')
    ax.set_ylabel('Hours per Week')
    fig.tight_layout()
    fig.savefig("boxplot_hours_income.png")
    plt.close(fig)  # close exactly the figure that was drawn and saved
else:
    print("\n⚠ Columns 'hours-per-week' or 'income' not found in dataset!")

# Step 3c: Bar Chart of Occupation Counts
# Handle the missing-column case first; otherwise draw one bar per distinct
# occupation value, in the order value_counts() returns them.
if 'occupation' not in data.columns:
    print("\n⚠ Column 'occupation' not found in dataset!")
else:
    fig = plt.figure(figsize=(10, 6))
    occupation_counts = data['occupation'].value_counts()
    # Series.plot draws onto the current figure and returns its Axes.
    ax = occupation_counts.plot(kind='bar', color='lightgreen', edgecolor='black')
    ax.set_title('Occupation Counts')
    ax.set_xlabel('Occupation')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45, ha='right')  # slant the category labels
    fig.tight_layout()
    fig.savefig("barchart_occupation.png")
    plt.close(fig)

# Step 4: Display completion message
# Report only the plots that can actually have been written: a plot is
# produced only when every column it depends on exists in the dataset, so
# announcing all three files unconditionally would be misleading.
plot_requirements = [
    ("histogram_age.png", ("age",)),
    ("boxplot_hours_income.png", ("hours-per-week", "income")),
    ("barchart_occupation.png", ("occupation",)),
]
saved_plots = []
skipped_plots = []
for plot_file, needed_columns in plot_requirements:
    if all(column in data.columns for column in needed_columns):
        saved_plots.append(plot_file)
    else:
        skipped_plots.append(plot_file)

if saved_plots:
    # Keep the original wording when nothing was skipped.
    if skipped_plots:
        print("\n✅ Plots saved successfully:")
    else:
        print("\n✅ All plots saved successfully:")
    for plot_file in saved_plots:
        print(" - " + plot_file)

if skipped_plots:
    print("\n⚠ Plots skipped because required columns are missing:")
    for plot_file in skipped_plots:
        print(" - " + plot_file)