In [None]:
# ------------------------------------------------------------
# EDA and Visualization for Weather Data
# ------------------------------------------------------------
# Steps:
#   1. Read weather_aus.csv
#   2. Show summary statistics
#   3. Create visualizations:
#        • Line chart of MaxTemp vs Date for one city
#        • Histogram of Rainfall
#        • Heatmap of correlations between Temp, Humidity & Pressure
#   4. Save each plot as a PNG file and write the summary statistics
#      (plus the plot file names) to eda_summary.csv
# ------------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the weather dataset
# NOTE: the path is relative, so the CSV has to be in the current working directory.
filename = "weather_aus.csv"
data = pd.read_csv(filename)

# Step 2: Print descriptive statistics
# include="all" asks describe() to report on non-numeric columns as well as
# numeric ones; the result is kept in `summary` for the CSV export further down.
print("----- Summary Statistics -----\n")
summary = data.describe(include="all")
print(summary)

# Step 3a: Line Chart of MaxTemp vs Date for one city
# Parse the Date column once; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

if 'Location' not in data.columns or 'MaxTemp' not in data.columns:
    print("\n⚠ Columns 'Location' or 'MaxTemp' not found in dataset!")
elif 'Date' not in data.columns:
    # The x-axis is read from 'Date'; without this guard the plot call below
    # would fail with a KeyError when the column is absent.
    print("\n⚠ Column 'Date' not found in dataset!")
else:
    # Strip surrounding whitespace so input such as "Sydney " still matches.
    city = input("\nEnter city name for line chart (e.g., Sydney): ").strip()
    # Case-insensitive comparison against the Location column.
    city_data = data[data['Location'].str.lower() == city.lower()]

    if not city_data.empty:
        # Rows without a usable date cannot be placed on the x-axis, and
        # sorting keeps the line running left to right even when the CSV rows
        # are not in chronological order.
        city_data = city_data.dropna(subset=['Date']).sort_values('Date')

        # The city name is raw user input: keep only filename-safe characters
        # so path separators or other special characters cannot end up in the
        # output path. Plain names such as "Sydney" are unchanged.
        safe_city = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in city
        )

        plt.figure(figsize=(10, 5))
        plt.plot(city_data['Date'], city_data['MaxTemp'], color='tomato', marker='o', markersize=3)
        plt.title(f"Max Temperature vs Date ({city})")
        plt.xlabel("Date")
        plt.ylabel("Max Temperature (°C)")
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.tight_layout()
        plt.savefig(f"linechart_maxtemp_{safe_city}.png")
        # Close the figure so it does not leak into the next plot.
        plt.close()
    else:
        print(f"\n⚠ No data found for city '{city}'!")

# Step 3b: Histogram of Rainfall
# Skips the plot (with a warning) when the dataset has no 'Rainfall' column.
if 'Rainfall' not in data.columns:
    print("\n⚠ Column 'Rainfall' not found in dataset!")
else:
    # Missing readings are excluded before binning.
    rain_values = data['Rainfall'].dropna()

    rain_fig, rain_ax = plt.subplots(figsize=(7, 5))
    rain_ax.hist(rain_values, bins=20, color='skyblue', edgecolor='black')
    rain_ax.set_title('Histogram of Rainfall')
    rain_ax.set_xlabel('Rainfall (mm)')
    rain_ax.set_ylabel('Frequency')
    rain_ax.grid(True, linestyle='--', alpha=0.6)
    rain_fig.tight_layout()
    rain_fig.savefig("histogram_rainfall.png")
    # Release the figure once it has been written to disk.
    plt.close(rain_fig)

# Step 3c: Heatmap of Correlations (Temperature, Humidity & Pressure)
cols = ['MaxTemp', 'MinTemp', 'Humidity3pm', 'Pressure9am', 'Pressure3pm']
# Keep only the candidate columns that are present AND numeric. Correlation is
# only defined for numeric data, so a column that was read as text (for
# example because of stray strings in the CSV) is excluded here rather than
# being passed on to corr().
existing_cols = [
    c for c in cols
    if c in data.columns and pd.api.types.is_numeric_dtype(data[c])
]

# Require at least three usable columns before drawing the heatmap.
if len(existing_cols) >= 3:
    plt.figure(figsize=(8, 6))
    # annot/fmt print each coefficient in its cell with two decimals.
    sns.heatmap(data[existing_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f", square=True)
    plt.title("Heatmap: Temperature, Humidity & Pressure Correlation")
    plt.tight_layout()
    plt.savefig("heatmap_weather_correlation.png")
    # Close the figure so it does not leak into any later plot.
    plt.close()
else:
    print("\n⚠ Not enough numeric columns found for correlation heatmap!")

# Step 4: Write the summary table to CSV with one extra row naming the plot files
# Work on a copy so the `summary` table printed earlier is left untouched.
summary_out = summary.copy()

# NOTE(review): "<city>" is written out as a literal placeholder, and the row
# lists all three files whether or not each plot was actually produced.
plot_files_note = "linechart_maxtemp_<city>.png, histogram_rainfall.png, heatmap_weather_correlation.png"

# The new row needs one value per column: the file list goes into the first
# column and every remaining column receives an empty string.
blank_cells = [""] * (summary_out.shape[1] - 1)
summary_out.loc['Plot Files'] = [plot_files_note, *blank_cells]

summary_out.to_csv("eda_summary.csv")

# Step 5: Report the output files on the console
# NOTE(review): these lines are printed unconditionally, even if one of the
# plot steps above was skipped with a warning.
completion_lines = (
    "\n✅ All plots and summary saved successfully!",
    " - eda_summary.csv (includes summary + plot file names)",
    " - linechart_maxtemp_<city>.png",
    " - histogram_rainfall.png",
    " - heatmap_weather_correlation.png",
)
for line in completion_lines:
    print(line)