# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sys
import os

# === Configuration ===
# All paths are relative, so they resolve against the current working directory.
INPUT_FILE = "climate_action_data.csv"  # raw CSV that main() passes to load_dataset()
OUTPUT_FILE = "cleaned_climate_action_data.csv"  # destination main() passes to save_clean_data()
PLOT_DIR = "plots"  # directory analyze_data() saves its PNG figures into

# Ensure plot directory exists
# (runs at module load; exist_ok=True makes this a no-op when it is already there)
os.makedirs(PLOT_DIR, exist_ok=True)


def load_dataset(file_path):
    """Load the CSV at *file_path* into a DataFrame, exiting on failure.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        SystemExit: With status 1 when the file does not exist, is empty,
            or cannot be parsed as CSV.
    """
    # Keep the try body to the single call that can fail so unrelated
    # errors are never misreported as a load problem.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"[ERROR] File not found: {file_path}")
        sys.exit(1)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # An empty or malformed file gets the same friendly exit as a
        # missing one instead of an unhandled pandas traceback.
        print(f"[ERROR] Could not parse '{file_path}': {exc}")
        sys.exit(1)

    print("[INFO] Dataset loaded successfully.")
    return df


def clean_data(df):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Treat the literal string ``'error'`` as missing.
      3. Drop every row that has a missing value in any column.
      4. If a ``'Date'`` column exists, parse it to datetime and drop rows
         whose date cannot be parsed.
      5. Convert the numerical columns with ``pd.to_numeric`` and drop rows
         where any of them is not numeric.

    Args:
        df: Raw DataFrame; must contain all numerical columns listed below.

    Returns:
        A new DataFrame containing only the rows that survived cleaning.

    Raises:
        KeyError: If one or more of the required numerical columns is
            absent from *df*.
    """
    print("\n[INFO] Cleaning data...")

    numerical_cols = [
        'Soil_Moisture(%)',
        'Soil_pH',
        'Temperature(C)',
        'Humidity(%)',
        'Fertilizer_Recommended(kg/ha)',
        'Irrigation_Recommended(mm)',
    ]

    # Fail fast with one message naming every absent column, rather than a
    # bare KeyError for the first one halfway through the cleaning.
    missing = [col for col in numerical_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required numerical column(s): {missing}")

    # .copy() detaches the filtered result from its parent frame so the
    # column assignments below do not raise SettingWithCopyWarning.
    df = df.drop_duplicates().replace('error', np.nan).dropna().copy()

    # Convert 'Date' to datetime; unparseable values become NaT and are dropped
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.dropna(subset=['Date']).copy()

    # Convert numerical columns; non-numeric values become NaN and are dropped
    for col in numerical_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.dropna(subset=numerical_cols)

    print(f"[INFO] Data cleaned. Remaining records: {len(df)}")
    return df


def analyze_data(df):
    """Print summary statistics, save two plots, and compute crop aggregates.

    Writes ``histograms.png`` and ``correlation_heatmap.png`` into
    ``PLOT_DIR`` and prints every intermediate result.

    Args:
        df: Cleaned DataFrame containing the six numerical columns listed
            below and, optionally, a ``'Crop_Type'`` column.

    Returns:
        A 3-tuple ``(fertilizer_corr, avg_soil_moisture,
        avg_irrigation_high_temp)``:
          * correlation of each numerical column with the fertilizer
            column, sorted descending;
          * mean soil moisture per crop type, sorted descending;
          * mean recommended irrigation per crop type for rows with a
            temperature above 30, sorted descending.
        The last two are ``None`` when *df* has no ``'Crop_Type'`` column.
    """
    print("\n[INFO] Running analysis...")

    numerical_cols = [
        'Soil_Moisture(%)',
        'Soil_pH',
        'Temperature(C)',
        'Humidity(%)',
        'Fertilizer_Recommended(kg/ha)',
        'Irrigation_Recommended(mm)'
    ]
    # One numeric sub-frame feeds the stats, the histograms and the heatmap.
    numeric_view = df[numerical_cols]

    # --- Descriptive stats ---
    print("\n[INFO] Descriptive Statistics:")
    print(numeric_view.describe())

    # --- Histograms (one subplot per column) ---
    numeric_view.hist(bins=15, figsize=(15, 10), color='skyblue', edgecolor='black')
    plt.suptitle("Histograms of Numerical Columns", fontsize=16)
    plt.tight_layout()
    plt.savefig(f"{PLOT_DIR}/histograms.png")
    plt.close()

    # --- Correlation heatmap ---
    corr = numeric_view.corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title("Correlation Heatmap of Numerical Variables")
    plt.tight_layout()
    plt.savefig(f"{PLOT_DIR}/correlation_heatmap.png")
    plt.close()

    # --- Correlation of every variable with the fertilizer recommendation ---
    fertilizer_corr = corr['Fertilizer_Recommended(kg/ha)'].sort_values(ascending=False)
    print("\n[INFO] Correlation with Fertilizer Recommended (kg/ha):")
    print(fertilizer_corr)

    # Without crop labels there is nothing to group by, so stop here.
    if 'Crop_Type' not in df.columns:
        print("[WARNING] 'Crop_Type' column not found. Skipping crop-specific analysis.")
        return fertilizer_corr, None, None

    # --- Mean soil moisture per crop, highest first ---
    moisture_by_crop = df.groupby('Crop_Type')['Soil_Moisture(%)'].mean()
    avg_soil_moisture = moisture_by_crop.sort_values(ascending=False)
    print("\n[INFO] Average Soil Moisture by Crop Type:")
    print(avg_soil_moisture)

    # --- Mean irrigation per crop, restricted to rows hotter than 30 ---
    is_hot = df['Temperature(C)'] > 30
    irrigation_by_crop = df[is_hot].groupby('Crop_Type')['Irrigation_Recommended(mm)'].mean()
    avg_irrigation_high_temp = irrigation_by_crop.sort_values(ascending=False)
    print("\n[INFO] Average Irrigation for High-Temperature Crops (>30°C):")
    print(avg_irrigation_high_temp)

    return fertilizer_corr, avg_soil_moisture, avg_irrigation_high_temp


def save_clean_data(df, file_path):
    """Write *df* to *file_path* as CSV, omitting the index, and report it.

    Args:
        df: DataFrame to export.
        file_path: Destination of the CSV file.
    """
    df.to_csv(path_or_buf=file_path, index=False)
    confirmation = f"[INFO] Cleaned data exported to '{file_path}'."
    print(confirmation)


def summarize_recommendations(fertilizer_corr, avg_soil_moisture, avg_irrigation_high_temp):
    """Print the closing summary of recommendations.

    Args:
        fertilizer_corr: Accepted for interface compatibility; the first
            recommendation is fixed text and does not read this value.
        avg_soil_moisture: Series indexed by crop type whose first entry
            is reported as the crop with the highest average soil
            moisture (so it should be sorted descending), or ``None``.
        avg_irrigation_high_temp: Series of average irrigation per crop
            type for high-temperature rows, or ``None``.

    Crop-specific lines are skipped when the matching Series is ``None``
    or empty.
    """
    print("\n=== Summary of Recommendations ===")
    print("1️⃣ Adjust fertilizer usage based on Soil Moisture(%) and Soil pH due to strong correlations.")

    # An empty Series has no index[0]; treat it like a missing result
    # instead of raising IndexError.
    if avg_soil_moisture is not None and not avg_soil_moisture.empty:
        print(f"2️⃣ The crop with the highest average soil moisture is: {avg_soil_moisture.index[0]}")

    # With no high-temperature rows there is nothing to recommend, so skip
    # the header rather than printing it above an empty Series.
    if avg_irrigation_high_temp is not None and not avg_irrigation_high_temp.empty:
        print("3️⃣ For crops exposed to high temperatures (>30°C), consider increasing irrigation:")
        print(avg_irrigation_high_temp.head())

    print(" Analysis complete.")


def main():
    """Run the full pipeline: load, clean, analyze, export, summarize."""
    print("🚀Climate Action Data Analysis Started")

    cleaned_df = clean_data(load_dataset(INPUT_FILE))

    # analyze_data returns (fertilizer_corr, avg_soil_moisture,
    # avg_irrigation_high_temp); unpack here so the shape is checked
    # before anything is written to disk.
    corr_result, moisture_result, irrigation_result = analyze_data(cleaned_df)

    save_clean_data(cleaned_df, OUTPUT_FILE)
    summarize_recommendations(corr_result, moisture_result, irrigation_result)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


# Cell output: ModuleNotFoundError: No module named 'matplotlib'