# Rainfall Data Exploration

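This notebook explores the raw weather data used for rainfall prediction: it loads the CSV files, summarizes the dataset, and examines rainfall distributions and correlations with the other weather variables.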

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Add the parent directory to the import path so the project-level `config`
# module can be imported from this notebook.
# NOTE(review): '..' is relative to the current working directory — presumably
# the notebook is launched from its own folder; confirm.
sys.path.append('..')
from config import Config

# Plot appearance: apply the matplotlib style sheet, then the seaborn palette.
# NOTE(review): keep this order — the style sheet presumably also sets the
# color cycle and would override the palette if applied afterwards; confirm.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load Data

In [None]:
# Load every CSV file from the raw data directory and combine them into `df`.
#
# - File names are sorted so the concatenated row order is reproducible;
#   os.listdir() returns entries in arbitrary order.
# - A missing raw data directory is treated like an empty one, so the user
#   gets the "run fetch_data.py" hint instead of a traceback.
# - Per-file frames are appended directly rather than bound to `df`, so `df`
#   only ever refers to the combined dataset that later cells check for.
data_files = []
try:
    csv_filenames = sorted(
        entry for entry in os.listdir(Config.RAW_DATA_DIR)
        if entry.endswith('.csv')
    )
except FileNotFoundError:
    csv_filenames = []

for filename in csv_filenames:
    filepath = os.path.join(Config.RAW_DATA_DIR, filename)
    data_files.append(pd.read_csv(filepath))

if data_files:
    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    df = pd.concat(data_files, ignore_index=True)
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
else:
    print("No data files found. Please run fetch_data.py first.")

## Data Overview

In [None]:
# Quick look at the dataset: schema, a sample of rows, and summary statistics.
# Skipped entirely when no dataset has been loaded into `df`.
if 'df' in locals():
    print("Dataset Info:")
    df.info()

    # Each preview is built only after its heading is printed, so headings
    # and tables appear interleaved in the cell output.
    for heading, build_view in (
        ("\nFirst few rows:", df.head),
        ("\nStatistical summary:", df.describe),
    ):
        print(heading)
        display(build_view())

## Rainfall Distribution

In [None]:
# Four-panel overview of precipitation: overall distribution, spread by
# location, spread by season, and the mean per month.
# Skipped entirely when no dataset has been loaded into `df`.
if 'df' in locals():
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Rainfall distribution. Missing values are dropped explicitly so the
    # histogram does not depend on how the plotting backend treats NaN.
    axes[0,0].hist(df['precipitation_sum'].dropna(), bins=50, alpha=0.7)
    axes[0,0].set_title('Rainfall Distribution')
    axes[0,0].set_xlabel('Precipitation (mm)')
    axes[0,0].set_ylabel('Frequency')

    # Rainfall by location
    df.boxplot(column='precipitation_sum', by='location', ax=axes[0,1])
    axes[0,1].set_title('Rainfall by Location')
    axes[0,1].set_ylabel('Precipitation (mm)')

    # Rainfall by season
    df.boxplot(column='precipitation_sum', by='season', ax=axes[1,0])
    axes[1,0].set_title('Rainfall by Season')
    axes[1,0].set_ylabel('Precipitation (mm)')

    # Monthly rainfall
    monthly_rain = df.groupby('month')['precipitation_sum'].mean()
    axes[1,1].bar(monthly_rain.index, monthly_rain.values)
    axes[1,1].set_title('Average Monthly Rainfall')
    axes[1,1].set_xlabel('Month')
    axes[1,1].set_ylabel('Average Precipitation (mm)')

    # Grouped boxplots add a figure-level "Boxplot grouped by ..." title on
    # each call; the last one (season) would otherwise caption all four
    # panels, so clear it and rely on the per-panel titles.
    fig.suptitle('')

    plt.tight_layout()
    plt.show()

## Correlation Analysis

In [None]:
# Correlation between the numeric weather variables, shown as a heatmap and
# then as a ranked list against precipitation.
# Skipped entirely when no dataset has been loaded into `df`.
if 'df' in locals():
    # Only numeric columns take part in the correlation matrix.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_frame = df[numeric_cols]
    correlation_matrix = numeric_frame.corr()

    # Heatmap of all pairwise correlations, colour scale centred on zero.
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
    )
    plt.title('Weather Variables Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Variables ranked by their correlation with rainfall, strongest first.
    precipitation_column = correlation_matrix['precipitation_sum']
    rainfall_corr = precipitation_column.sort_values(ascending=False)
    print("\nCorrelation with Rainfall:", rainfall_corr, sep="\n")