Exploratory data analysis experiments on the Sri Lanka weather dataset.

In [None]:
# Import Libraries

# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
# Both calls change global plotting defaults for every later figure.
# NOTE(review): the seaborn theme is applied second, so it presumably overrides
# any settings it shares with the "ggplot" style — confirm the intended look.
plt.style.use("ggplot")
sns.set_theme(style="whitegrid")

# Confirmation that the import/configuration cell ran to completion.
print("Libraries imported successfully")

In [None]:
# Load the raw weather data

# Path is relative to the notebook's working directory (adjust if needed)
raw_csv_path = "../data/raw/SriLanka_Weather_Dataset_V1.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first 5 rows
df.head(5)

In [None]:
# Quick structural overview of the dataset

# Shape as a (rows, columns) tuple
frame_shape = df.shape
print("Dataset Shape:", frame_shape)

# Column labels, printed on the line below their heading
print("\nColumn Names:", df.columns, sep="\n")

# Per-column dtype and non-null summary
print("\nData Information:")
df.info()

In [None]:
# Summary statistics

# describe() with default arguments; the bare name on the last line renders the table
summary_stats = df.describe()
summary_stats

In [None]:
# Check missing values

# Boolean mask of null entries (isna is the pandas alias of isnull),
# summed column-wise to give a per-column count of missing entries
null_mask = df.isna()
missing_values = null_mask.sum()
missing_values

In [None]:
# Missing-value heatmap

fig, ax = plt.subplots(figsize=(8, 4))
# Plot the boolean null mask directly; the colour bar is suppressed
sns.heatmap(df.isnull(), cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Histogram of the rain_sum column

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df['rain_sum'], bins=20)
ax.set(
    title="Distribution of Monthly Rainfall",
    xlabel="Rainfall (mm)",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Rainfall time series plot

# Only plot when the dataset actually has a 'date' column
if 'date' not in df.columns:
    print("No 'date' column found. Skipping time series plot.")
else:
    # Parse the column to datetime and store it back on df
    df['date'] = pd.to_datetime(df['date'])

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(df['date'], df['rain_sum'])
    ax.set(title="Rainfall Over Time", xlabel="Date", ylabel="Rainfall (mm)")
    # Tilt the date labels so they do not overlap
    ax.tick_params(axis="x", labelrotation=45)
    plt.show()

In [None]:
# Correlation Matrix

# Correlation is only defined for numeric data. Restrict the frame to numeric
# columns explicitly: on pandas >= 2.0 a bare df.corr() raises when the frame
# holds text or datetime columns (e.g. a 'date' column parsed to datetime).
numeric_df = df.select_dtypes(include="number")
corr_matrix = numeric_df.corr()

plt.figure(figsize=(10,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Previous Month Rainfall Effect (Lag Analysis)

# Create lag feature: each row receives the rain_sum value of the row directly above it.
# NOTE(review): shift(1) follows row order — this assumes the rows are in
# chronological order and form a single series; confirm, otherwise sort/group first.
df['rain_lag1'] = df['rain_sum'].shift(1)

# Drop only the rows where the lag pair itself is incomplete (the first row
# after shifting, plus any row with missing rainfall). A bare dropna() would
# also discard every row that has a gap in any unrelated column.
lag_cols = ['rain_lag1', 'rain_sum']
df_lag = df.dropna(subset=lag_cols)

# Scatter plot of the lagged value against the current value
plt.figure(figsize=(6,5))
plt.scatter(df_lag['rain_lag1'], df_lag['rain_sum'])
plt.title("Previous Month vs Current Month Rainfall")
plt.xlabel("Previous Month Rainfall (mm)")
plt.ylabel("Current Month Rainfall (mm)")
plt.show()

# Correlation value (2x2 matrix for the lag pair)
print("Correlation between previous and current rainfall:")
print(df_lag[lag_cols].corr())