In [None]:
# Imports and configuration
import os, sqlite3, warnings
from datetime import timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # type: ignore
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Paths. The defaults are the original notebook's locations; set the
# WEATHER_CSV_PATH / WEATHER_OUTDIR environment variables to point the notebook
# at other locations without editing the code.
CSV_PATH = os.environ.get(
    "WEATHER_CSV_PATH",
    r"C:\Users\hp\OneDrive\Desktop\Weather Prediction Model\weather_data.csv",
)
OUTDIR = os.environ.get("WEATHER_OUTDIR", "/mnt/data")
DB_PATH = os.path.join(OUTDIR, "weather_weather.db")
PRED_CSV_PATH = os.path.join(OUTDIR, "weather_predictions.csv")

print('CSV_PATH =', CSV_PATH)
print('Outputs will be saved to:', OUTDIR)

In [None]:
# Read the input CSV, stopping early with a clear message when the file is missing
csv_is_present = os.path.exists(CSV_PATH)
if not csv_is_present:
    raise FileNotFoundError(f'CSV not found at {CSV_PATH}. Upload your CSV or change CSV_PATH.')

df = pd.read_csv(CSV_PATH)
print('Loaded CSV with shape:', df.shape)
# Rich preview of the first rows
display(df.head())

# helper to detect columns
def find_date_column(df):
    """Return the label of the column that holds dates, or None if there is none.

    Detection order:
      1. the first column whose name (case-insensitive, surrounding whitespace
         ignored) is one of 'date', 'datetime', 'day', 'timestamp';
      2. otherwise the first column, in column order, that either already has a
         datetime dtype or is non-numeric and parses with ``pd.to_datetime``
         to at least one non-null value.

    Numeric and boolean columns are skipped in step 2: ``pd.to_datetime``
    accepts plain numbers (it reads them as offsets from the epoch), so without
    this guard a temperature or rainfall column would be reported as the date.
    """
    known_names = ('date', 'datetime', 'day', 'timestamp')
    for col in df.columns:
        # str() keeps non-string labels (e.g. integer headers) from crashing .lower()
        if str(col).strip().lower() in known_names:
            return col

    for col in df.columns:
        series = df[col]
        if pd.api.types.is_datetime64_any_dtype(series):
            return col
        if pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
            continue
        try:
            # Parsing free text can emit format-inference warnings; they are noise here
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                parsed = pd.to_datetime(series)
        except Exception:
            continue
        if parsed.notna().any():
            return col
    return None

def find_temp_and_rain_columns(df):
    """Return ``(temp_col, rain_col)`` detected from the column names.

    ``temp_col`` is the first column whose lower-cased name contains 'temp';
    ``rain_col`` is the first whose lower-cased name contains 'rain' or
    'precip'. Either element is None when no column matches.
    """
    lowered = [(col, col.lower()) for col in df.columns]
    temp_col = next((col for col, low in lowered if 'temp' in low), None)
    rain_col = next((col for col, low in lowered if 'rain' in low or 'precip' in low), None)
    return temp_col, rain_col

date_col = find_date_column(df)
if date_col is not None:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    # errors='coerce' turns unparseable values into NaT. sort_values() places
    # missing values last, so such rows would be treated as the most recent
    # observations downstream; drop them and report how many were removed.
    unparsed_dates = int(df[date_col].isna().sum())
    if unparsed_dates:
        print(f'Dropping {unparsed_dates} row(s) with an unparseable value in {date_col!r}.')
        df = df.dropna(subset=[date_col])
    df = df.sort_values(by=date_col).reset_index(drop=True)
else:
    # No date-like column: fabricate a daily index, one day per row from 1970-01-01
    df = df.reset_index().rename(columns={'index':'synthetic_index'})
    date_col = 'synthetic_index'
    df[date_col] = pd.to_datetime(df[date_col], unit='D', origin='1970-01-01')

temp_col, rain_col = find_temp_and_rain_columns(df)
if temp_col is None or rain_col is None:
    # map(str, ...) so that non-string column labels cannot break the error message itself
    raise ValueError('Could not detect temp or rain columns. Ensure names include "temp" and "rain"/"precip".\nFound columns: ' + ', '.join(map(str, df.columns)))

# Non-numeric entries become NaN; a row is discarded only when BOTH targets are missing
df[temp_col] = pd.to_numeric(df[temp_col], errors='coerce')
df[rain_col] = pd.to_numeric(df[rain_col], errors='coerce')
df = df.dropna(subset=[temp_col, rain_col], how='all').reset_index(drop=True)
if df.empty:
    raise ValueError('No usable rows remain after cleaning the date, temperature and rainfall columns.')

print('Using date_col =', date_col, ', temp_col =', temp_col, ', rain_col =', rain_col)
print('\nLast 5 rows:')
display(df.tail())

# Save raw table to SQLite DB (replace)
os.makedirs(OUTDIR, exist_ok=True)
df_for_db = df.copy()
if 'predicted' not in df_for_db.columns:
    # Observed rows are flagged 0 in the 'predicted' column
    df_for_db['predicted'] = 0
conn = sqlite3.connect(DB_PATH)
try:
    df_for_db.to_sql('weather', conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Always release the file handle, even when the write fails
    conn.close()
print('Saved raw data to DB at', DB_PATH)

In [None]:
# Pattern analysis (last 7 days) and feature engineering
# Summary statistics over the last seven rows of the cleaned frame
last7 = df.tail(7).copy()
print('Pattern analysis (last 7 days):')
temps = last7[temp_col].values
rains = last7[rain_col].values
for stat_label, stat_fn in (
    (' - temp_mean:', np.nanmean),
    (' - temp_median:', np.nanmedian),
    (' - temp_std:', np.nanstd),
):
    print(stat_label, float(stat_fn(temps)))

# Trend compares only the first and last temperature of the window
if temps[-1] > temps[0]:
    temp_trend = 'increasing'
elif temps[-1] < temps[0]:
    temp_trend = 'decreasing'
else:
    temp_trend = 'flat'
print(' - temp_trend:', temp_trend)
print(' - rain_total:', float(np.nansum(rains)))
print(' - rain_days:', int(np.count_nonzero(np.logical_and(~np.isnan(rains), rains > 0))))

# plots for last 7 days
for plot_col, plot_title, plot_ylabel in (
    (temp_col, 'Temperature - last 7 days', 'Temperature'),
    (rain_col, 'Rainfall - last 7 days', 'Rainfall'),
):
    plt.figure(figsize=(8, 3))
    plt.plot(last7[date_col], last7[plot_col], marker='o')
    plt.title(plot_title)
    plt.xlabel('Date')
    plt.ylabel(plot_ylabel)
    plt.tight_layout()
    plt.show()

# Feature engineering: lags and rolling features
nlags = 3
df_feat = df[[date_col, temp_col, rain_col]].copy().set_index(date_col)
target_sources = (('temp', temp_col), ('rain', rain_col))

# Lagged copies of both targets: <prefix>_lag_k holds the value k rows earlier
for lag in range(1, nlags + 1):
    for prefix, source_col in target_sources:
        df_feat[f'{prefix}_lag_{lag}'] = df_feat[source_col].shift(lag)

# 3-row rolling mean, shifted by one row so it only uses earlier rows
for prefix, source_col in target_sources:
    trailing_mean = df_feat[source_col].rolling(window=3, min_periods=1).mean()
    df_feat[f'{prefix}_roll_3'] = trailing_mean.shift(1)

# Rows with any missing value (including the first rows without full lags) are removed
df_clean = df_feat.dropna().copy()
print('\nAfter feature creation, rows available for modelling:', len(df_clean))
display(df_clean.head())

In [None]:
# Modeling and evaluation (Temperature and Rainfall separately)
# Every column of df_clean other than the two targets is used as a predictor
target_cols = (temp_col, rain_col)
feature_cols = [c for c in df_clean.columns if c not in target_cols]
X = df_clean[feature_cols].to_numpy()
y_temp = df_clean[temp_col].to_numpy()
y_rain = df_clean[rain_col].to_numpy()

def evaluate_models(X, y, n_splits=3):
    """Cross-validate three regressors with forward-chaining (time-ordered) splits.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Feature matrix. TimeSeriesSplit trains on earlier rows and tests on
        later ones, so rows are expected to be in chronological order.
    y : numpy array, shape (n_samples,)
        Target values aligned with ``X``.
    n_splits : int, default 3
        Upper bound on the number of folds. It is reduced for short series;
        when fewer than two folds are possible but at least two samples exist,
        a single hold-out of the last sample is used instead.

    Returns
    -------
    dict
        ``name -> {'mae_mean', 'rmse_mean', 'r2_mean', 'model'}``. A metric is
        NaN when it could not be computed on any fold. ``'model'`` is the
        pipeline object in whatever state the last fold left it.
    """
    def _mean_or_nan(values):
        # Mean over non-NaN entries; NaN (without a RuntimeWarning) when none exist
        arr = np.asarray(values, dtype=float)
        arr = arr[~np.isnan(arr)]
        return float(arr.mean()) if arr.size else float('nan')

    # TimeSeriesSplit rejects n_splits < 2, so only build it when two or more
    # folds fit into the data.
    n_samples = len(X)
    usable_splits = min(n_splits, n_samples - 1)
    if usable_splits >= 2:
        splits = list(TimeSeriesSplit(n_splits=usable_splits).split(X))
    elif n_samples >= 2:
        splits = [(np.arange(n_samples - 1), np.array([n_samples - 1]))]
    else:
        splits = []

    models = {
        'LinearRegression': Pipeline([('lr', LinearRegression())]),
        'DecisionTree': Pipeline([('dt', DecisionTreeRegressor(random_state=42))]),
        'SVR': Pipeline([('scaler', StandardScaler()), ('svr', SVR())])
    }
    results = {}
    for name, model in models.items():
        maes, rmses, r2s = [], [], []
        for train_idx, test_idx in splits:
            Xtr, Xte = X[train_idx], X[test_idx]
            ytr, yte = y[train_idx], y[test_idx]
            try:
                model.fit(Xtr, ytr)
                ypred = model.predict(Xte)
                mae = mean_absolute_error(yte, ypred)
                # RMSE via sqrt(MSE): the `squared=False` keyword is not accepted
                # by recent scikit-learn releases.
                rmse = float(np.sqrt(mean_squared_error(yte, ypred)))
                # R^2 is undefined for a single test sample
                r2 = r2_score(yte, ypred) if len(yte) > 1 else np.nan
            except Exception as exc:
                # Keep evaluating the remaining folds/models, but do not hide the failure
                warnings.warn(f'{name}: fold skipped because of {type(exc).__name__}: {exc}')
                mae = rmse = r2 = np.nan
            # Appended together so the three lists always stay aligned per fold
            maes.append(mae)
            rmses.append(rmse)
            r2s.append(r2)
        results[name] = {
            'mae_mean': _mean_or_nan(maes),
            'rmse_mean': _mean_or_nan(rmses),
            'r2_mean': _mean_or_nan(r2s),
            'model': model,
        }
    return results

# Cross-validate every candidate on each target and print one line per model
cv_results = {}
for heading, target_key, target_values in (
    ('Training & evaluating models for Temperature...', 'temp', y_temp),
    ('\nTraining & evaluating models for Rainfall...', 'rain', y_rain),
):
    print(heading)
    cv_results[target_key] = evaluate_models(X, target_values, n_splits=3)
    for name, res in cv_results[target_key].items():
        print(f" - {name}: MAE={res['mae_mean']:.4f}, RMSE={res['rmse_mean']:.4f}, R2={res['r2_mean']:.4f}")
temp_results = cv_results['temp']
rain_results = cv_results['rain']


def _rmse_sort_key(results):
    """Build a key function ranking model names by mean RMSE (NaN ranks as 1e9)."""
    def key(model_name):
        rmse = results[model_name]['rmse_mean']
        return 1e9 if np.isnan(rmse) else rmse
    return key


# Choose best by RMSE
best_temp_name = min(temp_results.keys(), key=_rmse_sort_key(temp_results))
best_rain_name = min(rain_results.keys(), key=_rmse_sort_key(rain_results))
best_temp_model = temp_results[best_temp_name]['model']
best_rain_model = rain_results[best_rain_name]['model']

# Fit on entire cleaned data
best_temp_model.fit(X, y_temp)
best_rain_model.fit(X, y_rain)
print('\nSelected models -> Temperature:', best_temp_name, ', Rainfall:', best_rain_name)

In [None]:
# Next-day (8th day) prediction and save outputs
# Build the feature row for the step after the last observation: the most
# recent values become the lags, the last three rows give the rolling means.
last_index = df_feat.index.max()
n_observations = len(df_feat)
next_features = {}
for lag in range(1, nlags + 1):
    lag_available = n_observations >= lag
    next_features[f'temp_lag_{lag}'] = df_feat[temp_col].iloc[-lag] if lag_available else np.nan
    next_features[f'rain_lag_{lag}'] = df_feat[rain_col].iloc[-lag] if lag_available else np.nan
if n_observations >= 1:
    next_features['temp_roll_3'] = df_feat[temp_col].tail(3).mean()
    next_features['rain_roll_3'] = df_feat[rain_col].tail(3).mean()
else:
    next_features['temp_roll_3'] = np.nan
    next_features['rain_roll_3'] = np.nan

# One row, columns in the same order the models were trained on
X_next = np.array([[next_features[c] for c in feature_cols]])
temp_pred = best_temp_model.predict(X_next)[0]
rain_pred = best_rain_model.predict(X_next)[0]

# compute predicted date (assume daily)
one_day = timedelta(days=1)
try:
    if not isinstance(last_index, pd.Timestamp):
        next_date = pd.to_datetime(df_feat.index.max()) + one_day
    elif len(df_feat.index) < 2:
        next_date = df_feat.index[-1] + one_day
    else:
        # Reuse the spacing between the last two index entries
        delta = df_feat.index[-1] - df_feat.index[-2]
        next_date = df_feat.index[-1] + delta
except Exception:
    # Any failure above falls back to "latest index value + 1 day"
    next_date = pd.to_datetime(df_feat.index.max()) + one_day

# Forecast values first, then bookkeeping columns, then the features that produced them
pred_row = {
    date_col: next_date,
    temp_col: float(temp_pred),
    rain_col: float(rain_pred),
    'predicted': 1,
    'model_temp': best_temp_name,
    'model_rain': best_rain_name,
    **next_features,
}

pred_df = pd.DataFrame([pred_row])
print('\nNext-day prediction (8th day):')
display(pred_df.rename(columns={date_col: 'date', temp_col: 'temperature', rain_col: 'rainfall'}))

# Append prediction to DB
# NOTE: this appends on every execution; re-running this cell without first
# re-running the cell that recreates the table adds a duplicate prediction row.
conn = sqlite3.connect(DB_PATH)
try:
    df_for_db = pd.read_sql_query('SELECT * FROM weather LIMIT 1;', conn)  # just to get schema
    # Align to the table schema: columns the table lacks are dropped, columns
    # the prediction lacks are filled with NaN.
    pred_to_write = pred_df.reindex(columns=df_for_db.columns)
    pred_to_write.to_sql('weather', conn, if_exists='append', index=False)
    conn.commit()
finally:
    # Always release the connection, even when the read or write fails
    conn.close()
print('Appended prediction to DB at', DB_PATH)

# Save prediction CSV
out_pred = pred_df.rename(columns={date_col: 'date', temp_col: 'temperature', rain_col: 'rainfall'})
out_pred.to_csv(PRED_CSV_PATH, index=False)
print('Saved prediction CSV to', PRED_CSV_PATH)

# Plots: actual vs predicted for last part
# The "predicted" curves are the selected models' outputs on the rows they were fitted on.
try:
    y_temp_all = best_temp_model.predict(X)
    y_rain_all = best_rain_model.predict(X)
    nplot = min(30, len(df_clean))
    idx = df_clean.index[-nplot:]
    for target_col, fitted_values, plot_title in (
        (temp_col, y_temp_all, 'Temperature: Actual vs Predicted (last points)'),
        (rain_col, y_rain_all, 'Rainfall: Actual vs Predicted (last points)'),
    ):
        plt.figure(figsize=(10, 3))
        plt.plot(idx, df_clean[target_col].iloc[-nplot:], label='actual')
        plt.plot(idx, fitted_values[-nplot:], linestyle='--', label='predicted')
        plt.title(plot_title)
        plt.legend()
        plt.tight_layout()
        plt.show()
except Exception as e:
    # Plotting is best-effort: report the problem and carry on
    print('Plotting failed:', e)

print('\nNotebook run complete. Outputs:')
print(' - Prediction CSV:', PRED_CSV_PATH)
print(' - SQLite DB:', DB_PATH)

In [None]:
# Cell 1: Import additional libraries for EDA
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

# Banner marking the start of the exploratory-data-analysis cells in the output
print("=== EXPLORATORY DATA ANALYSIS ===")

In [None]:
# Cell 2: Basic data overview and statistics
print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nBasic Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() added a stray "None" line to the output.
df.info()
print("\nDescriptive Statistics:")
display(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())

In [None]:
# Cell 3: Rainfall distribution analysis
plt.figure(figsize=(15, 5))

# Pie chart for rainfall distribution
plt.subplot(1, 3, 1)
if rain_col in df.columns:
    rainfall_counts = df[rain_col].value_counts()
    # Convert to binary (rain/no rain) if continuous
    if rainfall_counts.shape[0] > 10:  # If continuous, create binary
        # Missing readings are left out rather than being counted as "No Rain"
        df_rain_binary = (df[rain_col].dropna() > 0).astype(int)
        # value_counts() orders by frequency, so labels and colours are looked up
        # from the class value itself instead of being assumed to be [0, 1];
        # this also copes with only one class being present.
        rainfall_binary_counts = df_rain_binary.value_counts().sort_index()
        class_labels = {0: 'No Rain', 1: 'Rain'}
        class_colors = {0: 'skyblue', 1: 'lightcoral'}
        plt.pie(rainfall_binary_counts.values,
                labels=[class_labels[k] for k in rainfall_binary_counts.index],
                autopct='%1.1f%%',
                colors=[class_colors[k] for k in rainfall_binary_counts.index])
        plt.title('Rainfall Distribution (Binary)')
    else:
        # Few distinct values: show each value as its own wedge
        plt.pie(rainfall_counts.values,
                labels=rainfall_counts.index,
                autopct='%1.1f%%')
        plt.title('Rainfall Distribution')
else:
    plt.text(0.5, 0.5, 'Rain column not found', ha='center', va='center')
    plt.title('Rainfall Distribution - Data Not Available')

# Rainfall vs temperature relationship
plt.subplot(1, 3, 2)
have_both_targets = rain_col in df.columns and temp_col in df.columns
if not have_both_targets:
    plt.text(0.5, 0.5, 'Required columns not found', ha='center', va='center')
else:
    # Colour points by whether any rain was recorded (1) or not (0)
    df_eda = df.copy()
    df_eda['rain_binary'] = df_eda[rain_col].gt(0).astype(int)
    sns.scatterplot(data=df_eda, x=temp_col, y=rain_col, hue='rain_binary', alpha=0.6)
plt.title('Temperature vs Rainfall')

# Time series of rainfall
plt.subplot(1, 3, 3)
can_plot_series = all(col in df.columns for col in (rain_col, date_col))
if can_plot_series:
    plt.plot(df[date_col], df[rain_col], alpha=0.7)
    plt.xticks(rotation=45)
else:
    plt.text(0.5, 0.5, 'Required columns not found', ha='center', va='center')
plt.title('Rainfall Over Time')

# Finish the three-panel figure
plt.tight_layout()
plt.show()

In [None]:
# Cell 4: Grouped analysis by rainfall
print("Grouped Analysis by Rainfall (Binary):")

# Create binary rainfall column for analysis
df_eda = df.copy()
if rain_col in df.columns:
    df_eda['rain_binary'] = (df_eda[rain_col] > 0).astype(int)

    # Display grouped statistics
    numeric_cols = [
        col for col in df_eda.select_dtypes(include=[np.number]).columns
        if col != 'rain_binary'
    ]

    if len(numeric_cols) > 0:
        grouped_stats = df_eda.groupby('rain_binary')[numeric_cols].mean()
        print("\nMean values by rainfall status:")
        display(grouped_stats)

        # Key observations
        print("\nKey Observations:")

        def _first_numeric_match(*keywords):
            """First column whose lower-cased name contains every keyword, if it is numeric; else None."""
            matches = [c for c in df.columns if all(k in c.lower() for k in keywords)]
            return matches[0] if matches and matches[0] in numeric_cols else None

        comparisons = [
            ('Temperature', temp_col if temp_col in numeric_cols else None),
            ('Humidity', _first_numeric_match('humid')),
            ('Cloud cover', _first_numeric_match('cloud')),
            ('Wind speed', _first_numeric_match('wind', 'speed')),
        ]
        if {0, 1}.issubset(grouped_stats.index):
            # Difference is (mean on rain days) - (mean on dry days)
            for label, col in comparisons:
                if col is None:
                    continue
                difference = grouped_stats.loc[1, col] - grouped_stats.loc[0, col]
                print(f"- {label} difference (Rain vs No Rain): {difference:.2f}")
        else:
            # With a single class there is nothing to subtract; the previous
            # diff()-based code printed "nan" here.
            print("- Only one rainfall class is present; Rain vs No Rain differences are not available")
else:
    print("Rain column not available for grouped analysis")

In [None]:
# Cell 5: Distribution plots for all numeric features
print("Distribution of Numeric Features:")

# Numeric columns, leaving out the date column and the two target columns
excluded_cols = (date_col, temp_col, rain_col)
numeric_features = [
    col for col in df.select_dtypes(include=[np.number]).columns.tolist()
    if col not in excluded_cols
]

print(f"Numeric features to analyze: {numeric_features}")

if not numeric_features:
    print("No numeric features found for distribution analysis")
else:
    # Grid with four plots per row; the row count is the ceiling of n_features / n_cols
    n_features = len(numeric_features)
    n_cols = 4
    n_rows = -(-n_features // n_cols)

    plt.figure(figsize=(15, 4*n_rows))

    for i, feature in enumerate(numeric_features, start=1):
        plt.subplot(n_rows, n_cols, i)
        sns.histplot(df[feature], kde=True, bins=30)
        plt.title(f'Distribution of {feature}')
        plt.xlabel(feature)
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
# Cell 6: Box plots for outlier detection
# Uses numeric_features, n_rows and n_cols as left by the distribution-plot cell.
print("Box Plots for Outlier Detection:")

if not numeric_features:
    print("No numeric features found for box plot analysis")
else:
    plt.figure(figsize=(15, 4*n_rows))

    for i, feature in enumerate(numeric_features, start=1):
        plt.subplot(n_rows, n_cols, i)
        sns.boxplot(y=df[feature])
        plt.title(f'Box Plot of {feature}')
        plt.ylabel(feature)

    plt.tight_layout()
    plt.show()

    # Outlier analysis
    print("\nOutlier Analysis (using IQR method):")
    for feature in numeric_features:
        quartiles = df[feature].quantile([0.25, 0.75])
        Q1, Q3 = quartiles.iloc[0], quartiles.iloc[1]
        IQR = Q3 - Q1
        # Values more than 1.5 * IQR outside the quartiles count as outliers
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        is_outlier = (df[feature] < lower_bound) | (df[feature] > upper_bound)
        outliers = df[is_outlier]
        print(f"{feature}: {len(outliers)} outliers ({len(outliers)/len(df)*100:.1f}%)")

In [None]:
# Cell 7: Correlation analysis
print("Correlation Analysis:")

# Targets first, then the remaining numeric features (only columns present in df)
corr_columns = [
    col for col in [temp_col, rain_col, *numeric_features]
    if col in df.columns
]

if len(corr_columns) <= 1:
    print("Not enough numeric columns for correlation analysis")
else:
    correlation_matrix = df[corr_columns].corr()

    # Heatmap of the lower triangle only (upper triangle incl. diagonal is masked)
    plt.figure(figsize=(12, 10))
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    heatmap_options = dict(
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        fmt='.2f',
        cbar_kws={'shrink': 0.8},
    )
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Feature Correlation Heatmap')
    plt.tight_layout()
    plt.show()

    # Every unordered pair of columns whose absolute correlation exceeds 0.8
    print("\nHighly Correlated Features (|r| > 0.8):")
    corr_names = correlation_matrix.columns
    high_corr_pairs = [
        (corr_names[row], corr_names[col], correlation_matrix.iloc[row, col])
        for row in range(len(corr_names))
        for col in range(row + 1, len(corr_names))
        if abs(correlation_matrix.iloc[row, col]) > 0.8
    ]

    if not high_corr_pairs:
        print("No highly correlated feature pairs found (|r| > 0.8)")
    else:
        for feature1, feature2, corr_value in high_corr_pairs:
            print(f"- {feature1} vs {feature2}: {corr_value:.3f}")

In [None]:
# Cell 8: Time series analysis
print("Time Series Analysis:")

if date_col not in df.columns:
    print("Date column not available for time series analysis")
else:
    # Index by date so both series share the same x-axis values
    df_time = df.set_index(date_col)

    # Temperature (top) and rainfall (bottom) over the whole record
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 8))

    if temp_col in df.columns:
        ax1.plot(df_time.index, df_time[temp_col], color='red', alpha=0.7)
        ax1.set(ylabel='Temperature', title='Temperature Over Time')
        ax1.grid(True, alpha=0.3)

    if rain_col in df.columns:
        ax2.plot(df_time.index, df_time[rain_col], color='blue', alpha=0.7)
        ax2.set(ylabel='Rainfall', title='Rainfall Over Time', xlabel='Date')
        ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Monthly means, available when the index exposes a .month attribute
    if hasattr(df_time.index, 'month'):
        print("\nMonthly Patterns:")
        numeric_columns = df_time.select_dtypes(include='number').columns
        monthly_analysis = df_time.groupby(df_time.index.month)[numeric_columns].mean()
        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        # Replace month numbers 1..12 by their abbreviations
        monthly_analysis.index = monthly_analysis.index.map(dict(enumerate(month_names, start=1)))

        if temp_col in monthly_analysis.columns:
            plt.figure(figsize=(10, 4))
            temp_axis = plt.subplot(1, 2, 1)
            temp_axis.plot(monthly_analysis.index, monthly_analysis[temp_col], marker='o')
            temp_axis.set_title('Average Temperature by Month')
            temp_axis.set_xlabel('Month')
            temp_axis.set_ylabel('Temperature')
            temp_axis.grid(True, alpha=0.3)

        if rain_col in monthly_analysis.columns:
            rain_axis = plt.subplot(1, 2, 2)
            rain_axis.plot(monthly_analysis.index, monthly_analysis[rain_col], marker='o', color='green')
            rain_axis.set_title('Average Rainfall by Month')
            rain_axis.set_xlabel('Month')
            rain_axis.set_ylabel('Rainfall')
            rain_axis.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

In [None]:
# Cell 9: Data quality assessment
print("Data Quality Assessment:")

# Skewness of each numeric feature, bucketed by magnitude
print("\nSkewness of Numeric Features:")
for feature in numeric_features:
    if feature not in df.columns:
        continue
    skewness = df[feature].skew()
    if abs(skewness) > 1:
        skew_label = 'Highly skewed'
    elif abs(skewness) > 0.5:
        skew_label = 'Moderate'
    else:
        skew_label = 'Fairly symmetric'
    print(f"{feature}: {skewness:.3f} ({skew_label})")

# Columns holding at most one distinct non-null value
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]

if not constant_cols:
    print("\nNo constant columns found")
else:
    print(f"\nConstant columns (may consider removing): {constant_cols}")

print("\nEDA Complete! Proceeding to modeling...")

In [None]:
# Interactive inspection cell. NOTE(review): bare `whos` presumably relies on IPython
# resolving it to the %whos magic (namespace listing) - confirm, or remove before sharing.
whos

In [None]:
# Interactive inspection cell: shows the column labels of df as the cell output
df.columns

In [None]:
# Interactive inspection cell: shows the dtype of each df column as the cell output
df.dtypes

In [None]:
# Enhanced model evaluation function
def comprehensive_evaluate_models(X, y, target_name, n_splits=3):
    """Cross-validate three regressors and keep per-fold predictions.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        Feature matrix. TimeSeriesSplit trains on earlier rows and tests on
        later ones, so rows are expected to be in chronological order.
    y : numpy array, shape (n_samples,)
        Target values aligned with ``X``.
    target_name : str
        Label of the target being evaluated. Accepted for call compatibility;
        it does not influence the computation.
    n_splits : int, default 3
        Upper bound on the number of folds. It is reduced for short series;
        when fewer than two folds are possible but at least two samples exist,
        a single hold-out of the last sample is used instead.

    Returns
    -------
    (results, all_predictions)
        ``results`` maps model name to a dict with 'mae_mean', 'mae_std',
        'rmse_mean', 'rmse_std', 'r2_mean', 'r2_std', 'mape_mean' (NaN when not
        computable on any fold) and 'model' (the pipeline as the last fold left
        it). ``all_predictions`` maps model name to a list of
        ``{'fold', 'actual', 'predicted', 'error'}`` records, where
        ``error = actual - predicted``.
    """
    def _nan_aware(stat_fn, values):
        # Apply stat_fn to the non-NaN entries; NaN (without a RuntimeWarning) when none exist
        arr = np.asarray(values, dtype=float)
        arr = arr[~np.isnan(arr)]
        return float(stat_fn(arr)) if arr.size else float('nan')

    # TimeSeriesSplit rejects n_splits < 2, so only build it when two or more
    # folds fit into the data.
    n_samples = len(X)
    usable_splits = min(n_splits, n_samples - 1)
    if usable_splits >= 2:
        splits = list(TimeSeriesSplit(n_splits=usable_splits).split(X))
    elif n_samples >= 2:
        splits = [(np.arange(n_samples - 1), np.array([n_samples - 1]))]
    else:
        splits = []

    models = {
        'LinearRegression': Pipeline([('lr', LinearRegression())]),
        'DecisionTree': Pipeline([('dt', DecisionTreeRegressor(random_state=42))]),
        'SVR': Pipeline([('scaler', StandardScaler()), ('svr', SVR())])
    }

    results = {}
    all_predictions = {}

    for name, model in models.items():
        maes, rmses, r2s, mapes = [], [], [], []
        fold_predictions = []

        for fold, (train_idx, test_idx) in enumerate(splits):
            Xtr, Xte = X[train_idx], X[test_idx]
            ytr, yte = y[train_idx], y[test_idx]

            try:
                model.fit(Xtr, ytr)
                ypred = model.predict(Xte)

                # Calculate metrics
                mae = mean_absolute_error(yte, ypred)
                # RMSE via sqrt(MSE): the `squared=False` keyword is not accepted
                # by recent scikit-learn releases.
                rmse = float(np.sqrt(mean_squared_error(yte, ypred)))
                # R^2 is undefined for a single test sample
                r2 = r2_score(yte, ypred) if len(yte) > 1 else np.nan
                # MAPE divides by the actual value, so it is skipped when any actual is 0
                mape = np.mean(np.abs((yte - ypred) / yte)) * 100 if np.all(yte != 0) else np.nan
            except Exception as e:
                print(f"Error in {name}, fold {fold}: {e}")
                maes.append(np.nan); rmses.append(np.nan); r2s.append(np.nan); mapes.append(np.nan)
                continue

            maes.append(mae)
            rmses.append(rmse)
            r2s.append(r2)
            mapes.append(mape)

            # Store predictions for this fold
            fold_predictions.extend(
                {'fold': fold, 'actual': true, 'predicted': pred, 'error': true - pred}
                for true, pred in zip(yte, ypred)
            )

        results[name] = {
            'mae_mean': _nan_aware(np.mean, maes),
            'mae_std': _nan_aware(np.std, maes),
            'rmse_mean': _nan_aware(np.mean, rmses),
            'rmse_std': _nan_aware(np.std, rmses),
            'r2_mean': _nan_aware(np.mean, r2s),
            'r2_std': _nan_aware(np.std, r2s),
            'mape_mean': _nan_aware(np.mean, mapes),
            'model': model
        }
        all_predictions[name] = fold_predictions

    return results, all_predictions

# FIRST: Run the comprehensive evaluation to get results AND predictions
# Same report for both targets: mean ± std of each cross-validated metric per model
cv_runs = {}
for banner, target_label, target_values in (
    ("Comprehensive evaluation for Temperature...", "Temperature", y_temp),
    ("\nComprehensive evaluation for Rainfall...", "Rainfall", y_rain),
):
    print(banner)
    cv_runs[target_label] = comprehensive_evaluate_models(X, target_values, target_label)
    for name, res in cv_runs[target_label][0].items():
        print(f" - {name}: MAE={res['mae_mean']:.4f}±{res['mae_std']:.4f}, "
              f"RMSE={res['rmse_mean']:.4f}±{res['rmse_std']:.4f}, "
              f"R²={res['r2_mean']:.4f}±{res['r2_std']:.4f}, "
              f"MAPE={res['mape_mean']:.2f}%")

temp_results, temp_predictions = cv_runs["Temperature"]
rain_results, rain_predictions = cv_runs["Rainfall"]

In [None]:
# FIXED MODEL EVALUATION WITH PLOTS
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set up matplotlib for Jupyter
%matplotlib inline

print("=== FIXED MODEL EVALUATION ===")

# 1. Basic Model Performance Table
table_rule = "-" * 70
print("\nMODEL PERFORMANCE SUMMARY:")
print(table_rule)
print(f"{'Model':<15} {'Target':<12} {'MAE':<8} {'RMSE':<8} {'R²':<8}")
print(table_rule)

# Evaluate each model simply
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'SVR': Pipeline([('scaler', StandardScaler()), ('svr', SVR())])
}

# NOTE(review): each model is fitted on X and scored on the same X, so these
# figures are in-sample errors rather than cross-validated ones.
row_format = "{:<15} {:<12} {:<8.3f} {:<8.3f} {:<8.3f}"
for model_name, model in models.items():
    # Temperature
    temp_pred = model.fit(X, y_temp).predict(X)
    temp_mae = mean_absolute_error(y_temp, temp_pred)
    temp_rmse = np.sqrt(mean_squared_error(y_temp, temp_pred))  # RMSE as sqrt of MSE
    temp_r2 = r2_score(y_temp, temp_pred)

    # Rainfall
    rain_pred = model.fit(X, y_rain).predict(X)
    rain_mae = mean_absolute_error(y_rain, rain_pred)
    rain_rmse = np.sqrt(mean_squared_error(y_rain, rain_pred))  # RMSE as sqrt of MSE
    rain_r2 = r2_score(y_rain, rain_pred)

    print(row_format.format(model_name, 'Temperature', temp_mae, temp_rmse, temp_r2))
    print(row_format.format(model_name, 'Rainfall', rain_mae, rain_rmse, rain_r2))

# 2. Simple Actual vs Predicted Plots
print("\nCREATING BASIC PLOTS...")

# NOTE(review): from here on best_temp_model / best_rain_model are plain linear
# regressions and temp_predictions / rain_predictions are arrays of fitted values
# on X, replacing whatever those names held before this cell ran.

# Temperature predictions
best_temp_model = LinearRegression()
temp_predictions = best_temp_model.fit(X, y_temp).predict(X)

# Rainfall predictions
best_rain_model = LinearRegression()
rain_predictions = best_rain_model.fit(X, y_rain).predict(X)

# Create the plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
note_box = dict(boxstyle='round', facecolor='wheat', alpha=0.8)


def _draw_fit_diagnostics(ax_fit, ax_resid, actual, fitted, quantity, unit, color):
    """Draw actual-vs-predicted and residual panels; return (r2, mae, residuals)."""
    # Actual vs predicted, with the y = x reference line
    ax_fit.scatter(actual, fitted, alpha=0.7, color=color)
    ax_fit.plot([actual.min(), actual.max()], [actual.min(), actual.max()], 'k--', lw=2)
    ax_fit.set_xlabel(f'Actual {quantity} ({unit})')
    ax_fit.set_ylabel(f'Predicted {quantity} ({unit})')
    ax_fit.set_title(f'{quantity}: Actual vs Predicted')
    ax_fit.grid(True, alpha=0.3)

    # Metrics annotation
    r2 = r2_score(actual, fitted)
    mae = mean_absolute_error(actual, fitted)
    ax_fit.text(0.05, 0.95, f'R² = {r2:.3f}\nMAE = {mae:.3f}',
                transform=ax_fit.transAxes, verticalalignment='top',
                bbox=note_box)

    # Residuals (actual - predicted) against the predicted value
    residuals = actual - fitted
    ax_resid.scatter(fitted, residuals, alpha=0.7, color=color)
    ax_resid.axhline(y=0, color='k', linestyle='--')
    ax_resid.set_xlabel(f'Predicted {quantity} ({unit})')
    ax_resid.set_ylabel('Residuals')
    ax_resid.set_title(f'{quantity}: Residual Plot')
    ax_resid.grid(True, alpha=0.3)
    ax_resid.text(0.05, 0.95, f'Mean Residual: {np.mean(residuals):.3f}',
                  transform=ax_resid.transAxes, verticalalignment='top',
                  bbox=note_box)
    return r2, mae, residuals


temp_r2, temp_mae, temp_residuals = _draw_fit_diagnostics(
    ax1, ax2, y_temp, temp_predictions, 'Temperature', '°C', 'red')
rain_r2, rain_mae, rain_residuals = _draw_fit_diagnostics(
    ax3, ax4, y_rain, rain_predictions, 'Rainfall', 'mm', 'blue')

plt.tight_layout()
plt.show()

print("Basic plots created successfully!")

# 3. Model Comparison Bar Chart
print("\nCREATING MODEL COMPARISON CHART...")

models_mae_temp, models_rmse_temp = [], []
models_mae_rain, models_rmse_rain = [], []
model_names = []

# Refit every model on each target and record its error on the same data
for model_name, model in models.items():
    # Temperature metrics
    temp_pred = model.fit(X, y_temp).predict(X)
    temp_mae = mean_absolute_error(y_temp, temp_pred)
    temp_rmse = np.sqrt(mean_squared_error(y_temp, temp_pred))
    models_mae_temp.append(temp_mae)
    models_rmse_temp.append(temp_rmse)

    # Rainfall metrics
    rain_pred = model.fit(X, y_rain).predict(X)
    rain_mae = mean_absolute_error(y_rain, rain_pred)
    rain_rmse = np.sqrt(mean_squared_error(y_rain, rain_pred))
    models_mae_rain.append(rain_mae)
    models_rmse_rain.append(rain_rmse)

    model_names.append(model_name)

# Create comparison plot: one bar chart per (target, metric) combination
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
bar_colors = ['red', 'orange', 'green']
panels = (
    (ax1, models_mae_temp, 'MAE (Temperature °C)', 'Model Comparison - Temperature MAE'),
    (ax2, models_rmse_temp, 'RMSE (Temperature °C)', 'Model Comparison - Temperature RMSE'),
    (ax3, models_mae_rain, 'MAE (Rainfall mm)', 'Model Comparison - Rainfall MAE'),
    (ax4, models_rmse_rain, 'RMSE (Rainfall mm)', 'Model Comparison - Rainfall RMSE'),
)

drawn_bars = []
for panel_ax, panel_values, panel_ylabel, panel_title in panels:
    panel_bars = panel_ax.bar(model_names, panel_values, color=bar_colors, alpha=0.7)
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

    # Add value labels on bars
    for bar in panel_bars:
        height = bar.get_height()
        panel_ax.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.3f}', ha='center', va='bottom')
    drawn_bars.append(panel_bars)
bars1, bars2, bars3, bars4 = drawn_bars

plt.tight_layout()
plt.show()

print("Model comparison chart created successfully!")

# 4. Final Summary
banner_rule = "="*60
print("\n" + banner_rule)
print("FINAL RECOMMENDATION")
print(banner_rule)

# Position of the smallest error in each metric list (same order as model_names)
best_temp_mae_idx = np.argmin(models_mae_temp)
best_rain_mae_idx = np.argmin(models_mae_rain)
best_temp_rmse_idx = np.argmin(models_rmse_temp)
best_rain_rmse_idx = np.argmin(models_rmse_rain)

for target_label, metric_label, best_idx, metric_values, unit in (
    ('Temperature', 'MAE', best_temp_mae_idx, models_mae_temp, '°C'),
    ('Rainfall', 'MAE', best_rain_mae_idx, models_mae_rain, 'mm'),
    ('Temperature', 'RMSE', best_temp_rmse_idx, models_rmse_temp, '°C'),
    ('Rainfall', 'RMSE', best_rain_rmse_idx, models_rmse_rain, 'mm'),
):
    print(f"Best model for {target_label} ({metric_label}): {model_names[best_idx]} "
          f"({metric_label}: {metric_values[best_idx]:.3f}{unit})")


def _grade(mae, excellent_below, good_below):
    """Map a MAE to EXCELLENT / GOOD / NEEDS IMPROVEMENT using the two thresholds."""
    if mae < excellent_below:
        return "EXCELLENT"
    if mae < good_below:
        return "GOOD"
    return "NEEDS IMPROVEMENT"


print("\nPERFORMANCE ASSESSMENT:")
print(f"Temperature prediction: {_grade(models_mae_temp[best_temp_mae_idx], 1.0, 2.0)}")
print(f"Rainfall prediction: {_grade(models_mae_rain[best_rain_mae_idx], 5.0, 10.0)}")

# 5. Show sample predictions vs actual
print("\n" + "="*60)
print("SAMPLE PREDICTIONS vs ACTUAL VALUES")
print("="*60)

print(f"{'Date':<12} {'Actual Temp':<12} {'Pred Temp':<12} {'Error':<10} {'Actual Rain':<12} {'Pred Rain':<12} {'Error':<10}")
print("-" * 90)

# Get dates from the cleaned dataframe
dates = df_clean.index[-len(y_temp):] if hasattr(df_clean, 'index') else range(len(y_temp))


def _short_date(stamp):
    """Render a midnight Timestamp as YYYY-MM-DD; anything else via str()."""
    # str(Timestamp) is 19 characters long and overflowed the 12-wide Date
    # column, pushing every following column out of line with the header.
    if isinstance(stamp, pd.Timestamp) and stamp == stamp.normalize():
        return stamp.strftime('%Y-%m-%d')
    return str(stamp)


# The first five rows of the modelling frame, errors shown as actual - predicted
for i in range(min(5, len(y_temp))):
    date_str = _short_date(dates[i]) if i < len(dates) else f"Day_{i+1}"
    temp_error = y_temp[i] - temp_predictions[i]
    rain_error = y_rain[i] - rain_predictions[i]

    print(f"{date_str:<12} {y_temp[i]:<12.2f} {temp_predictions[i]:<12.2f} {temp_error:<10.2f} "
          f"{y_rain[i]:<12.2f} {rain_predictions[i]:<12.2f} {rain_error:<10.2f}")

In [None]:
# Seasonal features
month_of_row = df_feat.index.month
df_feat['day_of_year'] = df_feat.index.dayofyear
df_feat['month'] = month_of_row
# Maps Dec-Feb to 1, Mar-May to 2, Jun-Aug to 3 and Sep-Nov to 4
df_feat['season'] = (month_of_row % 12 + 3) // 3

# Weather-specific features
# NOTE(review): each change is computed from the same row's observed value, so
# it includes the quantity being predicted - confirm before using these columns
# as model inputs.
for prefix, source_col in (('temp', temp_col), ('rain', rain_col)):
    df_feat[f'{prefix}_change'] = df_feat[source_col] - df_feat[f'{prefix}_lag_1']