# Climate Change Impact Assessment and Prediction System for Nepal

This notebook analyzes climate data for Nepal from 2000-2023 to identify trends and patterns related to climate change.

## 1. Data Loading and Initial Exploration

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
sns.set(style='whitegrid')
plt.style.use('fivethirtyeight')

# Display all columns
pd.set_option('display.max_columns', None)

In [10]:
# Load the climate data
df = pd.read_csv('nepal_climate_data_2000_2023.csv')

# Display basic information
print(f'Dataset shape: {df.shape}')
df.head()

Dataset shape: (394470, 9)


Unnamed: 0,YEAR,MO,DY,T2M,T2M_MAX,T2M_MIN,PRECTOTCORR,latitude,longitude
0,2000,1,1,15.36,25.61,7.14,0.0,26,80
1,2000,1,2,15.99,25.08,7.91,0.0,26,80
2,2000,1,3,16.16,24.95,6.35,0.0,26,80
3,2000,1,4,16.16,25.49,6.06,0.0,26,80
4,2000,1,5,16.19,25.24,8.54,0.0,26,80


In [4]:
# Check for missing values
df.info()
print('\nMissing values per column:')
print(df.isnull().sum())

# Basic statistics
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394470 entries, 0 to 394469
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0    YEAR        394470 non-null  int64  
 1   MO           394470 non-null  int64  
 2   DY           394470 non-null  int64  
 3   T2M          394470 non-null  float64
 4   T2M_MAX      394470 non-null  float64
 5   T2M_MIN      394470 non-null  float64
 6   PRECTOTCORR  394470 non-null  float64
 7   latitude     394470 non-null  int64  
 8   longitude    394470 non-null  int64  
dtypes: float64(4), int64(5)
memory usage: 27.1 MB

Missing values per column:
 YEAR          0
MO             0
DY             0
T2M            0
T2M_MAX        0
T2M_MIN        0
PRECTOTCORR    0
latitude       0
longitude      0
dtype: int64


Unnamed: 0,YEAR,MO,DY,T2M,T2M_MAX,T2M_MIN,PRECTOTCORR,latitude,longitude
count,394470.0,394470.0,394470.0,394470.0,394470.0,394470.0,394470.0,394470.0,394470.0
mean,2011.498973,6.52293,15.729637,13.636265,20.233459,8.120974,2.547686,28.0,84.0
std,6.922245,3.448708,8.800103,14.43521,13.918488,14.866488,6.948993,1.414215,2.581992
min,2000.0,1.0,1.0,-29.36,-20.55,-40.35,0.0,26.0,80.0
25%,2005.0,4.0,8.0,2.8,9.67,-3.35,0.0,27.0,82.0
50%,2011.5,7.0,16.0,16.19,22.99,10.51,0.03,28.0,84.0
75%,2017.0,10.0,23.0,26.07,31.26,20.93,1.8,29.0,86.0
max,2023.0,12.0,31.0,42.19,49.84,36.18,287.54,30.0,88.0


## 2. Data Preprocessing

In [11]:
# Create date column
df['date'] = pd.to_datetime(df[['YEAR', 'MO', 'DY']])

# Create season column
def get_season(month):
    if month in [12, 1, 2]:  # Winter
        return 'Winter'
    elif month in [3, 4, 5]:  # Spring
        return 'Spring'
    elif month in [6, 7, 8]:  # Summer/Monsoon
        return 'Summer'
    else:  # Fall
        return 'Fall'

df['season'] = df['MO'].apply(get_season)

# Create derived features
df['temp_range'] = df['T2M_MAX'] - df['T2M_MIN']  # Daily temperature range

# Check the processed data
df.head()

KeyError: "['YEAR'] not in index"

## 3. Exploratory Data Analysis

In [None]:
# Annual temperature trends
yearly_temp = df.groupby('YEAR')[['T2M', 'T2M_MAX', 'T2M_MIN']].mean().reset_index()

plt.figure(figsize=(12, 6))
plt.plot(yearly_temp['YEAR'], yearly_temp['T2M'], marker='o', linewidth=2, label='Average Temperature')
plt.plot(yearly_temp['YEAR'], yearly_temp['T2M_MAX'], marker='^', linewidth=2, label='Max Temperature')
plt.plot(yearly_temp['YEAR'], yearly_temp['T2M_MIN'], marker='v', linewidth=2, label='Min Temperature')
plt.title('Annual Temperature Trends in Nepal (2000-2023)', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Temperature (°C)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Precipitation trends
yearly_precip = df.groupby('YEAR')['PRECTOTCORR'].sum().reset_index()

plt.figure(figsize=(12, 6))
plt.bar(yearly_precip['YEAR'], yearly_precip['PRECTOTCORR'], color='skyblue')
plt.title('Annual Precipitation in Nepal (2000-2023)', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Total Precipitation (mm)', fontsize=12)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Seasonal analysis
seasonal_temp = df.groupby(['YEAR', 'season'])[['T2M', 'PRECTOTCORR']].mean().reset_index()

plt.figure(figsize=(14, 8))
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_data = seasonal_temp[seasonal_temp['season'] == season]
    plt.plot(season_data['YEAR'], season_data['T2M'], marker='o', linewidth=2, label=season)

plt.title('Seasonal Temperature Trends in Nepal (2000-2023)', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Average Temperature (°C)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Temperature distribution by season
plt.figure(figsize=(14, 8))
sns.boxplot(x='season', y='T2M', data=df)
plt.title('Temperature Distribution by Season (2000-2023)', fontsize=16)
plt.xlabel('Season', fontsize=12)
plt.ylabel('Temperature (°C)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Extreme weather events analysis
# Define extreme temperature thresholds (adjust based on Nepal's climate)
extreme_heat_threshold = df['T2M_MAX'].quantile(0.95)  # 95th percentile for extreme heat
extreme_cold_threshold = df['T2M_MIN'].quantile(0.05)  # 5th percentile for extreme cold
heavy_rain_threshold = df['PRECTOTCORR'].quantile(0.95)  # 95th percentile for heavy rain

# Count extreme events by year
df['extreme_heat'] = df['T2M_MAX'] > extreme_heat_threshold
df['extreme_cold'] = df['T2M_MIN'] < extreme_cold_threshold
df['heavy_rain'] = df['PRECTOTCORR'] > heavy_rain_threshold

extreme_events = df.groupby('YEAR').agg({
    'extreme_heat': 'sum',
    'extreme_cold': 'sum',
    'heavy_rain': 'sum'
}).reset_index()

plt.figure(figsize=(14, 8))
plt.plot(extreme_events['YEAR'], extreme_events['extreme_heat'], marker='o', linewidth=2, label='Extreme Heat Days')
plt.plot(extreme_events['YEAR'], extreme_events['extreme_cold'], marker='^', linewidth=2, label='Extreme Cold Days')
plt.plot(extreme_events['YEAR'], extreme_events['heavy_rain'], marker='s', linewidth=2, label='Heavy Rain Days')
plt.title('Extreme Weather Events in Nepal (2000-2023)', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Days', fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

## 4. Machine Learning Model Development

In [None]:
# Prepare data for modeling
# We'll create a model to predict average temperature based on various features

# Create time-based features
df['year'] = df['YEAR']
df['month'] = df['MO']
df['day'] = df['DY']
df['day_of_year'] = df['date'].dt.dayofyear

# One-hot encode season
season_dummies = pd.get_dummies(df['season'], prefix='season')
df = pd.concat([df, season_dummies], axis=1)

# Select features and target
features = ['year', 'month', 'day', 'day_of_year', 'latitude', 'longitude',
           'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter']

# For temperature prediction
X_temp = df[features]
y_temp = df['T2M']

# For precipitation prediction
X_precip = df[features]
y_precip = df['PRECTOTCORR']

# Split data into training and testing sets
X_temp_train, X_temp_test, y_temp_train, y_temp_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)
X_precip_train, X_precip_test, y_precip_train, y_precip_test = train_test_split(X_precip, y_precip, test_size=0.2, random_state=42)

In [None]:
# Train temperature prediction models
# Linear Regression
lr_temp = LinearRegression()
lr_temp.fit(X_temp_train, y_temp_train)
lr_temp_pred = lr_temp.predict(X_temp_test)

# Random Forest
rf_temp = RandomForestRegressor(n_estimators=100, random_state=42)
rf_temp.fit(X_temp_train, y_temp_train)
rf_temp_pred = rf_temp.predict(X_temp_test)

# Gradient Boosting
gb_temp = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_temp.fit(X_temp_train, y_temp_train)
gb_temp_pred = gb_temp.predict(X_temp_test)

# Evaluate temperature models
print('Temperature Prediction Models Evaluation:
')
print('Linear Regression:')
print(f'RMSE: {np.sqrt(mean_squared_error(y_temp_test, lr_temp_pred)):.4f}')
print(f'MAE: {mean_absolute_error(y_temp_test, lr_temp_pred):.4f}')
print(f'R²: {r2_score(y_temp_test, lr_temp_pred):.4f}
')

print('Random Forest:')
print(f'RMSE: {np.sqrt(mean_squared_error(y_temp_test, rf_temp_pred)):.4f}')
print(f'MAE: {mean_absolute_error(y_temp_test, rf_temp_pred):.4f}')
print(f'R²: {r2_score(y_temp_test, rf_temp_pred):.4f}
')

print('Gradient Boosting:')
print(f'RMSE: {np.sqrt(mean_squared_error(y_temp_test, gb_temp_pred)):.4f}')
print(f'MAE: {mean_absolute_error(y_temp_test, gb_temp_pred):.4f}')
print(f'R²: {r2_score(y_temp_test, gb_temp_pred):.4f}')

In [None]:
# Train precipitation prediction models
# Linear Regression
lr_precip = LinearRegression()
lr_precip.fit(X_precip_train, y_precip_train)
lr_precip_pred = lr_precip.predict(X_precip_test)

# Random Forest
rf_precip = RandomForestRegressor(n_estimators=100, random_state=42)
rf_precip.fit(X_precip_train, y_precip_train)
rf_precip_pred = rf_precip.predict(X_precip_test)

# Gradient Boosting
gb_precip = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_precip.fit(X_precip_train, y_precip_train)
gb_precip_pred = gb_precip.predict(X_precip_test)

# Evaluate precipitation models
print('Precipitation Prediction Models Evaluation:
')
print('Linear Regression:')
print(f'RMSE: {np.sqrt(mean_squared_error(y_precip_test, lr_precip_pred)):.4f}')
print(f'MAE: {mean_absolute_error(y_precip_test, lr_precip_pred):.4f}')
print(f'R²: {r2_score(y_precip_test, lr_precip_pred):.4f}
')

print('Random Forest:')
print(f'RMSE: {np.sqrt(mean_squared_error(y_precip_test, rf_precip_pred)):.4f}')
print(f'MAE: {mean_absolute_error(y_precip_test, rf_precip_pred):.4f}')
print(f'R²: {r2_score(y_precip_test, rf_precip_pred):.4f}
')

print('Gradient Boosting:')
print(f'RMSE: {np.sqrt(mean_squared_error(y_precip_test, gb_precip_pred)):.4f}')
print(f'MAE: {mean_absolute_error(y_precip_test, gb_precip_pred):.4f}')
print(f'R²: {r2_score(y_precip_test, gb_precip_pred):.4f}')

In [None]:
# Feature importance analysis
# For temperature model
feature_importance_temp = pd.DataFrame({
    'Feature': features,
    'Importance': rf_temp.feature_importances_
})
feature_importance_temp = feature_importance_temp.sort_values('Importance', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_temp)
plt.title('Feature Importance for Temperature Prediction', fontsize=16)
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Future Climate Prediction

In [None]:
# Create future prediction dataset (for next 5 years)
future_years = list(range(2024, 2029))
future_data = []

for year in future_years:
    for month in range(1, 13):  # 12 months
        days_in_month = 30  # Simplified
        for day in range(1, days_in_month + 1):
            date = pd.Timestamp(year=year, month=month, day=day)
            season = get_season(month)
            
            # Create a row for each location (simplified to one location for this example)
            future_data.append({
                'year': year,
                'month': month,
                'day': day,
                'day_of_year': date.dayofyear,
                'latitude': 26,  # Using the same location as in the dataset
                'longitude': 80,
                'season_Fall': 1 if season == 'Fall' else 0,
                'season_Spring': 1 if season == 'Spring' else 0,
                'season_Summer': 1 if season == 'Summer' else 0,
                'season_Winter': 1 if season == 'Winter' else 0
            })

future_df = pd.DataFrame(future_data)

# Predict future temperatures and precipitation
future_df['predicted_temp'] = gb_temp.predict(future_df[features])
future_df['predicted_precip'] = gb_precip.predict(future_df[features])

# Aggregate by year and month for visualization
future_monthly = future_df.groupby(['year', 'month']).agg({
    'predicted_temp': 'mean',
    'predicted_precip': 'sum'
}).reset_index()

# Create date column for plotting
future_monthly['date'] = pd.to_datetime(future_monthly[['year', 'month']].assign(day=1))

# Plot future temperature predictions
plt.figure(figsize=(14, 6))
plt.plot(future_monthly['date'], future_monthly['predicted_temp'], marker='o', linewidth=2)
plt.title('Predicted Monthly Average Temperature (2024-2028)', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Temperature (°C)', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Save Models for Streamlit App

In [None]:
# Save the best models for use in the Streamlit app
import pickle

# Save temperature model
with open('temp_model.pkl', 'wb') as f:
    pickle.dump(gb_temp, f)

# Save precipitation model
with open('precip_model.pkl', 'wb') as f:
    pickle.dump(gb_precip, f)

print('Models saved successfully!')