In [None]:
# # 12. Syndicate of Civil Engineers (SynOCE): Climate Data Science

# **Stakeholders:**  
# Prathmesh Maharshi (+91 70159 21044), SynOCE IITGN  

# ---

# ## Team Size: 1 or 2

# ### Temporal Downscaling Challenge

# Climate and hydrological models often produce coarse-resolution outputs (e.g., monthly or yearly data), which are insufficient for applications requiring high-frequency insights, such as flood forecasting, urban water management, and climate adaptation. This challenge invites participants to develop machine learning (ML) or statistical approaches to **downscale coarse-resolution time series data (e.g., monthly rainfall, temperature, or river discharge)** into **high-resolution daily or hourly data** for enhanced civil and environmental engineering decision-making.

# #### Objectives
# - Develop a method to predict **finer-resolution (daily/hourly) values** from coarse data.
# - Evaluate model performance using statistical and physical consistency metrics.
# - Apply the downscaled data to real-world applications such as hydrology, disaster management, and climate modelling.

# #### Challenge Details

# **A. Provided Dataset**
# 1. **Coarse-resolution time series data** (e.g., monthly climate or hydrological variables).
# 2. **Observed high-resolution (daily/hourly) data** for validation.
# 3. Additional environmental parameters such as elevation, humidity, and wind speed (optional).

# **B. Expected Solution Approaches**  
# Participants may use one or a combination of the following techniques:

# 1. **Traditional Statistical Downscaling**
#    - Linear interpolation methods
#    - Multiple regression models
#    - Wavelet transform for multi-scale decomposition

# 2. **Machine Learning-Based Downscaling**
#    - Random Forest, XGBoost, or Support Vector Regression (SVR)
#    - Recurrent Neural Networks (RNNs) such as LSTM/GRU
#    - Gaussian Processes for Probabilistic Predictions

# 3. **Deep Learning Super-Resolution for Time Series**
#    - Temporal Convolutional Networks (TCN)
#    - Transformer-based models for time series forecasting
#    - Generative Adversarial Networks (GANs) for realistic high-frequency data

# **Evaluation Criteria**

# | Metric                        | Description                                                                 |
# |-------------------------------|-----------------------------------------------------------------------------|
# | Nash-Sutcliffe Efficiency (NSE) | Measures predictive power (closer to 1 is better)                          |
# | Root Mean Square Error (RMSE)  | Lower values indicate better accuracy                                      |
# | Mean Absolute Error (MAE)      | Measures absolute prediction error                                         |
# | Physical Consistency           | Does the predicted time series maintain realistic hydrological/climatic behaviour? |

# **Bonus Considerations**
# - Interpretability: Clear explanation of model behaviour.
# - Generalizability: Ability to apply the model to different datasets.
# - Computational Efficiency: Feasibility of real-time deployment.

# **Free Resources for Participants**
# 1. **Useful Libraries**
#    - **Scikit-learn**: Machine learning algorithms.
#    - **TensorFlow/PyTorch**: Deep learning frameworks.
#    - **Statsmodels**: Statistical modelling in Python.
#    - **xarray & pandas**: Time series data handling.

# ### Expected Outcomes
# - A trained model capable of generating daily/hourly time series from monthly data.
# - Performance analysis of different downscaling techniques.
# - Real-world application in hydrology, climate modelling, and disaster preparedness.

# This challenge aims to encourage innovative applications of ML/AI in civil and environmental engineering while equipping participants with valuable skills in spatiotemporal data analysis. Best of luck!

### Importing Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Seed every random number generator this notebook draws from
def set_seed(seed=42):
    """Seed the global random number generators for reproducible runs.

    Seeds both Python's built-in ``random`` module and NumPy's legacy
    global generator, so code drawing from either source is repeatable.

    Parameters
    ----------
    seed : int, default 42
        Seed value applied to every generator.
    """
    # Function-scope import keeps this cell runnable without touching the
    # notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)


# The notebook overrides the default and runs with seed 0.
seed = 0
set_seed(seed)

### Defining Important Functions

In [None]:
def plot_data(df, x_col, y_col, title, xlabel, ylabel, figsize=(15,5)):
    """Draw one column of ``df`` against another as a line chart and show it.

    Parameters
    ----------
    df : indexable by column name (presumably a pandas DataFrame)
        Source of the two series to plot.
    x_col, y_col : str
        Column names used for the horizontal and vertical axes.
    title, xlabel, ylabel : str
        Text for the figure title and the two axis labels.
    figsize : tuple, default (15, 5)
        Figure size passed to matplotlib.
    """
    # Explicit figure/axes pair instead of the implicit pyplot "current axes".
    _, axis = plt.subplots(figsize=figsize)
    axis.plot(df[x_col], df[y_col])
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel(ylabel)
    axis.grid()
    plt.show()

def calculate_nse(observed, predicted):
    """Calculate Nash-Sutcliffe Efficiency (NSE).

    NSE = 1 - sum((obs - pred)**2) / sum((obs - mean(obs))**2)

    Both inputs are converted to flat float arrays and compared
    position-by-position. This means plain lists are accepted, pandas
    Series are NOT re-aligned on their index, and a column vector of
    shape (n, 1) is treated the same as a flat vector of shape (n,)
    instead of being broadcast into an (n, n) matrix.

    Parameters
    ----------
    observed : array-like
        Reference values.
    predicted : array-like
        Model values, in the same order as ``observed``.

    Returns
    -------
    float
        The NSE score (1 is a perfect match). ``nan`` is returned when the
        observed values have zero variance, because the score is undefined
        in that case.
    """
    obs = np.asarray(observed, dtype=float).ravel()
    pred = np.asarray(predicted, dtype=float).ravel()

    residual_ss = np.sum((obs - pred) ** 2)
    variance_ss = np.sum((obs - np.mean(obs)) ** 2)

    # A constant observed series would otherwise trigger a division by zero.
    if variance_ss == 0:
        return np.nan

    return 1 - (residual_ss / variance_ss)

def calculate_rmse(observed, predicted):
    """Calculate Root Mean Square Error (RMSE).

    Both inputs are converted to flat float arrays and compared
    position-by-position, so plain lists are accepted, pandas Series are
    not re-aligned on their index, and an (n, 1) column vector is not
    broadcast against an (n,) vector.

    Parameters
    ----------
    observed : array-like
        Reference values.
    predicted : array-like
        Model values, in the same order as ``observed``.

    Returns
    -------
    float
        Square root of the mean squared difference (lower is better).
    """
    obs = np.asarray(observed, dtype=float).ravel()
    pred = np.asarray(predicted, dtype=float).ravel()

    squared_error = (obs - pred) ** 2
    return np.sqrt(np.mean(squared_error))


def calculate_mae(observed, predicted):
    """Calculate Mean Absolute Error (MAE).

    Both inputs are converted to flat float arrays and compared
    position-by-position, so plain lists are accepted, pandas Series are
    not re-aligned on their index, and an (n, 1) column vector is not
    broadcast against an (n,) vector.

    Parameters
    ----------
    observed : array-like
        Reference values.
    predicted : array-like
        Model values, in the same order as ``observed``.

    Returns
    -------
    float
        Mean of the absolute differences (lower is better).
    """
    obs = np.asarray(observed, dtype=float).ravel()
    pred = np.asarray(predicted, dtype=float).ravel()

    absolute_error = np.abs(obs - pred)
    return np.mean(absolute_error)

def calculate_physical_consistency(observed, predicted):
    """Check if predicted values are realistic (e.g., non-negative).

    Parameters
    ----------
    observed : array-like
        Not used by this check; kept so the signature matches the other
        metric functions.
    predicted : array-like
        Model values to validate. Plain lists are accepted.

    Returns
    -------
    numpy boolean
        True when every predicted value is >= 0. A NaN prediction makes the
        result False because NaN fails the ``>= 0`` comparison. An empty
        input yields True.
    """
    # Coerce first so a plain Python list does not raise on `>=`.
    pred = np.asarray(predicted, dtype=float)
    return np.all(pred >= 0)


### Loading Data

In [None]:
data_dir = './data/'

# Read the training CSV and convert its 'valid_time' column to datetimes
# in a single chained expression.
train_df = (
    pd.read_csv(os.path.join(data_dir, 'train_data.csv'))
    .assign(valid_time=lambda frame: pd.to_datetime(frame['valid_time']))
)
print("Number of rows in training data:", len(train_df))

# The validation and test splits are not loaded here; their loaders are kept
# below, commented out, mirroring the training load above.

# val_df = pd.read_csv(os.path.join(data_dir, 'val_data.csv'))
# val_df['valid_time'] = pd.to_datetime(val_df['valid_time'])
# print("Number of rows in validation data:", len(val_df))

# test_df = pd.read_csv(os.path.join(data_dir, 'test_data.csv'))
# test_df['valid_time'] = pd.to_datetime(test_df['valid_time'])
# print("Number of rows in test data:", len(test_df))

Number of rows in training data: 87672
