### **1. Open Dataset**

In [1]:
import xarray as xr

ds = xr.open_dataset('dataset_precip.nc')
print(ds)

<xarray.Dataset> Size: 15kB
Dimensions:  (time: 730)
Coordinates:
  * time     (time) datetime64[ns] 6kB 2022-01-01T12:00:00 ... 2023-12-31T12:...
Data variables:
    tcwv     (time) float32 3kB ...
    tcc      (time) float32 3kB ...
    tp       (time) float32 3kB ...
Attributes:
    description:    Merged ERA5 and MODIS AOD dataset
    creation_date:  2024-12-29
    time_zone:      UTC
    region:         51E-52E, 35N-36N



### **2. Handle Missing Data (if any)**
Check if there are any missing values and handle them appropriately.


In [2]:
import pandas as pd

df = ds.to_dataframe().reset_index()

# print(df.isnull().sum())

df = df.dropna()

#rescale TP to mm/day
df[['tp']] = df[['tp']] * 1000


### **3. Train-Test Split for Modeling**
Use the DataFrame for machine learning.


In [3]:
#train test split the dataset
from sklearn.model_selection import train_test_split


X = df[['tcwv', 'tcc']]
y = df[['tp']] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)


print(X_train.shape)
print(X_test.shape)

(584, 2)
(146, 2)


### **4. Train a Decision Tree Regressor**
Train the regressor using the data.


In [4]:
#train a Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

#setup the model
DTR = DecisionTreeRegressor(max_depth=5, random_state=42)

DTR.fit(X_train, y_train)


y_predict = DTR.predict(X_test)

mse = mean_squared_error(y_predict, y_test)

print("Mean Squared Error:", mse)

Mean Squared Error: 0.06816359397164122



### **5. Interpret the Tree**
Interpret the tree to understand how it makes predictions.

In [5]:
from sklearn.tree import export_text

tree_rules = export_text(DTR, feature_names=['tcwv', 'tcc'])
print(f'Tree Rules:\n {tree_rules}')


Tree Rules:
 |--- tcc <= 0.73
|   |--- tcc <= 0.25
|   |   |--- tcc <= 0.03
|   |   |   |--- tcc <= 0.01
|   |   |   |   |--- tcwv <= 9.71
|   |   |   |   |   |--- value: [0.00]
|   |   |   |   |--- tcwv >  9.71
|   |   |   |   |   |--- value: [0.00]
|   |   |   |--- tcc >  0.01
|   |   |   |   |--- tcwv <= 12.36
|   |   |   |   |   |--- value: [0.01]
|   |   |   |   |--- tcwv >  12.36
|   |   |   |   |   |--- value: [0.02]
|   |   |--- tcc >  0.03
|   |   |   |--- tcwv <= 11.98
|   |   |   |   |--- tcwv <= 6.86
|   |   |   |   |   |--- value: [0.01]
|   |   |   |   |--- tcwv >  6.86
|   |   |   |   |   |--- value: [0.02]
|   |   |   |--- tcwv >  11.98
|   |   |   |   |--- tcwv <= 12.16
|   |   |   |   |   |--- value: [0.20]
|   |   |   |   |--- tcwv >  12.16
|   |   |   |   |   |--- value: [0.03]
|   |--- tcc >  0.25
|   |   |--- tcwv <= 12.09
|   |   |   |--- tcwv <= 10.22
|   |   |   |   |--- tcwv <= 5.21
|   |   |   |   |   |--- value: [0.00]
|   |   |   |   |--- tcwv >  5.21
|   |

### **Predict for a New Day**
Use the trained model to predict precipitation for new conditions.


In [6]:
# Example: Predict for tcwv=20.0, tcc=0.7
sample_prediction = DTR.predict([[13.0, 0.7]])
print(f"Predicted precipitation for tcwv=20.0, tcc=0.7: {sample_prediction[0]:.2f} mm")


Predicted precipitation for tcwv=20.0, tcc=0.7: 0.10 mm





## Lag Features?

- **Temporal Dependency**: Today’s precipitation might depend on how water vapor or cloud cover evolved in the days leading up to today.  
- **Improved Predictive Power**: Adding lags often improves accuracy because weather processes have memory (e.g., if it was very humid yesterday, it might still be humid today).


In [7]:
# Create 1-day lag features
df['tcwv_lag1'] = df['tcwv'].shift(1)
df['tcc_lag1']  = df['tcc'].shift(1)
df['tp_lag1']   = df['tp'].shift(1)

# Create 2-day lag features
df['tcwv_lag2'] = df['tcwv'].shift(2)
df['tcc_lag2']  = df['tcc'].shift(2)
df['tp_lag2']   = df['tp'].shift(2)


In [8]:
df = df.dropna(subset=['tcwv_lag1', 'tcc_lag1', 'tp_lag1', 
                       'tcwv_lag2', 'tcc_lag2', 'tp_lag2'])


In [9]:
# Sort by time just to be safe
df = df.sort_values('time').reset_index(drop=True)

# Choose a cutoff date for training vs. testing
cutoff_date = '2023-07-01'
df_train = df[df['time'] < cutoff_date].copy()
df_test  = df[df['time'] >= cutoff_date].copy()

# Define features (X) and target (y)
FEATURES = [
    'tcwv', 'tcc', 
    'tp_lag1', 'tcwv_lag1', 'tcc_lag1',
    'tp_lag2', 'tcwv_lag2', 'tcc_lag2'
]
X_train = df_train[FEATURES]
y_train = df_train['tp']  # Predict today's precipitation

X_test = df_test[FEATURES]
y_test = df_test['tp']


In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Initialize the model (tune max_depth, min_samples_leaf, etc. as needed)
DTR_2 = DecisionTreeRegressor(max_depth=5, random_state=42)

# Fit on the training set
DTR_2.fit(X_train, y_train)

# Predict on the test set
y_pred = DTR_2.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error with lag features:", mse)


Mean Squared Error with lag features: 0.007854040315639157


In [11]:
from sklearn.tree import export_text

tree_rules = export_text(DTR_2, feature_names=FEATURES)
print(tree_rules)


|--- tcc <= 0.60
|   |--- tcwv_lag1 <= 22.13
|   |   |--- tcc <= 0.18
|   |   |   |--- tcc_lag1 <= 0.99
|   |   |   |   |--- tcc <= 0.02
|   |   |   |   |   |--- value: [0.00]
|   |   |   |   |--- tcc >  0.02
|   |   |   |   |   |--- value: [0.01]
|   |   |   |--- tcc_lag1 >  0.99
|   |   |   |   |--- tcc_lag1 <= 1.00
|   |   |   |   |   |--- value: [0.28]
|   |   |   |   |--- tcc_lag1 >  1.00
|   |   |   |   |   |--- value: [0.02]
|   |   |--- tcc >  0.18
|   |   |   |--- tcwv <= 12.38
|   |   |   |   |--- tcc <= 0.18
|   |   |   |   |   |--- value: [0.20]
|   |   |   |   |--- tcc >  0.18
|   |   |   |   |   |--- value: [0.02]
|   |   |   |--- tcwv >  12.38
|   |   |   |   |--- tcwv_lag1 <= 8.27
|   |   |   |   |   |--- value: [0.35]
|   |   |   |   |--- tcwv_lag1 >  8.27
|   |   |   |   |   |--- value: [0.06]
|   |--- tcwv_lag1 >  22.13
|   |   |--- tcwv_lag2 <= 29.47
|   |   |   |--- tcc_lag2 <= 0.85
|   |   |   |   |--- tcwv_lag1 <= 25.74
|   |   |   |   |   |--- value: [0.04]
|   

In [12]:
# Input values for prediction
new_data = {
    'tcwv':       [17.0],  
    'tcc':        [0.7],   
    'tp_lag1':    [0.0],    
    'tcwv_lag1':  [18.2],   
    'tcc_lag1':   [1.0],   
    'tp_lag2':    [0.0],   
    'tcwv_lag2':  [25.0],   
    'tcc_lag2':   [0.95]
}
# Create a DataFrame
new_input = pd.DataFrame(new_data)


print(new_input)


   tcwv  tcc  tp_lag1  tcwv_lag1  tcc_lag1  tp_lag2  tcwv_lag2  tcc_lag2
0  17.0  0.7      0.0       18.2       1.0      0.0       25.0      0.95


In [13]:

# Predict using the trained model
predicted_tp = DTR_2.predict(new_input)
print(f"Predicted precipitation: {predicted_tp[0]:.2f} mm")

Predicted precipitation: 2.09 mm
