In [48]:
import pandas as pd
import numpy as np
data = pd.read_csv("./C0A880_2023_daily.csv")
data = data.rename(columns={"Unnamed: 0": "obs_time"})
data = data.drop_duplicates(subset='obs_time', keep='first')
data = data.set_index("obs_time")
data = data.drop(columns=["SunShine"])

num_cols = data.select_dtypes(include="number").columns
data[num_cols] = data[num_cols].mask(data[num_cols] < 0, np.nan)



In [49]:
null_pct = data.select_dtypes(include="number").apply(pd.isnull).sum()/data.shape[0]
null_pct

StnPres        0.740437
StnPresMax     0.120219
StnPresMin     0.120219
Tx             0.740437
TxMaxAbs       0.120219
TxMinAbs       0.120219
RH             0.740437
RHMin          0.120219
WS             0.740437
WD             0.740437
WSGust         0.120219
WDGust         0.120219
Precp          0.000000
GloblRad       0.010929
TxSoil0cm      0.010929
TxSoil5cm      0.010929
TxSoil10cm     0.010929
TxSoil20cm     0.010929
TxSoil50cm     0.010929
TxSoil100cm    0.010929
dtype: float64

In [50]:
valid_columns = list(null_pct[null_pct < 0.5].index)

if "Precp" not in valid_columns:
    valid_columns.append("Precp")



In [52]:
data = data[valid_columns].copy()

In [54]:
data = data.ffill()

In [55]:
data.apply(pd.isnull).sum()

StnPresMax     0
StnPresMin     0
TxMaxAbs       0
TxMinAbs       0
RHMin          0
WSGust         0
WDGust         0
Precp          0
GloblRad       0
TxSoil0cm      0
TxSoil5cm      0
TxSoil10cm     0
TxSoil20cm     0
TxSoil50cm     0
TxSoil100cm    0
dtype: int64

In [56]:
data.index = pd.to_datetime(data.index)

In [57]:
data.index.year.value_counts().sort_index()

obs_time
2023    365
2024      1
Name: count, dtype: int64

In [58]:
data

Unnamed: 0_level_0,StnPresMax,StnPresMin,TxMaxAbs,TxMinAbs,RHMin,WSGust,WDGust,Precp,GloblRad,TxSoil0cm,TxSoil5cm,TxSoil10cm,TxSoil20cm,TxSoil50cm,TxSoil100cm
obs_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-01,1023.4,1020.1,18.7,16.5,82.0,7.7,63.0,0.0,1.00,17.1,17.0,16.8,16.6,16.3,16.6
2023-01-02,1023.6,1019.3,21.9,16.1,65.0,11.0,50.0,27.0,3.19,18.0,17.8,17.6,17.2,16.7,16.7
2023-01-03,1024.6,1021.4,16.6,14.1,92.0,9.3,53.0,54.5,1.60,16.7,16.9,17.0,17.2,17.2,16.9
2023-01-04,1025.0,1021.0,17.5,14.3,90.0,8.2,11.0,38.5,0.60,16.0,16.1,16.2,16.4,16.9,17.1
2023-01-05,1022.6,1019.0,20.4,17.1,92.0,7.7,27.0,14.5,3.00,18.0,17.8,17.4,17.0,16.8,17.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-28,1022.6,1019.3,20.4,17.4,90.0,4.9,295.0,8.5,2.20,17.8,17.7,17.6,17.4,17.5,18.5
2023-12-29,1019.7,1017.7,18.8,18.2,93.0,5.5,47.0,12.5,0.60,18.2,18.1,18.0,17.9,17.9,18.6
2023-12-30,1019.7,1017.7,18.8,18.2,93.0,5.5,47.0,7.5,6.59,18.7,18.6,18.5,18.3,18.2,18.7
2023-12-31,1022.7,1014.6,20.6,14.6,67.0,8.2,43.0,0.0,2.20,17.8,18.0,18.1,18.2,18.5,18.9


In [59]:
def rain_level(mm):
    if mm < 0.5:
        return "no rain"
    elif mm < 7.5:
        return "small rain"
    else:
        return "pouring rain"

data["rain_category"] = data["Precp"].shift(-1).apply(rain_level)

In [60]:
rain_counts = data["rain_category"].value_counts()

In [62]:
rain_counts

rain_category
no rain         184
pouring rain    101
small rain       81
Name: count, dtype: int64

In [63]:
data

Unnamed: 0_level_0,StnPresMax,StnPresMin,TxMaxAbs,TxMinAbs,RHMin,WSGust,WDGust,Precp,GloblRad,TxSoil0cm,TxSoil5cm,TxSoil10cm,TxSoil20cm,TxSoil50cm,TxSoil100cm,rain_category
obs_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2023-01-01,1023.4,1020.1,18.7,16.5,82.0,7.7,63.0,0.0,1.00,17.1,17.0,16.8,16.6,16.3,16.6,pouring rain
2023-01-02,1023.6,1019.3,21.9,16.1,65.0,11.0,50.0,27.0,3.19,18.0,17.8,17.6,17.2,16.7,16.7,pouring rain
2023-01-03,1024.6,1021.4,16.6,14.1,92.0,9.3,53.0,54.5,1.60,16.7,16.9,17.0,17.2,17.2,16.9,pouring rain
2023-01-04,1025.0,1021.0,17.5,14.3,90.0,8.2,11.0,38.5,0.60,16.0,16.1,16.2,16.4,16.9,17.1,pouring rain
2023-01-05,1022.6,1019.0,20.4,17.1,92.0,7.7,27.0,14.5,3.00,18.0,17.8,17.4,17.0,16.8,17.1,pouring rain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-28,1022.6,1019.3,20.4,17.4,90.0,4.9,295.0,8.5,2.20,17.8,17.7,17.6,17.4,17.5,18.5,pouring rain
2023-12-29,1019.7,1017.7,18.8,18.2,93.0,5.5,47.0,12.5,0.60,18.2,18.1,18.0,17.9,17.9,18.6,pouring rain
2023-12-30,1019.7,1017.7,18.8,18.2,93.0,5.5,47.0,7.5,6.59,18.7,18.6,18.5,18.3,18.2,18.7,no rain
2023-12-31,1022.7,1014.6,20.6,14.6,67.0,8.2,43.0,0.0,2.20,17.8,18.0,18.1,18.2,18.5,18.9,no rain


In [65]:
y = data.rain_category
X = data[valid_columns]
label_map = {"no rain": 0, "small rain": 1, "pouring rain": 2}
y_encoded = y.map(label_map)

In [68]:
y_encoded

obs_time
2023-01-01    2
2023-01-02    2
2023-01-03    2
2023-01-04    2
2023-01-05    2
             ..
2023-12-28    2
2023-12-29    2
2023-12-30    0
2023-12-31    0
2024-01-01    2
Name: rain_category, Length: 366, dtype: int64

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# 1) Split
train_X, val_X, train_y, val_y = train_test_split(
    X, y_encoded, random_state=0, shuffle=False
)

# 2) Train a classifier
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
    class_weight="balanced"  # helps if classes are imbalanced
)
clf.fit(train_X, train_y)

# 3) Predict and evaluate
y_pred = clf.predict(val_X)
print("Accuracy:", accuracy_score(val_y, y_pred))
print("\nClassification Report:\n", classification_report(val_y, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(val_y, y_pred))

Accuracy: 0.45652173913043476

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.94      0.58        35
           1       0.50      0.19      0.28        21
           2       1.00      0.14      0.24        36

    accuracy                           0.46        92
   macro avg       0.64      0.42      0.37        92
weighted avg       0.66      0.46      0.38        92


Confusion Matrix:
 [[33  2  0]
 [17  4  0]
 [29  2  5]]


In [None]:
clf.predict()