# Modele baseline do predykcji deszczu

## Predykcje dzienne

Target:

- czy będzie padać w ciągu następnego dnia?

Modele baseline:

1. będzie tak jak wczoraj
2. losowo, prawdopodobieństwo deszczu 50%
3. losowo, prawdopodobieństwo deszczu proporcjonalne do udziału dni z deszczem w zbiorze treningowym
4. losowo, prawdopodobieństwo deszczu proporcjonalne do udziału dni z deszczem w zbiorze treningowym w danym miesiącu

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report,
    confusion_matrix
)
from pathlib import Path
from sklearn.linear_model import LinearRegression

import pandas as pd
import numpy as np

from sklearn.metrics import (
    classification_report,
    accuracy_score,
    roc_auc_score,
    confusion_matrix
)

# Apply the seaborn "whitegrid" look to every figure drawn in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

# Folder with the input files, relative to the current working directory.
DATA_DIR = Path('data')

# Load the observations. The 'time' column is parsed as datetimes and used
# as the index, which the resampling and slicing further down rely on.
df = pd.read_csv(
    DATA_DIR / 'combined_rain.csv',
    index_col='time',
    parse_dates=['time'],
)

In [56]:
# Collapse the raw records to one row per calendar day: precipitation is
# summed, the other readings are averaged. Two derived columns are added:
#   rain  - binary target, 1 when the daily precipitation sum is positive
#   month - calendar month taken from the index (used by the monthly baseline)
# NOTE(review): a day without any precipitation readings sums to 0 and is
# therefore labelled as dry -- confirm the input has no gaps.
daily = (
    df
    .resample("D")
    .agg({"prcp": "sum", "temp": "mean", "pres": "mean", "rhum": "mean"})
    .assign(
        rain=lambda frame: frame["prcp"].gt(0).astype(int),
        month=lambda frame: frame.index.month,
    )
)


In [57]:
# Number of most recent daily rows held out for evaluation.
TEST_DAYS = 365

# Chronological split: everything except the last TEST_DAYS rows is used
# for fitting, the last TEST_DAYS rows are used for evaluation.
train = daily.head(-TEST_DAYS)
test = daily.tail(TEST_DAYS)

# Binary rain labels of each part of the split.
y_train, y_test = train["rain"], test["rain"]


1. persistence

In [58]:
# Persistence baseline: the forecast for a day is the rain label observed
# on the previous day (the very first row has no predecessor and gets NaN).
y_pred_yesterday = daily["rain"].shift(periods=1)

# Keep only the forecasts that fall on the evaluation days.
y_pred_yesterday_test = y_pred_yesterday.loc[test.index]
print(classification_report(y_true=y_test, y_pred=y_pred_yesterday_test, digits=3))


              precision    recall  f1-score   support

           0      0.692     0.692     0.692       195
           1      0.647     0.647     0.647       170

    accuracy                          0.671       365
   macro avg      0.670     0.670     0.670       365
weighted avg      0.671     0.671     0.671       365



2. random 50%

In [59]:
# Seeded generator so the random baselines are reproducible.
rng = np.random.default_rng(42)

# Coin-flip baseline: one Bernoulli(0.5) draw per evaluation day,
# labelled with the evaluation dates.
y_pred_random_50 = pd.Series(
    rng.binomial(1, 0.5, len(test)),
    index=test.index,
)

print(classification_report(y_true=y_test, y_pred=y_pred_random_50, digits=3))


              precision    recall  f1-score   support

           0      0.522     0.492     0.507       195
           1      0.453     0.482     0.467       170

    accuracy                          0.488       365
   macro avg      0.487     0.487     0.487       365
weighted avg      0.490     0.488     0.488       365



3. losowo, prawdopodobieństwo deszczu proporcjonalne do udziału dni z deszczem w zbiorze treningowym

In [60]:
# Share of rainy days in the training period, used as the rain probability.
p_rain = y_train.mean()

# One Bernoulli(p_rain) draw per evaluation day. The draws come from the
# already existing `rng`, i.e. they continue its current random stream.
y_pred_random_prior = pd.Series(
    rng.binomial(1, p_rain, len(test)),
    index=test.index,
)
print(classification_report(y_true=y_test, y_pred=y_pred_random_prior, digits=3))



              precision    recall  f1-score   support

           0      0.558     0.518     0.537       195
           1      0.489     0.529     0.508       170

    accuracy                          0.523       365
   macro avg      0.524     0.524     0.523       365
weighted avg      0.526     0.523     0.524       365



4. losowo, prawdopodobieństwo deszczu proporcjonalne do udziału dni z deszczem w zbiorze treningowym w danym miesiącu


In [61]:
# dodaj miesiąc
train = train.copy()
test = test.copy()

# p(rain | month) z train
monthly_rain_prob = (
    train
    .groupby("month")["rain"]
    .mean()
)

monthly_rain_prob


month
1     0.532258
2     0.631579
3     0.532258
4     0.566667
5     0.441176
6     0.455556
7     0.559140
8     0.548387
9     0.466667
10    0.451613
11    0.533333
12    0.548387
Name: rain, dtype: float64

In [62]:
# Re-seed so this baseline is reproducible independently of the cells above.
rng = np.random.default_rng(seed=42)

# Rain probability for every evaluation day: P(rain | month) estimated on
# the training period. A month that never occurs in the training period has
# no entry in `monthly_rain_prob`; it falls back to the overall training
# rain rate.
p_rain_by_day = (
    test["month"]
    .map(monthly_rain_prob)
    .fillna(y_train.mean())
)

# One Bernoulli draw per evaluation day, each with its own probability.
# Passing the whole probability array to a single call replaces the
# row-by-row `iterrows()` loop, which also turned the month into a float.
y_pred_random_monthly = pd.Series(
    rng.binomial(n=1, p=p_rain_by_day.to_numpy()),
    index=test.index,
)
print(classification_report(y_test, y_pred_random_monthly, digits=3))

              precision    recall  f1-score   support

           0      0.583     0.559     0.571       195
           1      0.517     0.541     0.529       170

    accuracy                          0.551       365
   macro avg      0.550     0.550     0.550       365
weighted avg      0.552     0.551     0.551       365



In [63]:
# TODO: prediction errors vs. amount of rain

## Predykcje godzinowe

Target:

- czy będzie padać w ciągu następnej godziny?

Modele baseline:

1. będzie tak jak godzinę temu
2. losowo, prawdopodobieństwo deszczu 50%
3. losowo, prawdopodobieństwo deszczu proporcjonalne do udziału rekordów z deszczem w zbiorze treningowym w danym miesiącu

In [65]:
# Per-record binary target: 1 when the record reports any precipitation.
df["rain"] = df["prcp"].gt(0).astype(int)

# Size of the evaluation window in records: 365 days * 24 records per day
# (i.e. the last year, assuming one record per hour without gaps).
TEST_HOURS = 365 * 24

# Chronological split on the raw records. This rebinds `train`, `test`,
# `y_train` and `y_test`, which previously held the daily split.
train = df.head(-TEST_HOURS)
test = df.tail(TEST_HOURS)

y_train, y_test = train["rain"], test["rain"]


In [69]:
# przesunięcie o 1 godzinę
y_pred_prev_hour = df["rain"].shift(1)
y_pred_prev_hour_test = y_pred_prev_hour.loc[test.index]
print(
    classification_report(
        y_test,
        y_pred_prev_hour_test,
        digits=3
    )
)


              precision    recall  f1-score   support

           0      0.963     0.963     0.963      7814
           1      0.696     0.696     0.696       946

    accuracy                          0.934      8760
   macro avg      0.829     0.829     0.829      8760
weighted avg      0.934     0.934     0.934      8760



In [70]:
# Seeded generator so the random baselines are reproducible.
rng = np.random.default_rng(42)

# Coin-flip baseline: one Bernoulli(0.5) draw per evaluation record.
# The result is kept as a plain NumPy array in evaluation order.
y_pred_random_50 = rng.binomial(1, 0.5, len(test))

print(classification_report(y_true=y_test, y_pred=y_pred_random_50, digits=3))


              precision    recall  f1-score   support

           0      0.893     0.504     0.645      7814
           1      0.109     0.503     0.180       946

    accuracy                          0.504      8760
   macro avg      0.501     0.504     0.412      8760
weighted avg      0.809     0.504     0.594      8760



In [72]:
# P(rain | month): share of rainy records per calendar month, estimated on
# the training period only. The month is read from the datetime index, so
# this cell does not depend on a "month" column being present in `df`
# (no visible cell adds one).
monthly_rain_prob = (
    train["rain"]
    .groupby(train.index.month.rename("month"))
    .mean()
)

# Rain probability for every evaluation record, looked up by its month.
# A month that never occurs in the training period has no entry in
# `monthly_rain_prob`; it falls back to the overall training rain rate.
p_rain_by_hour = (
    monthly_rain_prob
    .reindex(test.index.month)
    .fillna(y_train.mean())
    .to_numpy()
)

# One Bernoulli draw per evaluation record, each with its own probability.
# The draws come from the already existing `rng` (not re-seeded here).
# A single array-valued call replaces the row-by-row `iterrows()` loop.
y_pred_random_monthly = pd.Series(
    rng.binomial(n=1, p=p_rain_by_hour),
    index=test.index,
)

print(
    classification_report(
        y_test,
        y_pred_random_monthly,
        digits=3
    )
)


              precision    recall  f1-score   support

           0      0.891     0.873     0.882      7814
           1      0.098     0.114     0.106       946

    accuracy                          0.791      8760
   macro avg      0.494     0.494     0.494      8760
weighted avg      0.805     0.791     0.798      8760

