In [9]:
# Importerer nødvendige biblioteker for databehandling, ML og evaluering
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [10]:
# Leser inn værdata for alle kommuner og filtrerer bort manglende verdier
df = pd.read_csv("../data/processed/historisk_vaer_alle_kommuner_geo.csv")

# Fjern rader der noen værdata mangler
df = df.dropna(subset=[
    'sum(precipitation_amount P1D)',
    'mean(air_temperature P1D)',
    'mean(wind_speed P1D)'
])

# Konverter tid til datetime og sorter per kommune og tid
df["time"] = pd.to_datetime(df["time"])
df = df.sort_values(by=["kommune", "time"])

# Skriv ut datastrukturen
print("Kolonner:", df.columns.tolist())
print("Antall rader:", len(df))
df.head()


Kolonner: ['time', 'station', 'sum(precipitation_amount P1D)', 'kommune', 'mean(air_temperature P1D)', 'mean(wind_speed P1D)', 'latitude', 'longitude']
Antall rader: 240595


Unnamed: 0,time,station,sum(precipitation_amount P1D),kommune,mean(air_temperature P1D),mean(wind_speed P1D),latitude,longitude
58443,1980-01-01 00:00:00+00:00,SN87110:0,4.7,ANDØY,-1.9,13.9,69.3073,16.1312
58444,1980-01-02 00:00:00+00:00,SN87110:0,13.5,ANDØY,-2.9,4.8,69.3073,16.1312
58445,1980-01-03 00:00:00+00:00,SN87110:0,0.0,ANDØY,-5.1,7.0,69.3073,16.1312
58446,1980-01-04 00:00:00+00:00,SN87110:0,0.0,ANDØY,-4.7,6.9,69.3073,16.1312
58447,1980-01-05 00:00:00+00:00,SN87110:0,0.0,ANDØY,-3.5,7.5,69.3073,16.1312


In [12]:
# Lager værfeatures for de siste 7 dagene (lag 1 til 7) for temperatur, nedbør og vind
def create_lag_features(df, n_lags=7):
    for lag in range(1, n_lags + 1):
        df[f"temp_lag_{lag}"] = df["mean(air_temperature P1D)"].shift(lag)
        df[f"nedbor_lag_{lag}"] = df["sum(precipitation_amount P1D)"].shift(lag)
        df[f"vind_lag_{lag}"] = df["mean(wind_speed P1D)"].shift(lag)
    return df


In [13]:
# Lagrer kommunevis features og bygger target (temperatur neste dag)
def create_features_for_all(df, n_lags=7):
    df_all = []
    for kommune, group in df.groupby("kommune"):
        group = group.copy()
        group = create_lag_features(group, n_lags)
        group["month"] = group["time"].dt.month
        group["dayofyear"] = group["time"].dt.dayofyear
        group["target_temp_t+1"] = group["mean(air_temperature P1D)"].shift(-1)
        df_all.append(group)
    return pd.concat(df_all).dropna()

df_all_features = create_features_for_all(df)
print("Features klart! Antall rader etter lagging og shifting:", len(df_all_features))
df_all_features.head()


Features klart! Antall rader etter lagging og shifting: 240242


Unnamed: 0,time,station,sum(precipitation_amount P1D),kommune,mean(air_temperature P1D),mean(wind_speed P1D),latitude,longitude,temp_lag_1,nedbor_lag_1,...,vind_lag_5,temp_lag_6,nedbor_lag_6,vind_lag_6,temp_lag_7,nedbor_lag_7,vind_lag_7,month,dayofyear,target_temp_t+1
58450,1980-01-08 00:00:00+00:00,SN87110:0,2.2,ANDØY,0.8,3.6,69.3073,16.1312,-0.9,0.3,...,7.0,-2.9,13.5,4.8,-1.9,4.7,13.9,1,8,0.9
58451,1980-01-09 00:00:00+00:00,SN87110:0,0.2,ANDØY,0.9,3.8,69.3073,16.1312,0.8,2.2,...,6.9,-5.1,0.0,7.0,-2.9,13.5,4.8,1,9,-1.1
58452,1980-01-10 00:00:00+00:00,SN87110:0,0.0,ANDØY,-1.1,5.2,69.3073,16.1312,0.9,0.2,...,7.5,-4.7,0.0,6.9,-5.1,0.0,7.0,1,10,-1.3
58453,1980-01-11 00:00:00+00:00,SN87110:0,0.0,ANDØY,-1.3,5.0,69.3073,16.1312,-1.1,0.0,...,6.9,-3.5,0.0,7.5,-4.7,0.0,6.9,1,11,-0.6
58454,1980-01-12 00:00:00+00:00,SN87110:0,1.1,ANDØY,-0.6,8.9,69.3073,16.1312,-1.3,0.0,...,4.8,-3.0,0.0,6.9,-3.5,0.0,7.5,1,12,1.6


In [14]:
# Gjør kommune til en tallkode som modellen kan bruke
le = LabelEncoder()
df_all_features["kommune_encoded"] = le.fit_transform(df_all_features["kommune"])

# Definer input-funksjoner (features) og målvariabel (temperatur neste dag)
feature_cols = [col for col in df_all_features.columns if "lag" in col] + \
               ["month", "dayofyear", "latitude", "longitude", "kommune_encoded"]
X = df_all_features[feature_cols]
y = df_all_features["target_temp_t+1"]

print("Antall features:", len(feature_cols))
X.head()


Antall features: 26


Unnamed: 0,temp_lag_1,nedbor_lag_1,vind_lag_1,temp_lag_2,nedbor_lag_2,vind_lag_2,temp_lag_3,nedbor_lag_3,vind_lag_3,temp_lag_4,...,nedbor_lag_6,vind_lag_6,temp_lag_7,nedbor_lag_7,vind_lag_7,month,dayofyear,latitude,longitude,kommune_encoded
58450,-0.9,0.3,4.8,-3.0,0.0,6.9,-3.5,0.0,7.5,-4.7,...,13.5,4.8,-1.9,4.7,13.9,1,8,69.3073,16.1312,0
58451,0.8,2.2,3.6,-0.9,0.3,4.8,-3.0,0.0,6.9,-3.5,...,0.0,7.0,-2.9,13.5,4.8,1,9,69.3073,16.1312,0
58452,0.9,0.2,3.8,0.8,2.2,3.6,-0.9,0.3,4.8,-3.0,...,0.0,6.9,-5.1,0.0,7.0,1,10,69.3073,16.1312,0
58453,-1.1,0.0,5.2,0.9,0.2,3.8,0.8,2.2,3.6,-0.9,...,0.0,7.5,-4.7,0.0,6.9,1,11,69.3073,16.1312,0
58454,-1.3,0.0,5.0,-1.1,0.0,5.2,0.9,0.2,3.8,0.8,...,0.0,6.9,-3.5,0.0,7.5,1,12,69.3073,16.1312,0


In [17]:
# Del opp i trenings- og testdata (80% trening, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Tren en Random Forest-regresjonsmodell (bruker 10 trær for rask kjøring)
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    n_jobs=-1,       # Bruk alle CPU-kjerner
    verbose=1,       # Viser fremgang for hvert tre
    random_state=42
)

model.fit(X_train, y_train)

# Lag prediksjoner og evaluer modellen
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 Modellens nøyaktighet:")
print(f"- MAE (snitt feil): {mae:.2f} °C")
print(f"- R² (forklart variasjon): {r2:.2f}")


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   45.2s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s


📊 Modellens nøyaktighet:
- MAE (snitt feil): 2.58 °C
- R² (forklart variasjon): 0.83


[Parallel(n_jobs=24)]: Done 300 out of 300 | elapsed:    0.1s finished
