In [32]:
# Uvoz potrebnih knjižnic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [17]:
# Load the dataset from the CSV file, parsing the `date` column as datetimes
df = pd.read_csv(
    'mbajk_dataset.csv',
    parse_dates=['date'],
)

In [18]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,temperature,relative_humidity,dew_point,apparent_temperature,precipitation_probability,rain,surface_pressure,bike_stands,available_bike_stands
0,0,2023-06-25 19:07:30+00:00,25.1,45,12.4,24.7,0.0,0.0,984.3,22,8
1,1,2023-06-25 20:25:41+00:00,24.2,43,10.9,23.3,,0.0,984.5,22,8
2,2,2023-06-25 20:12:20+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,12
3,3,2023-06-25 19:33:48+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,7
4,4,2023-06-25 19:55:29+00:00,24.2,43,10.9,23.3,,0.0,984.5,22,8


In [19]:
# Show the dtype of every column (confirms that `date` was parsed as a datetime)
df.dtypes

Unnamed: 0                                 int64
date                         datetime64[ns, UTC]
temperature                              float64
relative_humidity                          int64
dew_point                                float64
apparent_temperature                     float64
precipitation_probability                float64
rain                                     float64
surface_pressure                         float64
bike_stands                                int64
available_bike_stands                      int64
dtype: object

In [21]:
# Order the rows chronologically by their timestamp, then preview the result
df = df.sort_values('date', ascending=True)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,temperature,relative_humidity,dew_point,apparent_temperature,precipitation_probability,rain,surface_pressure,bike_stands,available_bike_stands
0,0,2023-06-25 19:07:30+00:00,25.1,45,12.4,24.7,0.0,0.0,984.3,22,8
3,3,2023-06-25 19:33:48+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,7
5,5,2023-06-25 19:45:44+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,6
4,4,2023-06-25 19:55:29+00:00,24.2,43,10.9,23.3,,0.0,984.5,22,8
6,6,2023-06-25 20:02:16+00:00,24.2,43,10.9,23.3,0.0,,984.5,22,12


In [22]:
# Count the missing entries in every column before any imputation
missing_values = df.isna().sum()
print(missing_values)


Unnamed: 0                      0
date                            0
temperature                  1869
relative_humidity               0
dew_point                       0
apparent_temperature            0
precipitation_probability    1308
rain                         2804
surface_pressure                0
bike_stands                     0
available_bike_stands           0
dtype: int64


In [23]:
# Print the three columns that contain missing values so the NaNs can be inspected.
# A single print call is kept so the Series are emitted space-separated, as before.
columns_with_gaps = ('precipitation_probability', 'rain', 'temperature')
print(*(df[column] for column in columns_with_gaps))

0        0.0
3        0.0
5        0.0
4        NaN
6        0.0
        ... 
18686    0.0
18687    0.0
18688    NaN
18690    0.0
18691    0.0
Name: precipitation_probability, Length: 18692, dtype: float64 0        0.0
3        0.0
5        0.0
4        0.0
6        NaN
        ... 
18686    0.0
18687    0.0
18688    0.0
18690    0.0
18691    NaN
Name: rain, Length: 18692, dtype: float64 0        25.1
3        24.2
5        24.2
4        24.2
6        24.2
         ... 
18686    26.1
18687    26.1
18688    26.1
18690    26.5
18691    26.5
Name: temperature, Length: 18692, dtype: float64


In [30]:
# Remove the `Unnamed: 0` column (apparently a row index that was saved into the CSV).
# `errors='ignore'` keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,date,temperature,relative_humidity,dew_point,apparent_temperature,precipitation_probability,rain,surface_pressure,bike_stands,available_bike_stands
0,2023-06-25 19:07:30+00:00,25.1,45,12.4,24.7,0.0,0.0,984.3,22,8
3,2023-06-25 19:33:48+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,7
5,2023-06-25 19:45:44+00:00,24.2,43,10.9,23.3,0.0,0.0,984.5,22,6
4,2023-06-25 19:55:29+00:00,24.2,43,10.9,23.3,,0.0,984.5,22,8
6,2023-06-25 20:02:16+00:00,24.2,43,10.9,23.3,0.0,,984.5,22,12


In [33]:
# Data preparation: predictors with no missing values, and the column to impute.
# NOTE(review): `available_bike_stands` is used as a predictor here — confirm this is
# intended if it is also the quantity a later model is meant to predict.
feature_columns = [
    'relative_humidity',
    'dew_point',
    'apparent_temperature',
    'surface_pressure',
    'bike_stands',
    'available_bike_stands',
]
features = df[feature_columns]
target = df['precipitation_probability']

# Rows with a known target are used for fitting; rows without one get imputed later
target_known = target.notna()
X_train = features[target_known]
y_train = target[target_known]
X_missing = features[~target_known]

# Fit the imputation model (fixed seed for reproducible results)
rf_precipitation = RandomForestRegressor(n_estimators=100, random_state=42)
rf_precipitation.fit(X_train, y_train)

In [34]:
# Predict the missing precipitation probabilities and write them back into `df`.
#
# The rows are addressed through `X_missing.index` rather than by recomputing
# `target.isna()`: `X_missing` was built in an earlier cell, and if `target` reflects
# the fill on a re-run the recomputed mask would no longer match the number of
# predictions. This relies on `df` having unique index labels (it keeps the default
# integer index assigned by `read_csv`).
if len(X_missing):
    predicted_precipitation = rf_precipitation.predict(X_missing)
    df.loc[X_missing.index, 'precipitation_probability'] = predicted_precipitation
else:
    # Nothing to impute; `predict` would raise on an input with zero samples
    predicted_precipitation = np.empty(0)

In [35]:
target_rain = df['rain']

# Predictors: the base feature set plus the precipitation probability column,
# which the preceding cells have already filled in
rain_predictors = features.columns.tolist() + ['precipitation_probability']

# Compute the mask once so the training rows, the rows to predict and the rows
# written back are guaranteed to agree
rain_missing = target_rain.isna()
X_train_rain = df.loc[~rain_missing, rain_predictors]
y_train_rain = target_rain[~rain_missing]
X_missing_rain = df.loc[rain_missing, rain_predictors]

# Fit the imputation model (fixed seed for reproducible results)
rf_rain = RandomForestRegressor(n_estimators=100, random_state=42)
rf_rain.fit(X_train_rain, y_train_rain)

# Predict and fill only when there is something to fill: `predict` raises on an
# input with zero samples, which is what happens when this cell is re-run after
# the gaps have already been imputed
if rain_missing.any():
    predicted_rain = rf_rain.predict(X_missing_rain)
    df.loc[rain_missing, 'rain'] = predicted_rain
else:
    predicted_rain = np.empty(0)


In [36]:
target_temperature = df['temperature']

# Predictors: the base feature set plus the two columns imputed by the preceding cells
temperature_predictors = features.columns.tolist() + ['precipitation_probability', 'rain']

# Compute the mask once so the training rows, the rows to predict and the rows
# written back are guaranteed to agree
temperature_missing = target_temperature.isna()
X_train_temp = df.loc[~temperature_missing, temperature_predictors]
y_train_temp = target_temperature[~temperature_missing]
X_missing_temp = df.loc[temperature_missing, temperature_predictors]

# Fit the imputation model (fixed seed for reproducible results)
rf_temperature = RandomForestRegressor(n_estimators=100, random_state=42)
rf_temperature.fit(X_train_temp, y_train_temp)

# Predict and fill only when there is something to fill: `predict` raises on an
# input with zero samples, which is what happens when this cell is re-run after
# the gaps have already been imputed
if temperature_missing.any():
    predicted_temperature = rf_temperature.predict(X_missing_temp)
    df.loc[temperature_missing, 'temperature'] = predicted_temperature
else:
    predicted_temperature = np.empty(0)


In [37]:
# Re-count the missing entries per column to check what the imputation left behind
missing_values = df.isna().sum()
print(missing_values)

date                         0
temperature                  0
relative_humidity            0
dew_point                    0
apparent_temperature         0
precipitation_probability    0
rain                         0
surface_pressure             0
bike_stands                  0
available_bike_stands        0
dtype: int64
