In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

from data_processing_functions import *

## 1. Read Data

In [31]:
data_train, data_test = get_train_and_test_data('./data', amount_of_days=3, wind_border=8, convert_str_variable_flag=True)
x_data_train, y_data_wind_train, y_data_temperature_train = data_train
x_data_test, y_data_wind_test, y_data_temperature_test = data_test

## 2. Wind classification

In [18]:
clf = MLPClassifier(alpha=1e-3, hidden_layer_sizes=(500), random_state=1)

In [20]:
clf.fit(x_data_train, y_data_wind_train)

MLPClassifier(alpha=0.001, hidden_layer_sizes=500, random_state=1)

In [21]:
accuracy_score(clf.predict(x_data_test), y_data_wind_test)

0.8554036251306046

In [11]:
clf = MLPClassifier(alpha=1e-5, solver='adam', hidden_layer_sizes=(100, 20), random_state=1, n_iter_no_change= 100)
clf.fit(x_data_train, y_data_wind_train)
predicted = clf.predict(x_data_test)
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test, predicted)))

Accuracy: 0.85538
Resampled balanced accuracy: 0.50000




In [12]:
np.unique(predicted)

array([0], dtype=uint8)

## 3. Temperature prediction

In [23]:
regr = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)

In [24]:
regr.fit(x_data_train, y_data_temperature_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPRegressor(alpha=1e-05, hidden_layer_sizes=(100, 20), random_state=1,
             solver='lbfgs')

In [25]:
err = abs(regr.predict(x_data_test) - y_data_temperature_test)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 5.05 RMSE: 6.92


In [36]:
regr = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr.fit(x_data_train, y_data_temperature_train)
err = abs(regr.predict(x_data_test) - y_data_temperature_test)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 5.16 RMSE: 6.92


# Study issue IV

### Zastąpienie null średnią

In [7]:
data_train, data_test = get_train_and_test_data('./data', amount_of_days=3, wind_border=8, convert_str_variable_flag=True, remove_nulls = 2)
x_data_train, y_data_wind_train, y_data_temperature_train = data_train
x_data_test, y_data_wind_test, y_data_temperature_test = data_test

  df[df["datetime"].str.startswith(date)] = df_tmp.fillna(df_tmp.mean())


In [10]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf.fit(x_data_train, y_data_wind_train)
predicted = clf.predict(x_data_test)
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test, predicted)))

Accuracy: 0.73477
Resampled balanced accuracy: 0.50000


In [11]:
regr = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr.fit(x_data_train, y_data_temperature_train)
err = abs(regr.predict(x_data_test) - y_data_temperature_test)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 5.05 RMSE: 6.64


### Zastąpienie null wartościami powyżej i poniżej

In [11]:
data_train, data_test = get_train_and_test_data('./data', amount_of_days=3, wind_border=8, convert_str_variable_flag=True, remove_nulls = 3)
x_data_train, y_data_wind_train, y_data_temperature_train = data_train
x_data_test, y_data_wind_test, y_data_temperature_test = data_test

In [12]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf.fit(x_data_train, y_data_wind_train)
predicted = clf.predict(x_data_test)
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test, predicted)))

Accuracy: 0.73942
Resampled balanced accuracy: 0.50000


In [13]:
regr = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr.fit(x_data_train, y_data_temperature_train)
err = abs(regr.predict(x_data_test) - y_data_temperature_test)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 5.24 RMSE: 6.77


# Study issue V

In [4]:
data_train, dest_test = get_train_and_test_data_with_separate_day('./data', amount_of_days=4, wind_border=8, convert_str_variable_flag=True)
x_data_train, y_data_wind_train1, y_data_temperature_train1, y_data_wind_train2, y_data_temperature_train2 = data_train
x_data_test, y_data_wind_test1, y_data_temperature_test1, y_data_wind_test2, y_data_temperature_test2 = data_train

In [28]:
clf1 = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf2 = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf1.fit(x_data_train, y_data_wind_train1)
predicted1 = clf1.predict(x_data_train)
clf2.fit(predicted1.reshape(-1, 1), y_data_wind_train2)

predicted2 = clf1.predict(x_data_test)
predicted = clf2.predict(predicted2.reshape(-1, 1))
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test2, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test2, predicted)))

Accuracy: 0.85538
Resampled balanced accuracy: 0.50000


In [27]:
regr1 = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr2 = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr1.fit(x_data_train, y_data_temperature_train1)
prediced1 = regr1.predict(x_data_train)
regr2.fit(prediced1.reshape(-1, 1), y_data_temperature_train2)

prediced2 = regr1.predict(x_data_test)
err = abs(regr2.predict(prediced2.reshape(-1, 1)) - y_data_temperature_test2)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 3.64 RMSE: 5.54


Można jeszcze sprawdzić wyniki jakby uczyć go na podstawie prawdziwych danych a nie predykcji

# Study issue VI

In [2]:
# Dlugo liczy
data_by_seasons = get_train_and_test_data_by_season('./data', amount_of_days=4, wind_border=8, convert_str_variable_flag=True)

In [5]:
data_train0, data_train0 = data_by_seasons[0]
x_data_train0, y_data_wind_train0, y_data_temperature_train0 = data_train0
x_data_test0, y_data_wind_test0, y_data_temperature_test0 = data_train0

data_train1, data_train1 = data_by_seasons[1]
x_data_train1, y_data_wind_train1, y_data_temperature_train1 = data_train1
x_data_test1, y_data_wind_test1, y_data_temperature_test1 = data_train1

data_train2, data_train2 = data_by_seasons[2]
x_data_train2, y_data_wind_train2, y_data_temperature_train2 = data_train2
x_data_test2, y_data_wind_test2, y_data_temperature_test2 = data_train2

data_train3, data_train3 = data_by_seasons[3]
x_data_train3, y_data_wind_train3, y_data_temperature_train3 = data_train3
x_data_test3, y_data_wind_test3, y_data_temperature_test3 = data_train3

### Spring

In [11]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf.fit(x_data_train0, y_data_wind_train0)
predicted = clf.predict(x_data_test0)
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test0, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test0, predicted)))

Accuracy: 0.33882
Resampled balanced accuracy: 0.53363


In [12]:
regr = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr.fit(x_data_train0, y_data_temperature_train0)
err = abs(regr.predict(x_data_test0) - y_data_temperature_test0)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 4.2 RMSE: 6.03


### Summer

In [13]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf.fit(x_data_train1, y_data_wind_train1)
predicted = clf.predict(x_data_test1)
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test1, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test1, predicted)))

Accuracy: 0.78888
Resampled balanced accuracy: 0.59835


In [14]:
regr = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr.fit(x_data_train1, y_data_temperature_train1)
err = abs(regr.predict(x_data_test1) - y_data_temperature_test1)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 5.15 RMSE: 7.05


### Fall

In [16]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf.fit(x_data_train2, y_data_wind_train2)
predicted = clf.predict(x_data_test2)
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test2, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test2, predicted)))

Accuracy: 0.77613
Resampled balanced accuracy: 0.50000


In [17]:
regr = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr.fit(x_data_train2, y_data_temperature_train2)
err = abs(regr.predict(x_data_test2) - y_data_temperature_test2)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 4.24 RMSE: 5.83




### Winter

In [18]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100, 20), random_state=1)
clf.fit(x_data_train3, y_data_wind_train3)
predicted = clf.predict(x_data_test3)
print("Accuracy: {:.5f}".format(accuracy_score(y_data_wind_test3, predicted)))
print("Resampled balanced accuracy: {:.5f}".format(balanced_accuracy_score(y_data_wind_test3, predicted)))

Accuracy: 0.73451
Resampled balanced accuracy: 0.52134


In [19]:
regr = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(100, 20), random_state=1)
regr.fit(x_data_train3, y_data_temperature_train3)
err = abs(regr.predict(x_data_test3) - y_data_temperature_test3)
MAE = round(np.mean(err),2)
RMSE = round(np.sqrt(((err)**2).mean()),2)
print("MAE:", MAE, "RMSE:", RMSE)

MAE: 3.78 RMSE: 4.84


# Study issue VII


In [None]:
data_train, dest_test = get_train_and_test_data('./data', amount_of_days=3, wind_border=8, convert_str_variable_flag=True)
x_data_train, y_data_wind_train, y_data_temperature_train = data_train
x_data_test, y_data_wind_test, y_data_temperature_test = data_train

In [20]:
# mlp for combined regression and classification predictions on the abalone dataset
from numpy import unique
from numpy import argmax
from pandas import read_csv
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import plot_model

In [32]:
# https://machinelearningmastery.com/neural-network-models-for-combined-classification-and-regression/

# x_data_train, y_data_wind_train, y_data_temperature_train = data_train

visible = Input(shape=(x_data_train.shape[1],))
hidden1 = Dense(20, activation='relu', kernel_initializer='he_normal')(visible)
hidden2 = Dense(10, activation='relu', kernel_initializer='he_normal')(hidden1)
# regression output
out_reg = Dense(1, activation='linear')(hidden2)
# classification output
out_clas = Dense(len(unique(y_data_wind_train)), activation='softmax')(hidden2)
# define model
model = Model(inputs=visible, outputs=[out_reg, out_clas])
# compile the keras model
model.compile(loss=['mse','sparse_categorical_crossentropy'], optimizer='adam')

In [33]:
# fit the keras model on the dataset
model.fit(x_data_train, [y_data_temperature_train, y_data_wind_train], epochs=15, batch_size=32, verbose=2)

Epoch 1/15
1376/1376 - 1s - loss: 1295.9841 - dense_6_loss: 1283.7991 - dense_7_loss: 12.1849
Epoch 2/15
1376/1376 - 0s - loss: 64.6328 - dense_6_loss: 64.2048 - dense_7_loss: 0.4280
Epoch 3/15
1376/1376 - 0s - loss: 59.1674 - dense_6_loss: 58.7390 - dense_7_loss: 0.4284
Epoch 4/15
1376/1376 - 0s - loss: 57.6984 - dense_6_loss: 57.2666 - dense_7_loss: 0.4319
Epoch 5/15
1376/1376 - 0s - loss: 56.2536 - dense_6_loss: 55.8244 - dense_7_loss: 0.4292
Epoch 6/15
1376/1376 - 0s - loss: 55.4888 - dense_6_loss: 55.0570 - dense_7_loss: 0.4318
Epoch 7/15
1376/1376 - 0s - loss: 55.8640 - dense_6_loss: 55.4351 - dense_7_loss: 0.4288
Epoch 8/15
1376/1376 - 0s - loss: 54.6507 - dense_6_loss: 54.2221 - dense_7_loss: 0.4287
Epoch 9/15
1376/1376 - 0s - loss: 54.3652 - dense_6_loss: 53.9368 - dense_7_loss: 0.4284
Epoch 10/15
1376/1376 - 0s - loss: 53.4865 - dense_6_loss: 53.0545 - dense_7_loss: 0.4320
Epoch 11/15
1376/1376 - 0s - loss: 53.4612 - dense_6_loss: 53.0262 - dense_7_loss: 0.4349
Epoch 12/15
13

<keras.callbacks.History at 0x24a12c6c3a0>

# Test

In [123]:
data_train = dict()
data_test = dict()
data_path = './data'
file_names = ["humidity", "pressure", "temperature", "weather_description", "wind_direction", "wind_speed"]

for file_name in file_names:
    data_train[file_name] = pd.read_csv(f'{data_path}/train/{file_name}_train.csv', header=0, sep=';') 
    data_test[file_name] = pd.read_csv(f'{data_path}/test/{file_name}_test.csv', header=0, sep=';') 

In [105]:
def remove_null_values(data_dict):
  for i in data_dict:
    data_dict[i] = data_dict[i].dropna()
  return data_dict

In [97]:
def fill_null_values(data_dict):
  for i in data_dict:
    data_dict[i] = data_dict[i].ffill()
    data_dict[i] = data_dict[i].bfill()
  return data_dict

In [124]:
def fill_null_values_with_mean(df):
    for date in np.unique(df["datetime"].str[:10]):
        df_tmp = df[df["datetime"].str.startswith(date)]
        df[df["datetime"].str.startswith(date)] = df_tmp.fillna(df_tmp.mean())
    df = df.dropna()

def fill_null_values_with_means(data_dict):
    for i in data_dict:
        data_dict[i] = fill_null_values_with_mean(data_dict[i])
    return data_dict

In [126]:
data_train = fill_null_values_with_means(data_train)

  df[df["datetime"].str.startswith(date)] = df_tmp.fillna(df_tmp.mean())


In [117]:
len(df[df.isnull().any(axis=1)])

264

In [119]:
len(df)

36516

In [50]:
df = df.ffill()
df = df.bfill()

In [110]:
df[df.isnull().any(axis=1)]

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,01.10.2012 12:00,,,,,,,,,,...,,,,,,,25.0,,,
25,02.10.2012 13:00,90.0,71.0,64.0,67.0,88.0,73.0,18.0,31.0,47.0,...,59.0,51.0,,68.0,75.0,69.0,29.0,51.0,51.0,50.0
26,02.10.2012 14:00,91.0,71.0,63.0,67.0,88.0,73.0,18.0,32.0,47.0,...,59.0,51.0,,68.0,76.0,69.0,29.0,51.0,51.0,50.0
28,02.10.2012 16:00,76.0,62.0,56.0,76.0,23.0,22.0,23.0,28.0,40.0,...,52.0,,,64.0,80.0,66.0,28.0,51.0,51.0,50.0
29,02.10.2012 17:00,76.0,75.0,94.0,58.0,19.0,15.0,18.0,,,...,43.0,,,64.0,83.0,74.0,36.0,51.0,51.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36405,26.11.2016 09:00,87.0,81.0,96.0,100.0,81.0,19.0,41.0,22.0,63.0,...,70.0,81.0,100.0,100.0,52.0,,,7.0,7.0,
36406,26.11.2016 10:00,87.0,87.0,96.0,93.0,87.0,23.0,47.0,25.0,68.0,...,80.0,81.0,100.0,93.0,52.0,,5.0,6.0,6.0,
36407,26.11.2016 11:00,87.0,93.0,91.0,100.0,81.0,72.0,38.0,22.0,63.0,...,86.0,87.0,100.0,100.0,42.0,,6.0,6.0,6.0,
36408,26.11.2016 12:00,93.0,93.0,66.0,100.0,66.0,61.0,40.0,57.0,73.0,...,100.0,81.0,98.0,93.0,42.0,13.0,7.0,6.0,6.0,


In [84]:
df.columns[df.isnull().any()].tolist()

['Vancouver',
 'San Diego',
 'Phoenix',
 'Denver',
 'Dallas',
 'Minneapolis',
 'Chicago',
 'Nashville',
 'Jacksonville',
 'Miami',
 'Pittsburgh',
 'Montreal']

In [116]:
for date in np.unique(df["datetime"].str[:10]):
    df_tmp = df[df["datetime"].str.startswith(date)]
    df[df["datetime"].str.startswith(date)] = df_tmp.fillna(df_tmp.mean())
df = df.dropna()

  df[df["datetime"].str.startswith(date)] = df_tmp.fillna(df_tmp.mean())


In [80]:
'18.10.2012' in np.unique(df["datetime"].str[:10]) 

True

In [15]:
df = data_train1["humidity"]

In [16]:
df.head(3)

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
1,01.10.2012 13:00,76.0,81.0,88.0,81.0,88.0,82.0,22.0,23.0,50.0,...,71.0,58.0,93.0,68.0,50.0,63.0,22.0,51.0,51.0,50.0
2,01.10.2012 14:00,76.0,80.0,87.0,80.0,88.0,81.0,21.0,23.0,49.0,...,70.0,57.0,91.0,68.0,51.0,62.0,22.0,51.0,51.0,50.0
3,01.10.2012 15:00,76.0,80.0,86.0,80.0,88.0,81.0,21.0,23.0,49.0,...,70.0,57.0,87.0,68.0,51.0,62.0,22.0,51.0,51.0,50.0


In [73]:
np.unique(df["datetime"].str[:10])

array(['01.01.2013', '01.01.2014', '01.01.2015', ..., '31.12.2013',
       '31.12.2014', '31.12.2015'], dtype=object)

In [78]:
df1 = df[df["datetime"].str.startswith('18.10.2012')]
df1[df1.isnull().any(axis=1)]

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
396,18.10.2012 00:00,78.454545,44.0,68.0,50.0,36.0,39.0,23.0,13.0,11.0,...,50.0,50.0,,75.0,88.0,78.0,18.0,62.0,62.0,53.0
397,18.10.2012 01:00,58.0,62.0,68.0,58.0,34.0,47.0,26.0,15.0,13.0,...,54.0,47.0,,86.0,90.0,73.0,53.0,62.0,62.0,53.0
398,18.10.2012 02:00,70.0,66.0,72.0,62.0,49.0,44.0,29.0,18.0,17.0,...,62.0,50.0,,67.45,94.0,73.0,49.0,62.0,62.0,53.0
399,18.10.2012 03:00,76.0,71.0,77.0,70.0,68.0,49.0,36.0,22.0,14.0,...,66.0,54.0,,67.45,89.0,77.0,47.0,62.0,62.0,53.0
400,18.10.2012 04:00,81.0,76.0,72.0,76.0,73.0,68.0,38.0,24.0,14.0,...,76.0,53.0,,67.45,95.0,71.916667,50.0,62.0,62.0,53.0
401,18.10.2012 05:00,76.0,76.0,72.0,76.0,73.0,77.0,46.0,25.0,18.0,...,70.0,57.0,,80.0,96.0,71.916667,50.0,62.0,62.0,53.0
402,18.10.2012 06:00,78.454545,76.0,72.0,81.0,73.0,73.0,49.0,35.0,20.0,...,70.0,65.0,,86.0,70.0,69.0,47.0,54.0,54.0,37.0
403,18.10.2012 07:00,76.0,93.0,72.0,76.0,77.0,93.0,49.0,35.0,20.0,...,70.0,70.0,,86.0,55.0,65.0,35.0,54.0,54.0,37.0
404,18.10.2012 08:00,76.0,93.0,72.0,77.0,82.0,88.0,49.0,40.0,22.0,...,75.0,70.0,,92.0,38.0,61.0,32.0,54.0,54.0,37.0
405,18.10.2012 09:00,81.0,93.0,77.0,86.0,82.0,88.0,33.0,37.0,22.0,...,93.0,70.0,,67.45,19.0,71.916667,30.0,54.0,54.0,37.0


In [19]:
np.unique(df["datetime"].str[:10])

array(['01.01.2013', '01.01.2014', '01.01.2015', ..., '31.12.2013',
       '31.12.2014', '31.12.2015'], dtype=object)

In [20]:
df = df[df["datetime"].str.startswith('01.01.2013')].fillna(df[df["datetime"].str.startswith('01.01.2013')].mean())

  df = df[df["datetime"].str.startswith('01.01.2013')].fillna(df[df["datetime"].str.startswith('01.01.2013')].mean())


In [21]:
df = data_train1["weather_description"]

In [22]:
df

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
1,01.10.2012 13:00,mist,scattered clouds,light rain,sky is clear,mist,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,overcast clouds,sky is clear,sky is clear,sky is clear,haze,sky is clear,sky is clear,sky is clear
2,01.10.2012 14:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,sky is clear,sky is clear,broken clouds,overcast clouds,sky is clear,overcast clouds
3,01.10.2012 15:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds
4,01.10.2012 16:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds
5,01.10.2012 17:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36511,30.11.2016 19:00,fog,moderate rain,mist,light rain,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,mist,mist,fog,mist,sky is clear,scattered clouds,scattered clouds,sky is clear,sky is clear,sky is clear
36512,30.11.2016 20:00,light intensity shower rain,overcast clouds,light rain,sky is clear,sky is clear,sky is clear,few clouds,sky is clear,sky is clear,...,fog,drizzle,fog,light intensity drizzle,sky is clear,sky is clear,scattered clouds,sky is clear,sky is clear,few clouds
36513,30.11.2016 21:00,light intensity shower rain,light rain,light intensity drizzle,sky is clear,sky is clear,sky is clear,few clouds,sky is clear,sky is clear,...,fog,mist,fog,mist,sky is clear,sky is clear,few clouds,moderate rain,moderate rain,sky is clear
36514,30.11.2016 22:00,mist,light rain,mist,light rain,sky is clear,sky is clear,few clouds,sky is clear,sky is clear,...,fog,fog,mist,mist,sky is clear,few clouds,few clouds,moderate rain,moderate rain,few clouds


In [23]:
df.ffill()
df.bfill()

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
1,01.10.2012 13:00,mist,scattered clouds,light rain,sky is clear,mist,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,overcast clouds,sky is clear,sky is clear,sky is clear,haze,sky is clear,sky is clear,sky is clear
2,01.10.2012 14:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,sky is clear,sky is clear,broken clouds,overcast clouds,sky is clear,overcast clouds
3,01.10.2012 15:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds
4,01.10.2012 16:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds
5,01.10.2012 17:00,broken clouds,scattered clouds,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,broken clouds,few clouds,sky is clear,few clouds,overcast clouds,sky is clear,broken clouds,overcast clouds,overcast clouds,overcast clouds
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36511,30.11.2016 19:00,fog,moderate rain,mist,light rain,sky is clear,sky is clear,sky is clear,sky is clear,sky is clear,...,mist,mist,fog,mist,sky is clear,scattered clouds,scattered clouds,sky is clear,sky is clear,sky is clear
36512,30.11.2016 20:00,light intensity shower rain,overcast clouds,light rain,sky is clear,sky is clear,sky is clear,few clouds,sky is clear,sky is clear,...,fog,drizzle,fog,light intensity drizzle,sky is clear,sky is clear,scattered clouds,sky is clear,sky is clear,few clouds
36513,30.11.2016 21:00,light intensity shower rain,light rain,light intensity drizzle,sky is clear,sky is clear,sky is clear,few clouds,sky is clear,sky is clear,...,fog,mist,fog,mist,sky is clear,sky is clear,few clouds,moderate rain,moderate rain,sky is clear
36514,30.11.2016 22:00,mist,light rain,mist,light rain,sky is clear,sky is clear,few clouds,sky is clear,sky is clear,...,fog,fog,mist,mist,sky is clear,few clouds,few clouds,moderate rain,moderate rain,few clouds


In [24]:
for date in np.unique(df["datetime"].str[:10]):
    df = df[df["datetime"].str.startswith(date)].fillna(df[df["datetime"].str.startswith('01.01.2013')].mean())

  df = df[df["datetime"].str.startswith(date)].fillna(df[df["datetime"].str.startswith('01.01.2013')].mean())


In [25]:
df[df.isnull().any(axis=1)]

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem


In [31]:
def fill_null_values_with_mean(data_dict):
  for i in data_dict:
    for date in np.unique(data_dict[i]["datetime"].str[:10]):
        data_dict[i][data_dict[i]["datetime"].str.startswith(date)] = data_dict[i][data_dict[i]["datetime"].str.startswith(date)].fillna(data_dict[i][data_dict[i]["datetime"].str.startswith(date)].mean())
  return data_dict

In [29]:
df = data_train["weather_description"]

In [40]:
data_train["humidity"].head(3)

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem


In [38]:
data_dict = data_train["humidity"]

In [39]:
len(data_dict)

0

In [35]:
for date in np.unique(data_dict["datetime"].str[:10]):
    data_dict[data_dict["datetime"].str.startswith(date)] = data_dict[data_dict["datetime"].str.startswith(date)].fillna(data_dict[data_dict["datetime"].str.startswith(date)].mean())

  data_dict[data_dict["datetime"].str.startswith(date)] = data_dict[data_dict["datetime"].str.startswith(date)].fillna(data_dict[data_dict["datetime"].str.startswith(date)].mean())


In [36]:
len(data_dict)

36516

In [37]:
data_dict[data_dict.isnull().any(axis=1)]

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,01.10.2012 12:00,,,,,,,,,,...,,,,,,,haze,,,
