In [1]:
import numpy as np
import pandas as pd

from data_processing import read_data

In [2]:
data_path = './data'
data_train, data_test = read_data(data_path)

humidity
pressure
temperature
weather_description
wind_direction
wind_speed


In [3]:
data_train['temperature'].head(3)


Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
1,01.10.2012 13:00,284.63,282.08,289.48,281.8,291.87,291.53,293.41,296.6,285.12,...,285.63,288.22,285.83,287.17,307.59,305.47,310.58,304.4,304.4,303.5
2,01.10.2012 14:00,284.629041,282.083252,289.474993,281.797217,291.868186,291.533501,293.403141,296.608509,285.154558,...,285.663208,288.247676,285.83465,287.186092,307.59,304.31,310.495769,304.4,304.4,303.5
3,01.10.2012 15:00,284.626998,282.091866,289.460618,281.789833,291.862844,291.543355,293.392177,296.631487,285.233952,...,285.756824,288.32694,285.84779,287.231672,307.391513,304.281841,310.411538,304.4,304.4,303.5


In [4]:
def combine_data_csvs(data_dict):
  col_names = ['humidity', 'pressure', 'temperature', 'weather_description', 'wind_direction', 'wind_speed']
  i = 0
  data_combined = pd.DataFrame()

  for key, df in data_dict.items():
    #assert order
    assert key.startswith(col_names[i%6])

    #transpose and melt
    df =  df.T.reset_index()
    new_header = df.iloc[0] 
    df = df[1:] 
    df.columns = new_header 
    df = df.melt(id_vars="datetime", 
        var_name="date", 
        value_name=col_names[i%6])

    #new df
    if i == 0:
      data_combined = pd.DataFrame(df.iloc[:,:-1])

    data_combined[col_names[i%6]] = df.iloc[:,-1]

    i+=1

  #rename columns
  data_combined.rename(columns={"datetime": "city"}, inplace=True)
  return data_combined

In [6]:
data_train

{'humidity':                datetime  Vancouver  Portland  San Francisco  Seattle  \
 1      01.10.2012 13:00       76.0      81.0           88.0     81.0   
 2      01.10.2012 14:00       76.0      80.0           87.0     80.0   
 3      01.10.2012 15:00       76.0      80.0           86.0     80.0   
 4      01.10.2012 16:00       77.0      80.0           85.0     79.0   
 5      01.10.2012 17:00       78.0      79.0           84.0     79.0   
 ...                 ...        ...       ...            ...      ...   
 36511  30.11.2016 19:00       93.0      81.0           72.0     81.0   
 36512  30.11.2016 20:00       87.0      81.0           67.0     81.0   
 36513  30.11.2016 21:00       93.0      81.0           87.0     71.0   
 36514  30.11.2016 22:00       81.0      81.0           81.0     70.0   
 36515  30.11.2016 23:00       81.0      71.0           76.0     71.0   
 
        Los Angeles  San Diego  Las Vegas  Phoenix  Albuquerque  ...  \
 1             88.0       82.0       2

In [8]:
data_train1 = combine_data_csvs(data_train)

In [7]:
def aggregate_by_whole_day(data_combined):
  data_combined["date"] = pd.to_datetime(data_combined["date"]).dt.strftime("%Y/%m/%d")
  data_combined["wind_speed_max"] = pd.to_numeric(data_combined["wind_speed"],downcast='unsigned')
  data_combined["wind_speed_mean"] = pd.to_numeric(data_combined["wind_speed"],downcast='unsigned')
  data_combined["humidity"] = pd.to_numeric(data_combined["humidity"],downcast='unsigned')
  data_combined["temperature"] = pd.to_numeric(data_combined["temperature"],downcast='float')
  data_combined["pressure"] = pd.to_numeric(data_combined["pressure"],downcast='unsigned')
  data_combined["wind_direction"] = pd.to_numeric(data_combined["wind_direction"],downcast='unsigned')
  data_combined["weather_description"] = data_combined["weather_description"].astype("category")
  data_aggregated = data_combined.groupby(by=["city", "date"], as_index=False).agg({'weather_description': lambda x: pd.Series.mode(x)[0],
                                                      'wind_speed_max': 'max',
                                                      'wind_speed_mean': 'mean',
                                                      "wind_direction": 'mean',
                                                      "temperature": 'mean',
                                                      "humidity": 'mean',
                                                      "pressure": 'mean'})

  data_aggregated = data_aggregated.sort_values(by=["city", "date"])

  return data_aggregated

In [9]:
aggregate_by_whole_day(data_train1)

Unnamed: 0,city,date,weather_description,wind_speed_max,wind_speed_mean,wind_direction,temperature,humidity,pressure
0,Albuquerque,2012/01/10,sky is clear,4,4.000000,360.000000,285.476196,48.909091,983.545455
1,Albuquerque,2012/01/11,sky is clear,3,3.000000,230.000000,286.820007,16.000000,1023.000000
2,Albuquerque,2012/01/12,sky is clear,2,0.500000,132.000000,278.119995,41.500000,818.250000
3,Albuquerque,2012/02/10,sky is clear,4,4.000000,360.000000,286.504242,45.400000,918.133333
4,Albuquerque,2012/02/11,sky is clear,0,0.000000,209.500000,287.464996,17.500000,1021.000000
...,...,...,...,...,...,...,...,...,...
51979,Vancouver,2016/12/07,mist,7,1.913043,164.739130,286.370422,78.304348,1026.869565
51980,Vancouver,2016/12/08,sky is clear,2,0.923077,131.153846,285.516937,66.923077,1016.000000
51981,Vancouver,2016/12/09,mist,6,2.541667,154.250000,283.194702,82.291667,1010.125000
51982,Vancouver,2016/12/10,few clouds,11,6.041667,187.541667,278.204315,96.041667,1019.708333


In [11]:
#add diff to check if days are after each other
def flatten_into_3days_timeseries(data_aggregated):
  data_aggregated.date = pd.to_datetime(data_aggregated.date, yearfirst=True)
  data_aggregated["date_diff"] = data_aggregated.date.diff()

  df1 = data_aggregated[:-2].reset_index(drop=True)
  df2 = data_aggregated[1:-1].reset_index(drop=True)
  df3 = data_aggregated[2:].reset_index(drop=True)

  y_temp = data_aggregated[["temperature","date", "city"]]
  y_temp = y_temp[4:len(df1)].reset_index(drop=True)

  y_wind = data_aggregated[["wind_speed_max","date","city"]]

  y_wind = y_wind[4:len(df1)].reset_index(drop=True)

  data_flattened = pd.concat([df1, df2, df3, y_temp, y_wind], axis=1)

  def check_date(diffs, dates):
    for d in diffs[1:]:
      if d != pd.Timedelta("1 days"):
        return True
    if dates[-1]-dates[-3] != pd.Timedelta("2 days") or dates[-2]-dates[-3] != pd.Timedelta("2 days"):
      return True
    return False

  data_flattened["to_del"] = data_flattened.apply(lambda row: len(set(row["city"])) != 1 or check_date(row["date_diff"], row["date"]), axis=1)
  data_flattened = data_flattened[~data_flattened["to_del"]].reset_index(drop=True)

  cities = data_flattened['city'].iloc[:, 0]
  col_names = ['city', 'date', 'date_diff', 'to_del']
  data_flattened = data_flattened.drop(col_names, axis=1)

  frames = [cities, data_flattened]
  data_flattened = pd.concat(frames, axis=1)
  data_flattened.columns = [
                              'city',
                              'weather_description_day1',
                              'wind_speed_max_day1',
                              'wind_speed_mean_day1',
                              'wind_direction_day1',
                              'temperature_day1',
                              'humidity_day1',
                              'pressure_day1',
                              'weather_description_day2',
                              'wind_speed_max_day2',
                              'wind_speed_mean_day2',
                              'wind_direction_day2',
                              'temperature_day2',
                              'humidity_day2',
                              'pressure_day2',
                              'weather_description_day3',
                              'wind_speed_max_day3',
                              'wind_speed_mean_day3',
                              'wind_direction_day3',
                              'temperature_day3',
                              'humidity_day3',
                              'pressure_day3',
                              'y_temperature',
                              'y_wind_speed']

  return data_flattened

In [12]:
def flatten_into_4days_timeseries(data_aggregated):
  data_aggregated.date = pd.to_datetime(data_aggregated.date, yearfirst=True)
  data_aggregated["date_diff"] = data_aggregated.date.diff()

  df1 = data_aggregated[:-3].reset_index(drop=True)
  df2 = data_aggregated[1:-2].reset_index(drop=True)
  df3 = data_aggregated[2:-1].reset_index(drop=True)
  df4 = data_aggregated[3:].reset_index(drop=True)

  y_temp = data_aggregated[["temperature","date", "city"]]
  y_temp = y_temp[4:len(df1)].reset_index(drop=True)

  y_wind = data_aggregated[["wind_speed_max","date","city"]]

  y_wind = y_wind[4:len(df1)].reset_index(drop=True)

  data_flattened = pd.concat([df1, df2, df3, df4, y_temp, y_wind], axis=1)

  def check_date(diffs, dates):
    for d in diffs[1:]:
      if d != pd.Timedelta("1 days"):
        return True
    if dates[-1]-dates[-4] != pd.Timedelta("2 days") or dates[-2]-dates[-4] != pd.Timedelta("2 days"):
      return True
    return False

  data_flattened["to_del"] = data_flattened.apply(lambda row: len(set(row["city"])) != 1 or check_date(row["date_diff"], row["date"]), axis=1)
  data_flattened = data_flattened[~data_flattened["to_del"]].reset_index(drop=True)

  cities = data_flattened['city'].iloc[:, 0]
  col_names = ['city', 'date', 'date_diff', 'to_del']
  data_flattened = data_flattened.drop(col_names, axis=1)

  frames = [cities, data_flattened]
  data_flattened = pd.concat(frames, axis=1)
  data_flattened.columns = [
                              'city',
                              'weather_description_day1',
                              'wind_speed_max_day1',
                              'wind_speed_mean_day1',
                              'wind_direction_day1',
                              'temperature_day1',
                              'humidity_day1',
                              'pressure_day1',
                              'weather_description_day2',
                              'wind_speed_max_day2',
                              'wind_speed_mean_day2',
                              'wind_direction_day2',
                              'temperature_day2',
                              'humidity_day2',
                              'pressure_day2',
                              'weather_description_day3',
                              'wind_speed_max_day3',
                              'wind_speed_mean_day3',
                              'wind_direction_day3',
                              'temperature_day3',
                              'humidity_day3',
                              'pressure_day3',
                              'y_weather_description_day4',
                              'y_wind_speed_max_day4',
                              'y_wind_speed_mean_day4',
                              'y_wind_direction_day4',
                              'y_temperature_day4',
                              'y_humidity_day4',
                              'y_pressure_day4',
                              'y_temperature',
                              'y_wind_speed']

  return data_flattened

In [13]:
def categorize_one_hot_flattend_data(flattened_data):
  bins = [-np.inf, 8, np.inf]
  names = ['below_8', 'above_8']

  flattened_data['y_wind_speed'] = pd.cut(flattened_data['y_wind_speed'], bins, labels=names, right=False)
  flattened_data = pd.get_dummies(data=flattened_data, columns=["y_wind_speed"],drop_first=True)
  return flattened_data

In [14]:
#prepare x and y
def get_x_and_ys(categorised_data):
  x = categorised_data.drop(['y_temperature', 'y_wind_speed_above_8'],axis=1)
  y_wind = categorised_data[ 'y_wind_speed_above_8']
  y_temperature = categorised_data['y_temperature']
  return x, y_wind, y_temperature

In [16]:
def get_train_data(data_train):
  data_train = combine_data_csvs(data_train)
  data_train_agg = aggregate_by_whole_day(data_train)
  data_train_flattened = flatten_into_3days_timeseries(data_train_agg)
  data_train_cat = categorize_one_hot_flattend_data(data_train_flattened)
  x_data_train, y_data_wind_train, y_data_temperature_train = get_x_and_ys(data_train_cat)
  
  return x_data_train, y_data_wind_train, y_data_temperature_train

In [17]:
x_data_train, y_data_wind_train, y_data_temperature_train = get_train_data(data_train)

In [None]:
def convert_city_name(data):
    names = np.unique(data['city'])
    rename_dict = dict()
    i = 0
    for name in names:
        rename_dict[name] = i
        i+=1

In [18]:
x_data_train

Unnamed: 0,city,weather_description_day1,wind_speed_max_day1,wind_speed_mean_day1,wind_direction_day1,temperature_day1,humidity_day1,pressure_day1,weather_description_day2,wind_speed_max_day2,...,temperature_day2,humidity_day2,pressure_day2,weather_description_day3,wind_speed_max_day3,wind_speed_mean_day3,wind_direction_day3,temperature_day3,humidity_day3,pressure_day3
0,Albuquerque,sky is clear,0,0.000000,0.000000,280.174988,26.000000,1018.500000,sky is clear,2,...,287.573334,28.500000,1015.000000,sky is clear,3,1.500000,144.500000,287.789978,22.500000,1016.000000
1,Albuquerque,sky is clear,2,0.500000,65.000000,287.573334,28.500000,1015.000000,sky is clear,3,...,287.789978,22.500000,1016.000000,few clouds,0,0.000000,0.000000,290.839996,47.000000,1016.000000
2,Albuquerque,sky is clear,5,2.125000,231.875000,291.954376,13.875000,1013.625000,sky is clear,4,...,288.856110,18.222222,1023.444444,sky is clear,4,3.000000,268.000000,298.355011,20.750000,1021.750000
3,Albuquerque,sky is clear,4,2.555556,176.111111,288.856110,18.222222,1023.444444,sky is clear,4,...,298.355011,20.750000,1021.750000,scattered clouds,6,2.750000,249.250000,296.468750,29.625000,1023.250000
4,Albuquerque,few clouds,2,1.500000,65.000000,278.727509,24.750000,1018.500000,sky is clear,7,...,282.040009,19.000000,820.000000,sky is clear,7,7.000000,275.000000,296.100006,10.000000,820.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44021,Vancouver,moderate rain,3,1.541667,136.166667,283.350037,93.541667,1021.708333,light rain,3,...,278.522705,89.125000,1014.125000,overcast clouds,2,0.750000,167.458333,286.853271,83.250000,996.416667
44022,Vancouver,light rain,3,1.875000,190.541667,278.522705,89.125000,1014.125000,overcast clouds,2,...,286.853271,83.250000,996.416667,sky is clear,7,2.625000,188.500000,289.182831,77.875000,1007.708333
44023,Vancouver,overcast clouds,2,0.750000,167.458333,286.853271,83.250000,996.416667,sky is clear,7,...,289.182831,77.875000,1007.708333,sky is clear,6,3.416667,178.333333,290.326660,75.666667,1023.000000
44024,Vancouver,sky is clear,7,2.625000,188.500000,289.182831,77.875000,1007.708333,sky is clear,6,...,290.326660,75.666667,1023.000000,light intensity shower rain,4,2.875000,190.833333,291.957916,69.250000,1008.166667


In [19]:
y_data_wind_train

0        0
1        0
2        1
3        0
4        0
        ..
44021    0
44022    0
44023    0
44024    0
44025    0
Name: y_wind_speed_above_8, Length: 44026, dtype: uint8

In [20]:
y_data_temperature_train

0        297.851685
1        300.380005
2        290.529999
3        287.980011
4        280.201050
            ...    
44021    290.326660
44022    291.957916
44023    286.370422
44024    285.516937
44025    283.194702
Name: y_temperature, Length: 44026, dtype: float32

In [21]:
from sklearn.neural_network import MLPRegressor, MLPClassifier

In [22]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)

In [23]:
clf.fit(x_data_train, y_data_wind_train)

ValueError: could not convert string to float: 'Albuquerque'