In [73]:
import numpy as np
import pandas as pd
import datetime

In [74]:
traffic_df = pd.read_excel('./../../../Databases/data.xlsx')
bcn_meteo_df = pd.read_csv('./../../../Databases/X4Barcelona_converted.csv')
can_meteo_df = pd.read_csv('./../../../Databases/XUCanyelles_converted.csv')

### Traffic data

In [75]:
traffic_df.head()

Unnamed: 0,Id,SiteId,Timestamp,TotalEntries,TotalOuts,TotalAccesses,TotalEntriesMen,TotalEntriesWoman
0,35320,1,1543276800,0,0,0,0.0,0.0
1,35517,1,1543278600,0,0,0,0.0,0.0
2,35201,1,1543280400,0,0,0,0.0,0.0
3,34526,1,1543282200,0,0,0,0.0,0.0
4,34878,1,1543284000,0,0,0,0.0,0.0


In [76]:
traffic_df.dtypes

Id                     int64
SiteId                 int64
Timestamp              int64
TotalEntries           int64
TotalOuts              int64
TotalAccesses          int64
TotalEntriesMen      float64
TotalEntriesWoman    float64
dtype: object

In [77]:
# Lets sort the data by Timestamp, and transform the column to a readable format.

traffic_df.sort_values(['Timestamp'], ascending=True, inplace=True)
traffic_df['Timestamp'] = pd.to_datetime(traffic_df['Timestamp'], unit='s')
traffic_df.head()

Unnamed: 0,Id,SiteId,Timestamp,TotalEntries,TotalOuts,TotalAccesses,TotalEntriesMen,TotalEntriesWoman
4320,2305,2,2017-01-04 00:00:00,0,0,0,0.0,0.0
4321,2306,2,2017-01-04 00:30:00,0,0,0,0.0,0.0
4322,2307,2,2017-01-04 01:00:00,0,0,0,0.0,0.0
4323,2308,2,2017-01-04 01:30:00,0,0,0,0.0,0.0
4324,2309,2,2017-01-04 02:00:00,0,0,0,0.0,0.0


In [78]:
traffic_df.dtypes

Id                            int64
SiteId                        int64
Timestamp            datetime64[ns]
TotalEntries                  int64
TotalOuts                     int64
TotalAccesses                 int64
TotalEntriesMen             float64
TotalEntriesWoman           float64
dtype: object

### Meteo data: Barcelona & Canyelles

In [79]:
bcn_meteo_df.head()

Unnamed: 0,EMA,DATA,T,TX,TN,PPT,Timestamp
0,X4,2017-01-01 00:00:00,10.4,10.5,10.4,0.0,2016-12-31 23:00:00
1,X4,2017-01-01 00:30:00,10.3,10.4,10.2,0.0,2016-12-31 23:30:00
2,X4,2017-01-01 01:00:00,10.1,10.2,9.9,0.0,2017-01-01 00:00:00
3,X4,2017-01-01 01:30:00,10.0,10.2,9.9,0.0,2017-01-01 00:30:00
4,X4,2017-01-01 02:00:00,10.0,10.1,9.8,0.0,2017-01-01 01:00:00


In [80]:
can_meteo_df.head()

Unnamed: 0,EMA,DATA,T,TX,TN,PPT,Timestamp
0,XU,2017-01-01 00:00:00,2.5,2.6,2.4,0.0,2016-12-31 23:00:00
1,XU,2017-01-01 00:30:00,2.2,2.6,2.0,0.0,2016-12-31 23:30:00
2,XU,2017-01-01 01:00:00,1.9,2.0,1.8,0.0,2017-01-01 00:00:00
3,XU,2017-01-01 01:30:00,1.6,1.8,1.4,0.0,2017-01-01 00:30:00
4,XU,2017-01-01 02:00:00,1.4,1.5,1.3,0.0,2017-01-01 01:00:00


In [81]:
bcn_meteo_df.dtypes, can_meteo_df.dtypes

(EMA           object
 DATA          object
 T            float64
 TX           float64
 TN           float64
 PPT          float64
 Timestamp     object
 dtype: object, EMA           object
 DATA          object
 T            float64
 TX           float64
 TN           float64
 PPT          float64
 Timestamp     object
 dtype: object)

In [82]:
bcn_meteo_df['Timestamp'] = pd.to_datetime(bcn_meteo_df['Timestamp'], dayfirst=True)
can_meteo_df['Timestamp'] = pd.to_datetime(can_meteo_df['Timestamp'], dayfirst=True)
bcn_meteo_df.dtypes, can_meteo_df.dtypes

(EMA                  object
 DATA                 object
 T                   float64
 TX                  float64
 TN                  float64
 PPT                 float64
 Timestamp    datetime64[ns]
 dtype: object, EMA                  object
 DATA                 object
 T                   float64
 TX                  float64
 TN                  float64
 PPT                 float64
 Timestamp    datetime64[ns]
 dtype: object)

### Creating a dataframe with all the data

We want to merge both meteo dataframes with the main one (traffic). There are 3 SiteIds on the traffic dataset (1 & 2 of Barcelona and 3 of Vilanova i la Geltrú). Our joining keys will be datetime and SiteId.

In [83]:
bcn_meteo_df['SiteId'] = 2

In [84]:
bcn_meteo_df.head()

Unnamed: 0,EMA,DATA,T,TX,TN,PPT,Timestamp,SiteId
0,X4,2017-01-01 00:00:00,10.4,10.5,10.4,0.0,2016-12-31 23:00:00,2
1,X4,2017-01-01 00:30:00,10.3,10.4,10.2,0.0,2016-12-31 23:30:00,2
2,X4,2017-01-01 01:00:00,10.1,10.2,9.9,0.0,2017-01-01 00:00:00,2
3,X4,2017-01-01 01:30:00,10.0,10.2,9.9,0.0,2017-01-01 00:30:00,2
4,X4,2017-01-01 02:00:00,10.0,10.1,9.8,0.0,2017-01-01 01:00:00,2


In [85]:
all_data_df = traffic_df.merge(bcn_meteo_df.drop(['DATA', 'EMA', 'TX', 'TN'], axis=1), how= 'left', 
        on=['Timestamp', 'SiteId'])

In [86]:
all_data_df.head()

Unnamed: 0,Id,SiteId,Timestamp,TotalEntries,TotalOuts,TotalAccesses,TotalEntriesMen,TotalEntriesWoman,T,PPT
0,2305,2,2017-01-04 00:00:00,0,0,0,0.0,0.0,10.7,0.0
1,2306,2,2017-01-04 00:30:00,0,0,0,0.0,0.0,10.8,0.0
2,2307,2,2017-01-04 01:00:00,0,0,0,0.0,0.0,10.9,0.0
3,2308,2,2017-01-04 01:30:00,0,0,0,0.0,0.0,11.0,0.0
4,2309,2,2017-01-04 02:00:00,0,0,0,0.0,0.0,11.1,0.0


In [87]:
# Lets check that the merge has been done as expected printing the dimensions:

traffic_df.shape, all_data_df.shape

((45216, 8), (45216, 10))

In [88]:
# Another hint of a good merge will be to find NaNs of DATA, T and PPT on the other SiteIds:

all_data_df.isnull().sum()

Id                      0
SiteId                  0
Timestamp               0
TotalEntries            0
TotalOuts               0
TotalAccesses           0
TotalEntriesMen       192
TotalEntriesWoman     192
T                    7872
PPT                  7872
dtype: int64

In [89]:
# The sum of the counts of SiteIds 1 and 3 should be the same as the NaNs

all_data_df['SiteId'][all_data_df['SiteId'] == 1].count() + all_data_df['SiteId'][all_data_df['SiteId'] == 3].count()

7872

In [90]:
all_data_df[all_data_df['SiteId'] == 2].isnull().sum()

Id                   0
SiteId               0
Timestamp            0
TotalEntries         0
TotalOuts            0
TotalAccesses        0
TotalEntriesMen      0
TotalEntriesWoman    0
T                    0
PPT                  0
dtype: int64

In [91]:
bcn_meteo_df['SiteId'] = 1
all_data_df = all_data_df.merge(bcn_meteo_df.drop(['DATA', 'EMA', 'TX', 'TN'], axis=1), how= 'left', 
        on=['Timestamp', 'SiteId'])

In [92]:
all_data_df[all_data_df['SiteId'] == 1].isnull().sum()

Id                      0
SiteId                  0
Timestamp               0
TotalEntries            0
TotalOuts               0
TotalAccesses           0
TotalEntriesMen       192
TotalEntriesWoman     192
T_x                  4320
PPT_x                4320
T_y                     0
PPT_y                   0
dtype: int64

In [93]:
can_meteo_df['SiteId'] = 3
all_data_df = all_data_df.merge(can_meteo_df.drop(['DATA', 'EMA', 'TX', 'TN'], axis=1), how= 'left', 
        on=['Timestamp', 'SiteId'])

In [94]:
all_data_df.head()

Unnamed: 0,Id,SiteId,Timestamp,TotalEntries,TotalOuts,TotalAccesses,TotalEntriesMen,TotalEntriesWoman,T_x,PPT_x,T_y,PPT_y,T,PPT
0,2305,2,2017-01-04 00:00:00,0,0,0,0.0,0.0,10.7,0.0,,,,
1,2306,2,2017-01-04 00:30:00,0,0,0,0.0,0.0,10.8,0.0,,,,
2,2307,2,2017-01-04 01:00:00,0,0,0,0.0,0.0,10.9,0.0,,,,
3,2308,2,2017-01-04 01:30:00,0,0,0,0.0,0.0,11.0,0.0,,,,
4,2309,2,2017-01-04 02:00:00,0,0,0,0.0,0.0,11.1,0.0,,,,


In [95]:
# Since we have replicated the weather columns with each append, we have to reorganize the dataframe:

all_data_df['Temperature'] = all_data_df['T_x']
all_data_df['Temperature'][all_data_df['SiteId'] == 1] = all_data_df['T_y']
all_data_df['Temperature'][all_data_df['SiteId'] == 3] = all_data_df['T']
all_data_df['Precipitation'] = all_data_df['PPT_x']
all_data_df['Precipitation'][all_data_df['SiteId'] == 1] = all_data_df['PPT_y']
all_data_df['Precipitation'][all_data_df['SiteId'] == 3] = all_data_df['PPT']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [96]:
all_data_df.drop(['T_x', 'PPT_x', 'T_y', 'PPT_y', 'T', 'PPT'], axis=1, inplace=True)

In [97]:
all_data_df.isnull().sum()

Id                     0
SiteId                 0
Timestamp              0
TotalEntries           0
TotalOuts              0
TotalAccesses          0
TotalEntriesMen      192
TotalEntriesWoman    192
Temperature            0
Precipitation          0
dtype: int64

In [98]:
all_data_df.head()

Unnamed: 0,Id,SiteId,Timestamp,TotalEntries,TotalOuts,TotalAccesses,TotalEntriesMen,TotalEntriesWoman,Temperature,Precipitation
0,2305,2,2017-01-04 00:00:00,0,0,0,0.0,0.0,10.7,0.0
1,2306,2,2017-01-04 00:30:00,0,0,0,0.0,0.0,10.8,0.0
2,2307,2,2017-01-04 01:00:00,0,0,0,0.0,0.0,10.9,0.0
3,2308,2,2017-01-04 01:30:00,0,0,0,0.0,0.0,11.0,0.0
4,2309,2,2017-01-04 02:00:00,0,0,0,0.0,0.0,11.1,0.0


In [99]:
all_data_df.to_csv('./../../../Databases/clean_data.csv')