# Data preparation

### import all relevant libraries

* Install the `haversine` package before running this notebook: `pip install haversine` (see https://pypi.org/project/haversine/)


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from haversine import haversine, Unit

# Plot styling for the whole notebook: the three seaborn calls run in this order,
# so the style and palette set below are applied after the plain sns.set() call.
sns.set()
sns.set_style("whitegrid")
sns.set_palette("GnBu_d")




### Define the read-in functions

In [4]:
def trip_data_readin (city):
    """Load one city's trip CSV and return it prepared and merged with weather data.

    Parameters
    ----------
    city : str
        City name; it selects ``Project_Data/Trip_Data/<city>.csv`` and is
        passed on to ``merge_trip_data_with_weather_data``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``merge_trip_data_with_weather_data`` after the
        raw trips went through ``trip_data_data_prep``.
    """
    csv_path = "Project_Data/Trip_Data/" + city + ".csv"
    raw_trips = pd.read_csv(csv_path, encoding="ISO-8859-1")
    prepared_trips = trip_data_data_prep(raw_trips)
    return merge_trip_data_with_weather_data(prepared_trips, city)


def trip_data_data_prep(trip_data):
    """Add the derived time columns and the trip distance to ``trip_data``.

    The frame is modified in place and also returned. It must provide the
    columns ``day``, ``time`` and ``trip_duration``; the coordinate columns
    are consumed by ``distance_between_coordinates``.

    Columns written: ``datetime_start``, ``trip_duration`` (converted to
    timedelta), ``datetime_end``, ``weekday``, ``month``, ``hour`` and whatever
    ``distance_between_coordinates`` adds.
    """
    start = pd.to_datetime(trip_data['day'] + ' ' + trip_data['time'])
    duration = pd.to_timedelta(trip_data["trip_duration"])

    trip_data["datetime_start"] = start
    trip_data["trip_duration"] = duration
    trip_data["datetime_end"] = start + duration

    # The column names equal the datetime accessor attributes they are taken from.
    for part in ("weekday", "month", "hour"):
        trip_data[part] = getattr(start.dt, part)

    return distance_between_coordinates(trip_data)

def distance_between_coordinates (trip_data):
    """Add a ``distance`` column with the great-circle trip length in kilometres.

    The haversine formula is evaluated on whole columns with NumPy instead of
    calling ``haversine`` once per row through ``DataFrame.apply``. This is much
    faster on large frames and also works for an empty frame, where the
    row-wise ``apply`` returns a DataFrame that cannot be assigned to a single
    column.

    Parameters
    ----------
    trip_data : pandas.DataFrame
        Must contain ``orig_lat``, ``orig_lng``, ``dest_lat`` and ``dest_lng``
        in decimal degrees.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place with the new column. Rows with
        a missing coordinate get ``NaN`` as distance.
    """
    # Mean Earth radius in km; intended to match the constant used by the
    # ``haversine`` package so results agree with the previous implementation.
    earth_radius_km = 6371.0088

    orig_lat = np.radians(trip_data["orig_lat"].to_numpy(dtype=float))
    orig_lng = np.radians(trip_data["orig_lng"].to_numpy(dtype=float))
    dest_lat = np.radians(trip_data["dest_lat"].to_numpy(dtype=float))
    dest_lng = np.radians(trip_data["dest_lng"].to_numpy(dtype=float))

    haversine_term = (
        np.sin((dest_lat - orig_lat) / 2.0) ** 2
        + np.cos(orig_lat) * np.cos(dest_lat) * np.sin((dest_lng - orig_lng) / 2.0) ** 2
    )
    # Clip so floating-point rounding can never push the arcsin argument above 1.
    haversine_term = np.clip(haversine_term, 0.0, 1.0)

    trip_data["distance"] = 2.0 * earth_radius_km * np.arcsin(np.sqrt(haversine_term))
    return trip_data




def merge_trip_data_with_weather_data (trip_data, city):
    """Attach hourly temperature, wind and rain readings to every trip.

    Each trip is matched to the weather observation of its start time rounded
    to the nearest full hour (stored in the new column ``rounded_time_hourly``).

    Gaps in the weather data are forward-filled along the weather time axis
    *before* the merge, so a gap always receives the most recent earlier
    observation. Filling after the merge would follow the row order of the
    trips, which is not chronological, and could copy readings from an
    unrelated point in time. Hours before the first observation stay ``NaN``.

    Parameters
    ----------
    trip_data : pandas.DataFrame
        Trips with a datetime column ``datetime_start``. The frame is not
        modified; a new frame is returned.
    city : str
        City name forwarded to ``read_DWD_data``.

    Returns
    -------
    pandas.DataFrame
        One row per trip, in the original trip order and with a fresh integer
        index, containing the trip columns, ``rounded_time_hourly`` and the
        weather columns.
    """
    weather = read_DWD_data(city)

    # An offset object avoids the "H" alias, which newer pandas versions deprecate.
    trips = trip_data.assign(
        rounded_time_hourly=trip_data["datetime_start"].dt.round(pd.offsets.Hour())
    )

    # Keep one weather row per timestamp so the left merge can never multiply trips.
    weather = weather[~weather.index.duplicated(keep="first")].sort_index()

    # Add every hour the trips need, then carry the last known reading forward in time.
    trip_hours = pd.DatetimeIndex(trips["rounded_time_hourly"].dropna().unique())
    weather = weather.reindex(weather.index.union(trip_hours)).ffill()
    weather.index.name = "rounded_time_hourly"

    return pd.merge(trips, weather, on="rounded_time_hourly", how="left")


def read_DWD_data (city):
    """Read the DWD temperature, wind and rain CSV files of one city.

    Every file in ``Project_Data/Weather_Data/<city>/`` is reduced to a single
    value column indexed by its parsed ``Zeitstempel`` (format ``%Y%m%d%H%M``);
    the index is named ``rounded_time_hourly``. Wind and rain are left-merged
    onto the temperature rows, so only timestamps present in the temperature
    file appear in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``temp``, ``wind`` and ``rain`` indexed by ``rounded_time_hourly``.
    """
    station_dir = "Project_Data/Weather_Data/" + city + "/"
    measurements = (
        ("temp", "data_TT_TU_MN009.csv"),
        ("wind", "data_F_MN003.csv"),
        ("rain", "data_R1_MN008.csv"),
    )
    # Everything except the measured value is metadata and gets discarded.
    unused_columns = ['Produkt_Code', 'SDO_ID', 'Qualitaet_Niveau', 'Qualitaet_Byte', 'rounded_time_hourly']

    weather = None
    for value_name, file_name in measurements:
        raw = pd.read_csv(station_dir + file_name, encoding="ISO-8859-1")
        raw = raw.rename(columns={"Zeitstempel": "rounded_time_hourly"})
        timestamps = pd.to_datetime(raw['rounded_time_hourly'].astype(str), format='%Y%m%d%H%M')

        values = raw.set_index(timestamps).drop(columns=unused_columns)
        values.columns = [value_name]

        if weather is None:
            weather = values
        else:
            weather = pd.merge(weather, values, on="rounded_time_hourly", how='left')
    return weather


# Data Cleaning

In [44]:
# Remove trips whose average speed is implausibly high (above 25 km/h by default).
def cleaning (trip_data, max_avg_speed=25):
    """Drop trips that are faster than ``max_avg_speed`` km/h on average.

    Two helper columns are added to the frame: ``trip_duration_hours`` (the
    timedelta column ``trip_duration`` expressed in hours) and ``avg_speed``
    (``distance`` divided by ``trip_duration_hours``).

    Parameters
    ----------
    trip_data : pandas.DataFrame
        Needs a timedelta column ``trip_duration`` and a numeric column
        ``distance``. The frame is modified in place.
    max_avg_speed : float, optional
        Upper speed limit; trips strictly above it are removed. Defaults to 25.

    Returns
    -------
    pandas.DataFrame
        The same frame object without the too-fast trips. A zero-duration trip
        with a positive distance has an infinite speed and is removed; a
        zero-duration, zero-distance trip has a ``NaN`` speed and is kept.
    """
    trip_data["trip_duration_hours"] = trip_data["trip_duration"].dt.total_seconds() / 3600
    trip_data["avg_speed"] = trip_data["distance"] / trip_data["trip_duration_hours"]

    too_fast = trip_data["avg_speed"] > max_avg_speed
    trip_data.drop(trip_data.index[too_fast], axis=0, inplace=True)
    return trip_data




Unnamed: 0,b_number,trip_duration,orig_lat,orig_lng,dest_lat,dest_lng,weekday,month,hour,distance,temp,wind,rain,trip_duration_hours,avg.speed,avg_speed
count,18038.0,18038,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0,18038.0
mean,30615.641091,0 days 00:27:17.556270096,51.225214,6.784543,51.225157,6.784426,3.204734,5.218539,13.373157,1.717273,19.108926,3.954829,0.027692,0.454877,5.773809,5.773809
std,6909.397025,0 days 00:32:11.834626498,0.022536,0.023901,0.022143,0.023038,1.986495,1.068599,5.058998,1.602547,7.026636,1.816669,0.311807,0.536621,3.558801,3.558801
min,538.0,0 days 00:03:00,50.677853,6.180219,50.677916,6.64477,0.0,2.0,0.0,0.000918,-1.6,0.3,0.0,0.05,0.000927,0.000927
25%,31252.0,0 days 00:09:00,51.213752,6.775237,51.213752,6.774851,1.0,5.0,10.0,0.709099,14.0,2.6,0.0,0.15,2.877966,2.877966
50%,31359.0,0 days 00:16:00,51.223149,6.784169,51.223311,6.784123,3.0,6.0,14.0,1.378615,19.3,3.8,0.0,0.266667,5.997485,5.997485
75%,31932.0,0 days 00:28:00,51.236648,6.794867,51.236648,6.794867,5.0,6.0,17.0,2.322811,24.5,5.1,0.0,0.466667,8.387293,8.387293
max,53159.0,0 days 02:59:00,51.523058,7.480777,51.523396,7.480644,6.0,6.0,23.0,43.45738,36.1,17.6,8.2,2.983333,24.703928,24.703928


Unnamed: 0,day,time,b_number,city,trip_duration,orig_lat,orig_lng,dest_lat,dest_lng,datetime_start,...,month,hour,distance,rounded_time_hourly,temp,wind,rain,trip_duration_hours,avg.speed,avg_speed
0,2019-02-06,15:35:00,538,duesseldorf,0 days 00:17:00,51.218927,6.765862,51.240644,6.781982,2019-02-06 15:35:00,...,2,15,2.662958,2019-02-06 16:00:00,9.5,5.0,0.0,0.283333,9.398675,9.398675
1,2019-02-19,15:57:00,538,duesseldorf,0 days 00:23:00,51.213766,6.751624,51.223056,6.779230,2019-02-19 15:57:00,...,2,15,2.182598,2019-02-19 16:00:00,10.3,5.6,0.0,0.383333,5.693734,5.693734
2,2019-03-22,13:20:00,538,duesseldorf,0 days 01:20:00,51.229671,6.797855,51.218527,6.794867,2019-03-22 13:20:00,...,3,13,1.256486,2019-03-22 13:00:00,19.0,3.5,0.0,1.333333,0.942365,0.942365
3,2019-03-27,06:34:00,538,duesseldorf,0 days 00:22:00,51.218527,6.794867,51.252515,6.782041,2019-03-27 06:34:00,...,3,6,3.883302,2019-03-27 07:00:00,4.8,0.5,0.0,0.366667,10.590823,10.590823
4,2019-03-28,14:54:00,538,duesseldorf,0 days 00:12:00,51.252515,6.782041,51.242592,6.782100,2019-03-28 14:54:00,...,3,14,1.103349,2019-03-28 15:00:00,12.5,1.7,0.0,0.200000,5.516746,5.516746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18059,2019-06-09,20:49:00,52817,duesseldorf,0 days 00:42:00,51.227590,6.772325,51.226898,6.771832,2019-06-09 20:49:00,...,6,20,0.084326,2019-06-09 21:00:00,17.9,3.3,0.0,0.700000,0.120466,0.120466
18060,2019-06-10,10:42:00,52817,duesseldorf,0 days 00:53:00,51.226898,6.771832,51.229461,6.777001,2019-06-10 10:42:00,...,6,10,0.459157,2019-06-10 11:00:00,22.6,1.9,0.0,0.883333,0.519800,0.519800
18061,2019-06-10,20:17:00,52817,duesseldorf,0 days 00:04:00,51.229461,6.777001,51.229505,6.776754,2019-06-10 20:17:00,...,6,20,0.017920,2019-06-10 20:00:00,17.3,3.2,0.0,0.066667,0.268800,0.268800
18062,2019-06-17,12:47:00,52817,duesseldorf,0 days 00:19:00,51.237900,6.789368,51.222489,6.813165,2019-06-17 12:47:00,...,6,12,2.383758,2019-06-17 13:00:00,26.1,2.9,0.0,0.316667,7.527656,7.527656


0        0.283333
1        0.383333
2        1.333333
3        0.366667
4        0.200000
           ...   
18059    0.700000
18060    0.883333
18061    0.066667
18062    0.316667
18063    0.233333
Name: trip_duration, Length: 18064, dtype: float64


## Create combined Dataset


In [33]:
def export (city_name):
    """Build the combined data set for ``city_name`` and save it as CSV.

    The frame returned by ``trip_data_readin`` is written to
    ``Project_Data/Combined_Data/<city_name>.csv``. Nothing is returned.
    """
    target_path = "Project_Data/Combined_Data/" + city_name + ".csv"
    combined = trip_data_readin(city_name)
    combined.to_csv(target_path)



In [34]:
# Run this cell only after the source data has been updated: each call rebuilds
# one city's combined data set and writes it to Project_Data/Combined_Data/.

export ("duesseldorf")
export ("bremen")


In [35]:
# Load the prepared, weather-merged Duesseldorf trips for the inspection cells below.
data_set = trip_data_readin("duesseldorf") 



In [36]:
# Show the columns with their non-null counts and dtypes.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18064 entries, 0 to 18063
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype          
---  ------               --------------  -----          
 0   day                  18064 non-null  object         
 1   time                 18064 non-null  object         
 2   b_number             18064 non-null  int64          
 3   city                 18064 non-null  object         
 4   trip_duration        18064 non-null  timedelta64[ns]
 5   orig_lat             18064 non-null  float64        
 6   orig_lng             18064 non-null  float64        
 7   dest_lat             18064 non-null  float64        
 8   dest_lng             18064 non-null  float64        
 9   datetime_start       18064 non-null  datetime64[ns] 
 10  datetime_end         18064 non-null  datetime64[ns] 
 11  weekday              18064 non-null  int64          
 12  month                18064 non-null  int64          
 13  hour            

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb793ac0be0>