### Dataset Descreption 
The dataset consists in 2 Pickle files describing vehicle trips within Austria. As is common with real world data, the GPS data has missing data. The goal is to suggest methods to impute the missing data and thus improve the data granularity. Will focus on a subset of trips (and explain the method), as the imputation is not always possible. 


Import packages

In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle
from geopy.geocoders import Nominatim
import geocoder
import time
from pprint import pprint
from __future__ import print_function
import requests
from requests import get
import json
import urllib
import folium 
from folium import plugins
from pandas import json_normalize
from gpx_interpolate import gpx_interpolate

### Load Data

In [2]:
np.random.seed(42)

df_point = pd.read_pickle("./GPS Data/point_at-001.pkl")


In [3]:
df_point

Unnamed: 0,vehicle__identification__otonomo_id,metadata__time__epoch,location__country__code,location__city__name,location__latitude__value,location__longitude__value,mobility__heading__angle,mobility__speed__value,vehicle__fuel__level,vehicle__engine__rpm,...,location__house__address,location__quarter__name,location__city__block_name,location__road__name,location__house__number,location__gps__hdop,vehicle__engine__status,location__country__code_provided,datetime_metadata__time__epoch,date
2395308,4a806ae971a51f2d8662e49a0cf1d270,1619827343000,at,,47.24070,13.42260,200.00,79.00000,,,...,,,,Tauern Autobahn,,,,,2021-05-01 02:02:23.000,"(2021, 17, 6)"
2835713,aaaf1fb53fa9405604935acd41203827,1619827398000,at,,47.37290,14.98170,100.00,89.00000,,,...,,,,Pyhrn Autobahn,,,,,2021-05-01 02:03:18.000,"(2021, 17, 6)"
988212,e542ceb979ddb05c728419ef123341bf,1619827411000,at,,48.24980,14.23680,250.00,85.00000,,,...,,,,Kremstal Straße,,,,,2021-05-01 02:03:31.000,"(2021, 17, 6)"
1051594,eed36fdd8588fbb1e7451214f22ca0c3,1619827418000,at,,47.20370,11.40330,20.00,44.00000,,,...,,,,Brenner Autobahn,,,,,2021-05-01 02:03:38.000,"(2021, 17, 6)"
833515,4c4855fe8335225985047a11badb40e5,1619827436000,at,,48.15810,16.47780,160.00,84.00000,,,...,,,,Ostautobahn,,,,,2021-05-01 02:03:56.000,"(2021, 17, 6)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980115,70904ad06993423f9637fe9e76a7dc07,1627775921000,at,,48.13854,14.95967,279.21,75.41344,,,...,,,,Westautobahn,,,2.0,,2021-08-01 01:58:41.000,"(2021, 30, 7)"
1033063,2956eadf244980f60485b50313a2db03,1627775927180,at,,48.13879,14.95784,278.41,75.99600,,0.0,...,,,,Westautobahn,,,0.0,AT,2021-08-01 01:58:47.180,"(2021, 30, 7)"
516056,abc4996a7d036b057572a7225a9a1972,1627775929393,at,,47.35756,13.39793,4.25,93.67200,,0.0,...,,,,Tauern Autobahn,,,2.0,AT,2021-08-01 01:58:49.393,"(2021, 30, 7)"
304771,c3cd1be168da4bd0468ab9254ada0b65,1627775930600,at,,46.91661,15.47115,170.99,87.01200,,0.0,...,,,,Pyhrn Autobahn,,,2.0,AT,2021-08-01 01:58:50.600,"(2021, 30, 7)"


Set an index to the Dataset 

In [4]:
df_point.reset_index(inplace=True)


#### Get an overview of the Dataset 

In [5]:
df_point.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7316674 entries, 0 to 7316673
Data columns (total 40 columns):
 #   Column                               Dtype         
---  ------                               -----         
 0   index                                int64         
 1   vehicle__identification__otonomo_id  object        
 2   metadata__time__epoch                int64         
 3   location__country__code              object        
 4   location__city__name                 object        
 5   location__latitude__value            float64       
 6   location__longitude__value           float64       
 7   mobility__heading__angle             float64       
 8   mobility__speed__value               float64       
 9   vehicle__fuel__level                 float64       
 10  vehicle__engine__rpm                 float64       
 11  location__gps__satellites_count      float64       
 12  vehicle__fuel__amount                float64       
 13  location__altitude__value  

#### After the overview : 40 columns are found, but some are the same under different names or not able to impute them by the givin data.
#### Then is better to drop them to get a clean data.

In [6]:
# drop unsed columns 
df_point = df_point.drop(['vehicle__engine__rpm', 'location__country__code_provided', 'vehicle__fuel__amount', 'manufacturer__category__value',
                          'location__country__code_provided','vehicle__fuel__level', 'location__gps__satellites_count', 'location__quarter__name',
                         'location__city__block_name', 'location__suburb__name', 'location__district__name', 'location__municipality__name',
                         'location__house__number','location__farm__name', 'location__island__name','location__region__name', 'location__gps__hdop',
                         'location__town__name', 'location__continent__name', 'vehicle__engine__status'], axis=1)

#### Convert datetime_metadata__time__epoch  from int64 to timedelta

In [7]:
df_point['metadata__time__epoch'] = pd.to_timedelta(df_point['metadata__time__epoch'], errors = 'coerce')

#### Group dataset by vehicle__identification__otonomo_id

In [8]:
df_point_grouped = df_point.groupby(df_point['vehicle__identification__otonomo_id'], axis=0, dropna=True)
df_grouped = df_point_grouped.head()
df_grouped.reset_index(inplace=True) ## reset index
df_grouped

Unnamed: 0,level_0,index,vehicle__identification__otonomo_id,metadata__time__epoch,location__country__code,location__city__name,location__latitude__value,location__longitude__value,mobility__heading__angle,mobility__speed__value,...,location__polygon__geohash,location__country__name,location__state__name,location__county__name,location__village__name,location__neighbourhood__name,location__house__address,location__road__name,datetime_metadata__time__epoch,date
0,0,2395308,4a806ae971a51f2d8662e49a0cf1d270,0 days 00:26:59.827343,at,,47.24070,13.42260,200.00,79.000,...,u23s9ryj10zb,Austria,Salzburg,Sankt Johann im Pongau,Zederhaus,,,Tauern Autobahn,2021-05-01 02:02:23.000,"(2021, 17, 6)"
1,1,2835713,aaaf1fb53fa9405604935acd41203827,0 days 00:26:59.827398,at,,47.37290,14.98170,100.00,89.000,...,u26tkzvr4ut5,Austria,Styria,Leoben,Traboch,,,Pyhrn Autobahn,2021-05-01 02:03:18.000,"(2021, 17, 6)"
2,2,988212,e542ceb979ddb05c728419ef123341bf,0 days 00:26:59.827411,at,,48.24980,14.23680,250.00,85.000,...,u2d47zwgmqxk,Austria,Upper Austria,Linz-Land,,,,Kremstal Straße,2021-05-01 02:03:31.000,"(2021, 17, 6)"
3,3,1051594,eed36fdd8588fbb1e7451214f22ca0c3,0 days 00:26:59.827418,at,,47.20370,11.40330,20.00,44.000,...,u22he3ptkpky,Austria,Tyrol,Innsbruck-Land,,,,Brenner Autobahn,2021-05-01 02:03:38.000,"(2021, 17, 6)"
4,4,833515,4c4855fe8335225985047a11badb40e5,0 days 00:26:59.827436,at,,48.15810,16.47780,160.00,84.000,...,u2e9yyyv28d0,Austria,Vienna,,,,,Ostautobahn,2021-05-01 02:03:56.000,"(2021, 17, 6)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
983097,7316640,1637931,c3cd1be168da4bd0468ab9254ada0b65,0 days 00:27:07.775751693,at,,46.95186,15.44703,148.62,86.760,...,u26gpe24mvh6,Austria,Styria,Bezirk Graz-Umgebung,Kasten,,,Pyhrn Autobahn,2021-08-01 01:55:51.693,"(2021, 30, 7)"
983098,7316651,966303,c3cd1be168da4bd0468ab9254ada0b65,0 days 00:27:07.775810927,at,,46.94067,15.45693,148.94,86.940,...,u26gp9r3hx47,Austria,Styria,Bezirk Graz-Umgebung,Kasten,,,Pyhrn Autobahn,2021-08-01 01:56:50.927,"(2021, 30, 7)"
983099,7316654,2100651,17ee9ff671076fed9c2eccb39031ee7f,0 days 00:27:07.775834,at,Gemeinde Wilfersdorf,48.62330,16.67600,350.00,84.000,...,u2eue7x56bnf,Austria,Lower Austria,Mistelbach District,,,,A5,2021-08-01 01:57:14.000,"(2021, 30, 7)"
983100,7316661,836096,c3cd1be168da4bd0468ab9254ada0b65,0 days 00:27:07.775869937,at,,46.92914,15.46593,157.01,86.940,...,u26fzzjyn3c2,Austria,Styria,Bezirk Graz-Umgebung,Kasten,,,Pyhrn Autobahn,2021-08-01 01:57:49.937,"(2021, 30, 7)"


In [9]:
df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 983102 entries, 0 to 983101
Data columns (total 22 columns):
 #   Column                               Non-Null Count   Dtype          
---  ------                               --------------   -----          
 0   level_0                              983102 non-null  int64          
 1   index                                983102 non-null  int64          
 2   vehicle__identification__otonomo_id  983102 non-null  object         
 3   metadata__time__epoch                983102 non-null  timedelta64[ns]
 4   location__country__code              983102 non-null  object         
 5   location__city__name                 285092 non-null  object         
 6   location__latitude__value            983102 non-null  float64        
 7   location__longitude__value           983102 non-null  float64        
 8   mobility__heading__angle             975298 non-null  float64        
 9   mobility__speed__value               983101 non-null  float

In [10]:
df_grouped.isnull().sum()

level_0                                     0
index                                       0
vehicle__identification__otonomo_id         0
metadata__time__epoch                       0
location__country__code                     0
location__city__name                   698010
location__latitude__value                   0
location__longitude__value                  0
mobility__heading__angle                 7804
mobility__speed__value                      1
location__altitude__value              829018
metadata__provider__name                    0
location__polygon__geohash                  0
location__country__name                  2112
location__state__name                   13543
location__county__name                 138158
location__village__name                482709
location__neighbourhood__name          884366
location__house__address               982154
location__road__name                    43686
datetime_metadata__time__epoch              0
date                              

In [11]:
df_grouped = df_grouped.head(100)

In [12]:
df_grouped.isnull().sum()

level_0                                  0
index                                    0
vehicle__identification__otonomo_id      0
metadata__time__epoch                    0
location__country__code                  0
location__city__name                    81
location__latitude__value                0
location__longitude__value               0
mobility__heading__angle                 1
mobility__speed__value                   0
location__altitude__value              100
metadata__provider__name                 0
location__polygon__geohash               0
location__country__name                  0
location__state__name                    0
location__county__name                   1
location__village__name                 51
location__neighbourhood__name           90
location__house__address               100
location__road__name                     3
datetime_metadata__time__epoch           0
date                                     0
dtype: int64

#### Here is a function to get the address by location.
#### The function runs over the data in columns (location__latitude__value , location__longitude__value)

In [13]:
app = Nominatim(user_agent="coordinateconverter")

def get_address_by_location(latitude, longitude, language="en"):
    """This function returns an address as raw from a location
    will repeat until success"""
    # build coordinates string to pass to reverse() function
    coordinates = f"{latitude}, {longitude}"
    # sleep for a second to respect Usage Policy
    time.sleep(1)
    try:
        location = app.reverse(coordinates, language=language)
#         print(location)
        return location.raw
    except:
        return get_address_by_location(latitude, longitude)

In [14]:
## Load the location_array from file

with open ('location_array1.txt', 'rb') as fp:
    location_array1 = pickle.load(fp)

#### Here is a function to get the altitude by location.
#### The function runs over the data's columns (location__latitude__value, location__longitude__value), based on open elevation data which in turn is based on SRTM.

In [15]:
def get_elevation(lat = None, long = None):
    '''
        script for returning elevation in m from lat, long
    '''
    if lat is None or long is None: return None
    
   
    query = ('https://api.opentopodata.org/v1/test-dataset'f'?locations={lat},{long}')
    
    # Request with a timeout for slow responses
    r = get(query, timeout = 20)
    

    # Only get the json response in case of 200 or 201
    if r.status_code == 200 or r.status_code == 201:
        elevation = json_normalize(r.json(), 'results')['elevation'].values[0]
    else: 
        elevation = None
    return elevation

In [16]:
## Load the altitude from file

with open ('altitude1.txt', 'rb') as fp:
    altitude1 = pickle.load(fp)

In [17]:
"""
The function (get_address_by_location) return back full addrees as dict.
So we will split the address to road, village, state, postcode and county and append each in a list to be used later.
"""

road=[]
village=[]
state=[]
postcode=[]
county=[]


for i in range(0,len(location_array1)):
    road.append(location_array1[i].get('road'))
    village.append(location_array1[i].get('village'))
    state.append(location_array1[i].get('state'))
    postcode.append(location_array1[i].get('postcode'))
    county.append(location_array1[i].get('county'))
    
    
"""
for loop to get all the city's names and append it to a list
"""

city=[]    

item = None    

for loc in location_array1:   
    try:
        item = loc['city']
    except KeyError:      # handel None values KeyError
        item = loc['state']
        
    city.append(item)
    
"""
for loop to creat a customize address and append all in a list to be used later 
"""  
        
house_address = []

for loc in location_array1:
    try: 
        location = loc['road'] +', '+  loc['state'] + ', '+ loc['postcode']  # full address 
    except KeyError:      # handel None values KeyError
        pass 
        
    house_address.append(location)


#### Now we can use the previous lists to creat a new DataFrame (df) contains the Dataset to be able to see it.

In [18]:
df = pd.DataFrame(road, columns =['road'])
df['village'] = pd.DataFrame({'village':village})
df['state'] = pd.DataFrame({'state':state})
df['city'] = pd.DataFrame({'city':city})
df['postcode'] = pd.DataFrame({'postcode':postcode})
df['county'] = pd.DataFrame({'county':county})
df['altitude (m)'] = pd.DataFrame({'altitude':altitude1})
df['house_address'] = pd.DataFrame({'house_address':house_address})


In [19]:
df

Unnamed: 0,road,village,state,city,postcode,county,altitude (m),house_address
0,Tauern Autobahn,Zederhaus,Salzburg,Salzburg,5542,Bezirk St. Johann im Pongau,1759.327881,"Tauern Autobahn, Salzburg, 5542"
1,Pyhrn Autobahn,Timmersdorf,Styria,Styria,8772,Bezirk Leoben,744.461365,"Pyhrn Autobahn, Styria, 8772"
2,Kremstal Straße,,Upper Austria,Upper Austria,4060,Bezirk Linz-Land,499.584839,"Kremstal Straße, Upper Austria, 4060"
3,Brenner Autobahn,Patsch,Tyrol,Gemeinde Patsch,6082,Bezirk Innsbruck-Land,1352.123535,"Brenner Autobahn, Tyrol, 6082"
4,Ostautobahn,,Vienna,Vienna,1110,,347.529144,"Ostautobahn, Vienna, 1110"
...,...,...,...,...,...,...,...,...
95,Tauern Autobahn,Molzbichl,Carinthia,Carinthia,9800,Bezirk Spittal an der Drau,1614.641479,"Tauern Autobahn, Carinthia, 9800"
96,Tauern Autobahn,Molzbichl,Carinthia,Carinthia,9800,Bezirk Spittal an der Drau,1644.702881,"Tauern Autobahn, Carinthia, 9800"
97,Tauern Autobahn,,Carinthia,Carinthia,9871,Bezirk Spittal an der Drau,1741.955322,"Tauern Autobahn, Carinthia, 9871"
98,Tauern Autobahn,,Carinthia,Carinthia,9871,Bezirk Spittal an der Drau,1738.358765,"Tauern Autobahn, Carinthia, 9871"


In [20]:
df_grouped.loc[:99, 'location__altitude__value'] = df['altitude (m)'].values  # impute 'location__altitude__value' by 'altitude' values

df_grouped.loc[:99, 'location__city__name'] = df['city'].values               # impute 'location__city__name' by 'state' values

df_grouped.loc[:99, 'location__neighbourhood__name'] = df['county'].values    # impute 'location__neighbourhood__name' by 'county' values

df_grouped.loc[:99, 'location__village__name'] = df['village'].values         # impute 'location__village__name' by 'village' values

df_grouped.loc[:99, 'location__road__name'] = df['road'].values               # impute 'location__road__name' by 'road' values

df_grouped.loc[:99, 'location__house__address'] = df['house_address'].values  # impute 'location__house__address' by 'house_address' values



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_grouped.loc[:99, 'location__altitude__value'] = df['altitude (m)'].values  # impute 'location__altitude__value' by 'altitude' values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_grouped.loc[:99, 'location__city__name'] = df['city'].values               # impute 'location__city__name' by 'state' values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable

In [21]:
df_grouped

Unnamed: 0,level_0,index,vehicle__identification__otonomo_id,metadata__time__epoch,location__country__code,location__city__name,location__latitude__value,location__longitude__value,mobility__heading__angle,mobility__speed__value,...,location__polygon__geohash,location__country__name,location__state__name,location__county__name,location__village__name,location__neighbourhood__name,location__house__address,location__road__name,datetime_metadata__time__epoch,date
0,0,2395308,4a806ae971a51f2d8662e49a0cf1d270,0 days 00:26:59.827343,at,Salzburg,47.240700,13.422600,200.0,79.0,...,u23s9ryj10zb,Austria,Salzburg,Sankt Johann im Pongau,Zederhaus,Bezirk St. Johann im Pongau,"Tauern Autobahn, Salzburg, 5542",Tauern Autobahn,2021-05-01 02:02:23,"(2021, 17, 6)"
1,1,2835713,aaaf1fb53fa9405604935acd41203827,0 days 00:26:59.827398,at,Styria,47.372900,14.981700,100.0,89.0,...,u26tkzvr4ut5,Austria,Styria,Leoben,Timmersdorf,Bezirk Leoben,"Pyhrn Autobahn, Styria, 8772",Pyhrn Autobahn,2021-05-01 02:03:18,"(2021, 17, 6)"
2,2,988212,e542ceb979ddb05c728419ef123341bf,0 days 00:26:59.827411,at,Upper Austria,48.249800,14.236800,250.0,85.0,...,u2d47zwgmqxk,Austria,Upper Austria,Linz-Land,,Bezirk Linz-Land,"Kremstal Straße, Upper Austria, 4060",Kremstal Straße,2021-05-01 02:03:31,"(2021, 17, 6)"
3,3,1051594,eed36fdd8588fbb1e7451214f22ca0c3,0 days 00:26:59.827418,at,Gemeinde Patsch,47.203700,11.403300,20.0,44.0,...,u22he3ptkpky,Austria,Tyrol,Innsbruck-Land,Patsch,Bezirk Innsbruck-Land,"Brenner Autobahn, Tyrol, 6082",Brenner Autobahn,2021-05-01 02:03:38,"(2021, 17, 6)"
4,4,833515,4c4855fe8335225985047a11badb40e5,0 days 00:26:59.827436,at,Vienna,48.158100,16.477800,160.0,84.0,...,u2e9yyyv28d0,Austria,Vienna,,,,"Ostautobahn, Vienna, 1110",Ostautobahn,2021-05-01 02:03:56,"(2021, 17, 6)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,367,2897873,779e11c4ecdd52ce3527c79041a2bfd9,0 days 00:26:59.839128,at,Carinthia,46.756551,13.567941,332.0,81.0,...,u239uxzbeuy4,Austria,Carinthia,Spittal an der Drau,Molzbichl,Bezirk Spittal an der Drau,"Tauern Autobahn, Carinthia, 9800",Tauern Autobahn,2021-05-01 05:18:48,"(2021, 17, 6)"
96,368,2623316,b893c5148f184ce61dbd98d135885a83,0 days 00:26:59.839138,at,Carinthia,46.769033,13.554904,324.0,83.0,...,u23dh6n3k5su,Austria,Carinthia,Spittal an der Drau,Molzbichl,Bezirk Spittal an der Drau,"Tauern Autobahn, Carinthia, 9800",Tauern Autobahn,2021-05-01 05:18:58,"(2021, 17, 6)"
97,394,2898609,779e11c4ecdd52ce3527c79041a2bfd9,0 days 00:26:59.839427,at,Carinthia,46.808849,13.516052,297.0,85.0,...,u23d79616cfu,Austria,Carinthia,Spittal an der Drau,,Bezirk Spittal an der Drau,"Tauern Autobahn, Carinthia, 9871",Tauern Autobahn,2021-05-01 05:23:47,"(2021, 17, 6)"
98,396,3020066,b893c5148f184ce61dbd98d135885a83,0 days 00:26:59.839437,at,Carinthia,46.807758,13.518236,308.0,79.0,...,u23d795dvwx2,Austria,Carinthia,Spittal an der Drau,,Bezirk Spittal an der Drau,"Tauern Autobahn, Carinthia, 9871",Tauern Autobahn,2021-05-01 05:23:57,"(2021, 17, 6)"


#### The Dataset null values show that imputation is done with a few missing values (which do not exist in real), then it is acceptable.

In [22]:
df_grouped.isnull().sum()

level_0                                 0
index                                   0
vehicle__identification__otonomo_id     0
metadata__time__epoch                   0
location__country__code                 0
location__city__name                    0
location__latitude__value               0
location__longitude__value              0
mobility__heading__angle                1
mobility__speed__value                  0
location__altitude__value               0
metadata__provider__name                0
location__polygon__geohash              0
location__country__name                 0
location__state__name                   0
location__county__name                  1
location__village__name                37
location__neighbourhood__name          19
location__house__address                0
location__road__name                    0
datetime_metadata__time__epoch          0
date                                    0
dtype: int64

In [25]:
from branca.element import Figure
import folium 
from folium import plugins

fig5=Figure(height=550,width=750)
m5=folium.Map(location=[48.210033, 16.363449],tiles='cartodbpositron',zoom_start=14)
fig5.add_child(m5)

In [None]:
coords_1= []

#### Map to visualize the location of each city over it's latitude and longitutde 

In [23]:
# samples = new_df.head(50).sample(5)


map_h = folium.Map(location= [df_grouped.location__latitude__value.mean(), df_grouped.location__longitude__value.mean()],
                           zoom_start=8, control_scale=True)

for index, loc_info in df_grouped.iterrows():
    folium.Marker([loc_info['location__latitude__value'], loc_info['location__longitude__value']],
                  popup=loc_info['vehicle__identification__otonomo_id'],).add_to(map_h)

stationArr = df_grouped[['location__latitude__value', 'location__longitude__value']].values
map_h.add_child(plugins.HeatMap(stationArr, radius=1))
map_h

# Second part

#### Sorting dataset by vehicle__identification__otonomo_id and datetime_metadata__time__epoch

In [30]:
df_sorted = df_point.sort_values(by=['vehicle__identification__otonomo_id', 'datetime_metadata__time__epoch'])

In [31]:
df_sorted

Unnamed: 0,index,vehicle__identification__otonomo_id,metadata__time__epoch,location__country__code,location__city__name,location__latitude__value,location__longitude__value,mobility__heading__angle,mobility__speed__value,location__altitude__value,...,location__polygon__geohash,location__country__name,location__state__name,location__county__name,location__village__name,location__neighbourhood__name,location__house__address,location__road__name,datetime_metadata__time__epoch,date
3934864,2507406,00001ed912170b0bb60c32208227f2e7,0 days 00:27:04.547743,at,,47.29708,11.05300,246.0,117.0,,...,u0rv5d0t3wdp,Austria,Tyrol,Imst,Rietz,,,Inntal Autobahn,2021-06-24 17:15:43,"(2021, 25, 4)"
3934875,1559399,00001ed912170b0bb60c32208227f2e7,0 days 00:27:04.547751,at,,47.29655,11.04956,266.0,123.0,,...,u0rv56n437vn,Austria,Tyrol,Imst,Rietz,,,Inntal Autobahn,2021-06-24 17:15:51,"(2021, 25, 4)"
3934985,387047,00001ed912170b0bb60c32208227f2e7,0 days 00:27:04.547800,at,,47.29383,11.02689,260.0,120.0,,...,u0rv4ctd90vh,Austria,Tyrol,Imst,Rietz,,,Inntal Autobahn,2021-06-24 17:16:40,"(2021, 25, 4)"
3935158,386144,00001ed912170b0bb60c32208227f2e7,0 days 00:27:04.547910,at,,47.28783,11.00039,254.0,67.0,,...,u0rv426pxdmm,Austria,Tyrol,Imst,Rietz,,,Inntal Autobahn,2021-06-24 17:18:30,"(2021, 25, 4)"
3935258,2507047,00001ed912170b0bb60c32208227f2e7,0 days 00:27:04.547966,at,,47.28556,10.98717,258.0,65.0,,...,u0rv400d7e78,Austria,Tyrol,Imst,Rietz,,,Inntal Autobahn,2021-06-24 17:19:26,"(2021, 25, 4)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
817841,2525611,fffef923513e8ac39b50e4b907d3c30b,0 days 00:27:00.838258,at,,48.06170,14.28900,330.0,71.0,,...,u2d1mjkgty4x,Austria,Upper Austria,Steyr-Land,Sierning,,,Schiedlberger Straße,2021-05-12 18:50:58,"(2021, 19, 3)"
818051,1772536,fffef923513e8ac39b50e4b907d3c30b,0 days 00:27:00.838399,at,,48.08260,14.26860,300.0,71.0,,...,u2d1s9nj6qr3,Austria,Upper Austria,Steyr-Land,Schiedlberg,,,Sierningstraße,2021-05-12 18:53:19,"(2021, 19, 3)"
818401,517904,fffef923513e8ac39b50e4b907d3c30b,0 days 00:27:00.838685,at,,48.12240,14.24530,320.0,79.0,,...,u2d1u0mj7999,Austria,Upper Austria,Linz-Land,Piberbach,,,Schiedlberger Straße,2021-05-12 18:58:05,"(2021, 19, 3)"
818505,517971,fffef923513e8ac39b50e4b907d3c30b,0 days 00:27:00.838824,at,,48.13840,14.24010,340.0,36.0,,...,u2d1u536d2hj,Austria,Upper Austria,Linz-Land,Neuhofen an der Krems,,,Steyrer Straße,2021-05-12 19:00:24,"(2021, 19, 3)"


In [32]:
df_sorted.reset_index(inplace=True) ## reset index

#### We choose the 10 trips with more entries , there are some trips with only 1 entry

In [33]:
df_sorted['vehicle__identification__otonomo_id'].value_counts()[:10]

784ab0e895ef88bb44a7b6ccd6bb3eba    6773
3efff7a8204061c50aa679e8cc1de319    4938
929576715d586b095b4d545ecb2ae97e    3295
59f1efb109f0e836ac447737cc415f53    2842
1ab35e14ef1a441e3ac6555684b23f64    2723
70003f7559c117379a81d9d7adeb2864    2646
813c3a82aabc24b14580118608a5cb9a    2584
0c5582d5999245ffbf723b701a006e7d    2561
4ecc7e2f0dc761117c3cfb6a037cb647    2543
68b40f92a24e3984885162cbc2ed7f62    2524
Name: vehicle__identification__otonomo_id, dtype: int64

In [34]:
new_df = df_sorted[(df_sorted['vehicle__identification__otonomo_id']=='784ab0e895ef88bb44a7b6ccd6bb3eba') |
                   (df_sorted['vehicle__identification__otonomo_id']=='3efff7a8204061c50aa679e8cc1de319') |
                   (df_sorted['vehicle__identification__otonomo_id']=='929576715d586b095b4d545ecb2ae97e') |
                   (df_sorted['vehicle__identification__otonomo_id']=='59f1efb109f0e836ac447737cc415f53') |
                   (df_sorted['vehicle__identification__otonomo_id']=='1ab35e14ef1a441e3ac6555684b23f64') |
                   (df_sorted['vehicle__identification__otonomo_id']=='70003f7559c117379a81d9d7adeb2864') |
                   (df_sorted['vehicle__identification__otonomo_id']=='813c3a82aabc24b14580118608a5cb9a') |
                   (df_sorted['vehicle__identification__otonomo_id']=='0c5582d5999245ffbf723b701a006e7d') |
                   (df_sorted['vehicle__identification__otonomo_id']=='4ecc7e2f0dc761117c3cfb6a037cb647') |
                   (df_sorted['vehicle__identification__otonomo_id']=='68b40f92a24e3984885162cbc2ed7f62')]

In [35]:
new_df

Unnamed: 0,level_0,index,vehicle__identification__otonomo_id,metadata__time__epoch,location__country__code,location__city__name,location__latitude__value,location__longitude__value,mobility__heading__angle,mobility__speed__value,...,location__polygon__geohash,location__country__name,location__state__name,location__county__name,location__village__name,location__neighbourhood__name,location__house__address,location__road__name,datetime_metadata__time__epoch,date
352473,3160300,1117820,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734389130,at,,47.377201,13.419481,327.0,9.0,...,u23t92gb0w6k,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:19:49.130,"(2021, 24, 2)"
352474,3160303,1118026,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734390130,at,,47.377214,13.419457,299.0,8.0,...,u23t92gb20b6,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:19:50.130,"(2021, 24, 2)"
352475,3160305,1118125,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734391130,at,,47.377225,13.419429,306.0,9.0,...,u23t92g8r6g4,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:19:51.130,"(2021, 24, 2)"
352476,3160314,1117620,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734394190,at,,47.377260,13.419356,312.0,8.0,...,u23t92g8t9mp,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:19:54.190,"(2021, 24, 2)"
352477,3160329,2409859,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734399130,at,,47.377366,13.419155,309.0,16.0,...,u23t92g90vp4,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:19:59.130,"(2021, 24, 2)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4192709,773964,975029,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811952010,at,,47.384824,13.453382,76.0,15.0,...,u23td4hkwe26,Austria,Salzburg,Sankt Johann im Pongau,,,,Gaismairallee,2021-05-12 11:32:32.010,"(2021, 19, 3)"
4192710,773965,40800,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811954010,at,,47.384846,13.453488,67.0,15.0,...,u23td4hs8zrx,Austria,Salzburg,Sankt Johann im Pongau,,,,Gaismairallee,2021-05-12 11:32:34.010,"(2021, 19, 3)"
4192711,773966,905527,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811957010,at,,47.384893,13.453679,77.0,20.0,...,u23td4htj2mc,Austria,Salzburg,Sankt Johann im Pongau,,,,Gaismairallee,2021-05-12 11:32:37.010,"(2021, 19, 3)"
4192712,773968,976087,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811958010,at,,47.384902,13.453753,82.0,20.0,...,u23td4htp1vr,Austria,Salzburg,Sankt Johann im Pongau,,,,Gaismairallee,2021-05-12 11:32:38.010,"(2021, 19, 3)"


#### Dataset info. after grouping

In [36]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33429 entries, 352473 to 4192713
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype          
---  ------                               --------------  -----          
 0   level_0                              33429 non-null  int64          
 1   index                                33429 non-null  int64          
 2   vehicle__identification__otonomo_id  33429 non-null  object         
 3   metadata__time__epoch                33429 non-null  timedelta64[ns]
 4   location__country__code              33429 non-null  object         
 5   location__city__name                 3674 non-null   object         
 6   location__latitude__value            33429 non-null  float64        
 7   location__longitude__value           33429 non-null  float64        
 8   mobility__heading__angle             33429 non-null  float64        
 9   mobility__speed__value               33429 non-null  float64     

In [37]:
new_df.isnull().sum()

level_0                                    0
index                                      0
vehicle__identification__otonomo_id        0
metadata__time__epoch                      0
location__country__code                    0
location__city__name                   29755
location__latitude__value                  0
location__longitude__value                 0
mobility__heading__angle                   0
mobility__speed__value                     0
location__altitude__value                  0
metadata__provider__name                   0
location__polygon__geohash                 0
location__country__name                    0
location__state__name                   3674
location__county__name                 15862
location__village__name                30817
location__neighbourhood__name          22275
location__house__address               33350
location__road__name                    1780
datetime_metadata__time__epoch             0
date                                       0
dtype: int

In [38]:
# new_df['metadata__time__epoch'] = pd.to_timedelta(new_df['metadata__time__epoch'], errors = 'coerce')

#### The Dataset is too huge so we drop some rows

In [39]:
new_df = new_df.iloc[::5, :]

In [40]:
new_df

Unnamed: 0,level_0,index,vehicle__identification__otonomo_id,metadata__time__epoch,location__country__code,location__city__name,location__latitude__value,location__longitude__value,mobility__heading__angle,mobility__speed__value,...,location__polygon__geohash,location__country__name,location__state__name,location__county__name,location__village__name,location__neighbourhood__name,location__house__address,location__road__name,datetime_metadata__time__epoch,date
352473,3160300,1117820,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734389130,at,,47.377201,13.419481,327.0,9.0,...,u23t92gb0w6k,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:19:49.130,"(2021, 24, 2)"
352478,3160338,2412328,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734402130,at,,47.377453,13.418998,306.0,19.0,...,u23t92g3tm3g,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:20:02.130,"(2021, 24, 2)"
352483,3160369,2412059,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734415130,at,,47.377721,13.418181,275.0,16.0,...,u23t92fg4r34,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:20:15.130,"(2021, 24, 2)"
352488,3160793,1118374,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734587200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:23:07.200,"(2021, 24, 2)"
352493,3160868,1117157,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734612200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:23:32.200,"(2021, 24, 2)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4192690,773941,905655,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811898870,at,,47.384402,13.451646,238.0,9.0,...,u23td454nhf3,Austria,Salzburg,Sankt Johann im Pongau,,,,Gappenau,2021-05-12 11:31:38.870,"(2021, 19, 3)"
4192695,773947,905435,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811912970,at,,47.384433,13.451742,85.0,7.0,...,u23td45626d4,Austria,Salzburg,Sankt Johann im Pongau,,,,Gappenau,2021-05-12 11:31:52.970,"(2021, 19, 3)"
4192700,773953,974760,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811926010,at,,47.384504,13.451985,65.0,7.0,...,u23td456tzwt,Austria,Salzburg,Sankt Johann im Pongau,,,,Gaismairallee,2021-05-12 11:32:06.010,"(2021, 19, 3)"
4192705,773958,974854,929576715d586b095b4d545ecb2ae97e,0 days 00:27:00.811940010,at,,47.384685,13.452638,66.0,17.0,...,u23td45gv1t8,Austria,Salzburg,Sankt Johann im Pongau,,,,Gaismairallee,2021-05-12 11:32:20.010,"(2021, 19, 3)"


#### Dataset info. after cleaning

In [41]:
new_df.isnull().sum()

level_0                                   0
index                                     0
vehicle__identification__otonomo_id       0
metadata__time__epoch                     0
location__country__code                   0
location__city__name                   5941
location__latitude__value                 0
location__longitude__value                0
mobility__heading__angle                  0
mobility__speed__value                    0
location__altitude__value                 0
metadata__provider__name                  0
location__polygon__geohash                0
location__country__name                   0
location__state__name                   745
location__county__name                 3172
location__village__name                6163
location__neighbourhood__name          4453
location__house__address               6668
location__road__name                    352
datetime_metadata__time__epoch            0
date                                      0
dtype: int64

#### Here is a function to get the address by location.
#### The function runs over the data in columns (location__latitude__value , location__longitude__value)

In [42]:
app = Nominatim(user_agent="coordinateconverter")

def get_address_by_location(latitude, longitude, language="en"):
    """This function returns an address as raw from a location
    will repeat until success"""
    # build coordinates string to pass to reverse() function
    coordinates = f"{latitude}, {longitude}"
    # sleep for a second to respect Usage Policy
    time.sleep(1)
    try:
        location = app.reverse(coordinates, language=language)
#         print(location)
        return location.raw
    except:
        return get_address_by_location(latitude, longitude)

In [43]:
# location_array=[] # list carries all address
# for i in range(0,len(new_df)):  # range(0,len(df_grouped))
#     location_array.append( (get_address_by_location(new_df['location__latitude__value'].values[i], new_df['location__longitude__value'].values[i])).get('address') )
# #     if i == 100:
# #         break
        

In [44]:
# location_array

In [45]:
# ## Save the location_array to file

# with open('location_array.txt', 'wb') as fp:
#     pickle.dump(location_array, fp)

In [46]:
## Load the location_array from file

with open ('location_array.txt', 'rb') as fp:
    location_array = pickle.load(fp)

#### Here is a function to get the altitude by location.
#### The function runs over the data's columns (location__latitude__value, location__longitude__value), based on open elevation data which in turn is based on SRTM.

In [47]:
def get_elevation(lat = None, long = None):
    '''
        script for returning elevation in m from lat, long
    '''
    if lat is None or long is None: return None
    
   
    query = ('https://api.opentopodata.org/v1/test-dataset'f'?locations={lat},{long}')
    
    # Request with a timeout for slow responses
    r = get(query, timeout = 20)
    

    # Only get the json response in case of 200 or 201
    if r.status_code == 200 or r.status_code == 201:
        elevation = json_normalize(r.json(), 'results')['elevation'].values[0]
    else: 
        elevation = None
    return elevation

In [48]:
# altitude=[]  # list to apend all altitude's values
# for i in range(0,len(new_df)):  # range(0,len(df_grouped))
#     lat = new_df['location__latitude__value'].values[i]
#     long = new_df['location__longitude__value'].values[i]
#     elevation = get_elevation(lat, long)
#     altitude.append(elevation)
#     time.sleep(1)  # Important to avoid error 'many requests'....sleep for a second to respect Usage Policy
# #     if i == 25:
# #         break

In [49]:
# ## Save the altitude to file

# with open('altitude.txt', 'wb') as fp:
#     pickle.dump(altitude, fp)

In [50]:
## Load the altitude from file

with open ('altitude.txt', 'rb') as fp:
    altitude = pickle.load(fp)

In [51]:
# len(altitude)

In [52]:
"""
The function (get_address_by_location) return back full addrees as dict.
So we will split the address to road, village, state, postcode and county and append each in a list to be used later.
"""

road=[]
village=[]
state=[]
postcode=[]
county=[]


for i in range(0,len(location_array)):
    road.append(location_array[i].get('road'))
    village.append(location_array[i].get('village'))
    state.append(location_array[i].get('state'))
    postcode.append(location_array[i].get('postcode'))
    county.append(location_array[i].get('county'))
    
    
"""
for loop to get all the city's names and append it to a list
"""

city=[]    

item = None    

for loc in location_array:   
    try:
        item = loc['city']
    except KeyError:      # handel None values KeyError
        item = loc['state']
        
    city.append(item)
    
"""
for loop to creat a customize address and append all in a list to be used later 
"""  
        
house_address = []

for loc in location_array:
    try: 
        location = loc['road'] +', '+  loc['state'] + ', '+ loc['postcode']  # full address 
    except KeyError:      # handel None values KeyError
        pass 
        
    house_address.append(location)


#### Now we can use the previous lists to creat a new DataFrame (df) contains the Dataset to be able to see it.

In [53]:
df = pd.DataFrame(road, columns =['road'])
df['village'] = pd.DataFrame({'village':village})
df['state'] = pd.DataFrame({'state':state})
df['city'] = pd.DataFrame({'city':city})
df['postcode'] = pd.DataFrame({'postcode':postcode})
df['county'] = pd.DataFrame({'county':county})
df['altitude (m)'] = pd.DataFrame({'altitude':altitude})
df['house_address'] = pd.DataFrame({'house_address':house_address})


In [54]:
# df

In [55]:
df.isnull().sum()

road              29
village          607
state              0
city               0
postcode           2
county           317
altitude (m)       0
house_address      0
dtype: int64

#### The process of imputation starts here, by assigning the 'df' values to 'df_point' missing values.

In [56]:
# len(df_grouped.loc[:49, 'location__altitude__value'])

In [57]:
size = len(new_df)
size

6686

In [58]:
# df['city'].values

In [59]:
try:
    new_df.loc[:size-1, 'location__altitude__value'] = df['altitude (m)'].values # impute 'location__altitude__value' by 'altitude' values


    new_df.loc[:size-1, 'location__city__name'] = df['city'].values               # impute 'location__city__name' by 'state' values

    new_df.loc[:size-1, 'location__neighbourhood__name'] = df['county'].values    # impute 'location__neighbourhood__name' by 'county' values

    new_df.loc[:size-1, 'location__village__name'] = df['village'].values         # impute 'location__village__name' by 'village' values

    new_df.loc[:size-1, 'location__road__name'] = df['road'].values               # impute 'location__road__name' by 'road' values

    new_df.loc[:size-1, 'location__house__address'] = df['house_address'].values  # impute 'location__house__address' by 'house_address' values

except:
    pass

In [60]:
new_df.head(10)

Unnamed: 0,level_0,index,vehicle__identification__otonomo_id,metadata__time__epoch,location__country__code,location__city__name,location__latitude__value,location__longitude__value,mobility__heading__angle,mobility__speed__value,...,location__polygon__geohash,location__country__name,location__state__name,location__county__name,location__village__name,location__neighbourhood__name,location__house__address,location__road__name,datetime_metadata__time__epoch,date
352473,3160300,1117820,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734389130,at,,47.377201,13.419481,327.0,9.0,...,u23t92gb0w6k,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:19:49.130,"(2021, 24, 2)"
352478,3160338,2412328,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734402130,at,,47.377453,13.418998,306.0,19.0,...,u23t92g3tm3g,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Schlatterbergweg,2021-06-15 07:20:02.130,"(2021, 24, 2)"
352483,3160369,2412059,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734415130,at,,47.377721,13.418181,275.0,16.0,...,u23t92fg4r34,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:20:15.130,"(2021, 24, 2)"
352488,3160793,1118374,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734587200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:23:07.200,"(2021, 24, 2)"
352493,3160868,1117157,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734612200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:23:32.200,"(2021, 24, 2)"
352498,3160943,918136,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734638200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:23:58.200,"(2021, 24, 2)"
352503,3161017,613713,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734663200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:24:23.200,"(2021, 24, 2)"
352508,3161088,612837,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734694200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:24:54.200,"(2021, 24, 2)"
352513,3161177,2411742,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734729200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:25:29.200,"(2021, 24, 2)"
352518,3161244,2410096,0c5582d5999245ffbf723b701a006e7d,0 days 00:27:03.734755200,at,,47.378289,13.416279,311.0,0.0,...,u23t92cqtcnz,Austria,Salzburg,Sankt Johann im Pongau,Altenmarkt im Pongau,Altenmarkt im Pongau,,Sportplatzstraße,2021-06-15 07:25:55.200,"(2021, 24, 2)"


#### The Dataset null values show that imputation is done with a few missing values (which do not exist in real), then it is acceptable.

In [61]:
new_df.isnull().sum()

level_0                                   0
index                                     0
vehicle__identification__otonomo_id       0
metadata__time__epoch                     0
location__country__code                   0
location__city__name                   5941
location__latitude__value                 0
location__longitude__value                0
mobility__heading__angle                  0
mobility__speed__value                    0
location__altitude__value                 0
metadata__provider__name                  0
location__polygon__geohash                0
location__country__name                   0
location__state__name                   745
location__county__name                 3172
location__village__name                6163
location__neighbourhood__name          4453
location__house__address               6668
location__road__name                    352
datetime_metadata__time__epoch            0
date                                      0
dtype: int64

In [62]:
df.isnull().sum()

road              29
village          607
state              0
city               0
postcode           2
county           317
altitude (m)       0
house_address      0
dtype: int64

#### Map to visualize the location of each trip over it's latitude and longitutde 

In [63]:
# samples = new_df.head(50).sample(5)

map_h = folium.Map(location= [new_df.location__latitude__value.mean(), new_df.location__longitude__value.mean()],
                           zoom_start=8, control_scale=True)
for index, loc_info in new_df.iterrows():
    folium.Marker([loc_info['location__latitude__value'], loc_info['location__longitude__value']],
                  popup=loc_info['vehicle__identification__otonomo_id'],).add_to(map_h)
stationArr = new_df[['location__latitude__value', 'location__longitude__value']].values
map_h.add_child(plugins.HeatMap(stationArr, radius=1))
map_h

#### Using the github repository https://github.com/remisalmon/gpx-interpolate

In [64]:
gpx_df_x_trip = new_df[['vehicle__identification__otonomo_id', 'location__latitude__value', 'location__longitude__value', 'location__altitude__value', 'metadata__time__epoch']].copy()

#### Converting metadata__time__epoch from timedelta64[ns] to int (seconds)

In [65]:
gpx_df_x_trip["metadata__time__epoch"] = gpx_df_x_trip["metadata__time__epoch"]  / np.timedelta64(1, "s")

In [66]:
# gpx_df_x_trip['vehicle__identification__otonomo_id'].unique()

In [67]:
gpx_df_trip1 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='0c5582d5999245ffbf723b701a006e7d')]
gpx_df_trip2 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='1ab35e14ef1a441e3ac6555684b23f64')]
gpx_df_trip3 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='3efff7a8204061c50aa679e8cc1de319')]
gpx_df_trip4 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='4ecc7e2f0dc761117c3cfb6a037cb647')]
gpx_df_trip5 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='59f1efb109f0e836ac447737cc415f53')]
gpx_df_trip6 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='68b40f92a24e3984885162cbc2ed7f62')]
gpx_df_trip7 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='70003f7559c117379a81d9d7adeb2864')]
gpx_df_trip8 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='784ab0e895ef88bb44a7b6ccd6bb3eba')]
gpx_df_trip9 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='813c3a82aabc24b14580118608a5cb9a')]
gpx_df_trip10 = gpx_df_x_trip[(gpx_df_x_trip['vehicle__identification__otonomo_id']=='929576715d586b095b4d545ecb2ae97e')]

In [68]:
def prepare_data(df):
    
    gpx_data_trip_x = df.to_dict('list')       
    gpx_data_trip_x['tzinfo'] = None

    # rename keys in dictionary

    gpx_data_trip_x['lat'] = gpx_data_trip_x.pop('location__latitude__value')
    gpx_data_trip_x['lon'] = gpx_data_trip_x.pop('location__longitude__value')
    gpx_data_trip_x['ele'] = gpx_data_trip_x.pop('location__altitude__value')
    gpx_data_trip_x['tstamp'] = gpx_data_trip_x.pop('metadata__time__epoch')
    
    return gpx_data_trip_x

In [69]:
gpx_data_trip1 = prepare_data(gpx_df_trip1)

In [70]:
gpx_data_interp_trip1 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip1, res=50.0))

In [71]:
gpx_data_interp_trip1

Unnamed: 0,lat,lon,ele,tstamp,tzinfo
0,47.377201,13.419481,847.000000,1623.734389,
1,47.377472,13.418952,848.121848,1623.734403,
2,47.377668,13.418354,850.276181,1623.734412,
3,47.377877,13.417729,853.403567,1623.734425,
4,47.378112,13.417073,857.594802,1623.734454,
...,...,...,...,...,...
459,47.342099,13.435065,992.202219,1623.751116,
460,47.342402,13.434560,988.945394,1623.751119,
461,47.342761,13.434226,985.851922,1623.751122,
462,47.343173,13.433963,982.869100,1623.751126,


In [72]:
app = Nominatim(user_agent="coordinateconverter")

def get_address_by_location(latitude, longitude, language="en"):
    """This function returns an address as raw from a location
    will repeat until success"""
    # build coordinates string to pass to reverse() function
    coordinates = f"{latitude}, {longitude}"
    # sleep for a second to respect Usage Policy
    time.sleep(1)
    try:
        location = app.reverse(coordinates, language=language)
#         print(location)
        return location.raw
    except:
        return get_address_by_location(latitude, longitude)

In [73]:
# get_address_by_location('48.189534', '16.379368')

In [74]:
location_array_new=[] # list carries all address
for i in range(0,len(gpx_data_interp_trip1)):
    lat = gpx_data_interp_trip1['lat'].values[i]
    lon = gpx_data_interp_trip1['lon'].values[i]
    location_array_new.append( (get_address_by_location(lat, lon)).get('address') )
        

In [75]:
## Save the new data to file

with open('trip1.txt', 'wb') as fp:
    pickle.dump(location_array_new, fp)

In [76]:
## Load the new data from file

with open ('trip1.txt', 'rb') as fp:
    location_array_new = pickle.load(fp)

In [77]:
invalid_indexes = []
for i, item in enumerate(location_array_new):
    try:
        print(item['road'])
    except:
        invalid_indexes.append(i)
    

Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Schlatterbergweg
Römerkellerweg
Michael-Walchhofer-Straße
Römerkellerweg
Michael-Walchhofer-Straße
Feldgasse
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchenseestraße
Zauchen

In [78]:
invalid_indexes

[]

In [79]:
gpx_data_trip2 = prepare_data(gpx_df_trip2)

In [80]:
gpx_data_interp_trip2 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip2, res=25.0))

In [81]:
gpx_data_trip3 = prepare_data(gpx_df_trip3)

In [82]:
gpx_data_interp_trip3 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip3, res=25.0))

In [83]:
gpx_data_trip4 = prepare_data(gpx_df_trip4)

In [84]:
gpx_data_interp_trip4 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip4, res=25.0))

In [85]:
gpx_data_trip5 = prepare_data(gpx_df_trip5)

In [86]:
gpx_data_interp_trip5 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip5, res=25.0))

In [87]:
gpx_data_trip6 = prepare_data(gpx_df_trip6)

In [88]:
gpx_data_interp_trip6 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip6, res=25.0))

In [89]:
gpx_data_trip7 = prepare_data(gpx_df_trip7)

In [90]:
gpx_data_interp_trip7 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip7, res=25.0))

In [91]:
gpx_data_trip8 = prepare_data(gpx_df_trip8)

In [92]:
gpx_data_interp_trip8 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip8, res=25.0))

In [93]:
gpx_data_trip9 = prepare_data(gpx_df_trip9)

In [94]:
gpx_data_interp_trip9 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip9, res=25.0))

In [95]:
gpx_data_trip10 = prepare_data(gpx_df_trip10)

In [96]:
gpx_data_interp_trip10 = pd.DataFrame.from_dict(gpx_interpolate(gpx_data_trip10, res=25.0))

In [97]:
points1 = []   
points2 = []   
points3 = []   
points4 = []   
points5 = []   
points6 = []    
points7 = [] 
points8 = [] 
points9 = [] 
points10 = []

map_x = folium.Map(location= [new_df.location__latitude__value.mean(), new_df.location__longitude__value.mean()],
                           zoom_start=8, control_scale=True)


for index, loc_info in gpx_data_interp_trip1.iloc[::5, :].iterrows():
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='0c5582d5999245ffbf723b701a006e7d', 
                  icon=folium.Icon(color="purple")).add_to(map_x)
    points1.append([loc_info['lat'], loc_info['lon']])
    folium.PolyLine(points1, color="purple", weight=5, opacity=1).add_to(map_x)
    
for index, loc_info in gpx_data_interp_trip2.iloc[::5, :].iterrows():
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='1ab35e14ef1a441e3ac6555684b23f64', 
                icon=folium.Icon(color="blue")).add_to(map_x)
    points2.append([loc_info['lat'], loc_info['lon']])
    folium.PolyLine(points2, color="blue", weight=5, opacity=1).add_to(map_x)
    
    
for index, loc_info in gpx_data_interp_trip3.iloc[::5, :].iterrows():
    
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='3efff7a8204061c50aa679e8cc1de319', 
            icon=folium.Icon(color="red")).add_to(map_x)
    points3.append([loc_info['lat'], loc_info['lon']])
    folium.PolyLine(points3, color="red", weight=5, opacity=1).add_to(map_x) 
    
    
for index, loc_info in gpx_data_interp_trip4.iloc[::5, :].iterrows():
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='4ecc7e2f0dc761117c3cfb6a037cb647', 
                   icon=folium.Icon(color="darkred")).add_to(map_x)
    points4.append([loc_info['lat'], loc_info['lon']])
    folium.PolyLine(points4, color="darkred", weight=5, opacity=1).add_to(map_x) 
    
    
for index, loc_info in gpx_data_interp_trip5.iloc[::5, :].iterrows():
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='59f1efb109f0e836ac447737cc415f53', 
                   icon=folium.Icon(color="black")).add_to(map_x)
    points5.append([loc_info['lat'], loc_info['lon']])
    folium.PolyLine(points5, color="black", weight=5, opacity=1).add_to(map_x)     

    
for index, loc_info in gpx_data_interp_trip6.iloc[::5, :].iterrows():
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='68b40f92a24e3984885162cbc2ed7f62', 
                   icon=folium.Icon(color="gray")).add_to(map_x)
    points6.append([loc_info['lat'], loc_info['lon']])
    folium.PolyLine(points6, color="gray", weight=5, opacity=1).add_to(map_x) 
    
    
for index, loc_info in gpx_data_interp_trip7.iloc[::5, :].iterrows():
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='70003f7559c117379a81d9d7adeb2864', 
                   icon=folium.Icon(color="orange")).add_to(map_x)
    points7.append([loc_info['lat'], loc_info['lon']])
    folium.PolyLine(points7, color="orange", weight=5, opacity=1).add_to(map_x)  
       
        
for index, loc_info in gpx_data_interp_trip8.iloc[::5, :].iterrows():
    points8.append([loc_info['lat'], loc_info['lon']])
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='784ab0e895ef88bb44a7b6ccd6bb3eba', 
                  icon=folium.Icon(color="green")).add_to(map_x)
    folium.PolyLine(points8, color="green", weight=5, opacity=1).add_to(map_x)
    
    
for index, loc_info in gpx_data_interp_trip9.iloc[::5, :].iterrows():
    points9.append([loc_info['lat'], loc_info['lon']])
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='813c3a82aabc24b14580118608a5cb9a', 
                  icon=folium.Icon(color="white")).add_to(map_x)
    folium.PolyLine(points9, color="white", weight=5, opacity=1).add_to(map_x)

        
for index, loc_info in gpx_data_interp_trip10.iloc[::5, :].iterrows():
    points10.append([loc_info['lat'], loc_info['lon']])
    folium.Marker([loc_info['lat'], loc_info['lon']],popup='929576715d586b095b4d545ecb2ae97e', 
                  icon=folium.Icon(color="pink")).add_to(map_x)
    folium.PolyLine(points10, color="pink", weight=5, opacity=1).add_to(map_x)

    
map_x