# Uber Movement Competition

<b>Task Description:</b> The objective of this challenge is to build a machine learning model that accurately predicts when and where the next road incident will occur in Cape Town, South Africa. Data scientists will build their models on historic road incident data as well as traffic pattern data from Uber.

The resulting model will enable South African authorities to anticipate where they will be needed next and to put measures in place that will help ensure the safety of all Cape Town’s residents.

In [1]:
import pandas as pd
import pandas_profiling as pf 
import os
import numpy as np
import seaborn as sns
import matplotlib as plt
%matplotlib inline

ModuleNotFoundError: No module named 'pandas_profiling'

## 1. Train Dataset

### Loading the Train Dataset

In [None]:
train = pd.read_csv('Data/train.csv')

In [None]:
## Number of records and features
train.shape

In [None]:
### Date conversion
train['Occurrence Local Date Time'] = pd.to_datetime(train['Occurrence Local Date Time'])

In [None]:
train.head(3)

## 2. Road Segments Shapefile

### Loading the Shape Files - road_segments

In [None]:
import shapefile
road_segments = shapefile.Reader("Data/road_segments/road_segments.shp")

### Attributes of Dataset

In [None]:
print(road_segments)

### The Fields/Features

In [None]:
road_segments.fields

### Convert the Shapefiles to Pandas Dataframe

In [None]:
fields = [x[0] for x in road_segments.fields][1:]
records = [y[:] for y in road_segments.records()]
shps = [s.points for s in road_segments.shapes()]

road_segments_data = pd.DataFrame(columns = fields, data = records)
road_segments_data.head(3)

In [None]:
## The Shape of the Dataset
road_segments_data.shape

### Column headers in small letters

In [None]:
road_segments_data.columns = map(str.lower, road_segments_data.columns)

In [None]:
#road_segments_data.to_excel("Data/road_segments/road_segments.xlsx",sheet_name ='road_segments',index=False, header=True )

## 3. SANRAL Injuries2016_2019

### Loading the Dataset - SANRAL Injuries2016_2019

In [None]:
Injuries2016_2019 = pd.read_csv('Data/SANRAL_v2/Injuries2016_2019.csv')
Injuries2016_2019.head(3)

In [None]:
## Number of records and features
Injuries2016_2019.shape

In [None]:
Injuries2016_2019.info()

In [None]:
### Date feature conversion
Injuries2016_2019['Created Local Date Time'] = pd.to_datetime(Injuries2016_2019['Created Local Date Time'])

### Renaming the columns
This is to remove the white space

In [None]:
Injuries2016_2019=Injuries2016_2019.rename(columns = {'Network ID':'NetworkID'})
Injuries2016_2019=Injuries2016_2019.rename(columns = {'Event Id':'EventId'})
Injuries2016_2019=Injuries2016_2019.rename(columns = {'Created Local Date Time':'DateTime'})
Injuries2016_2019=Injuries2016_2019.rename(columns = {'No Injuries':'NoInjuries'})
Injuries2016_2019=Injuries2016_2019.rename(columns = {'Injury Type':'InjuryType'})

In [None]:
Injuries2016_2019[Injuries2016_2019.EventId == 99403]

### Reshaping the dataset from "long" to "wide" format

In [None]:
Injuries = Injuries2016_2019.pivot_table(index=['EventId','DateTime']
                                                    , columns='InjuryType'
                                                    , values='NoInjuries'
                                                   )
Injuries[:3]

### Flattening the pivoted dataset

In [None]:
# Flattening the pivoted dataset
Injuries_data = pd.DataFrame(Injuries.to_records())
Injuries_data[:3]

### Replace all NaN values with zero 

In [None]:
Injuries_data.fillna(0, inplace=True)

In [None]:
Injuries_data[Injuries2016_2019.EventId == 99403]

## 4. SANRAL Vehicles2016_2019

### Loading the Dataset - SANRAL Vehicles2016_2019

In [None]:
Vehicles2016_2019 = pd.read_csv('Data/SANRAL_v2/Vehicles2016_2019.csv')
Vehicles2016_2019.head(3)

In [None]:
## Number of records and features
Vehicles2016_2019.shape

In [None]:
### Converting CreatedLOcalDateTime to datetime data type
Vehicles2016_2019['CreatedLOcalDateTime'] = pd.to_datetime(Vehicles2016_2019['CreatedLOcalDateTime'])
Vehicles2016_2019.head(3)

In [None]:
###Dropping Unnamed: 5 columbn
del Vehicles2016_2019['Unnamed: 5']

In [None]:
Vehicles2016_2019.head(2)

### Checking the frequency distribution before re-categorising vehicle type

In [None]:
Vehicles2016_2019.groupby(['VehicleType']).size().reset_index(name='Counts')

### Re-Categorising the vehicle type values 

In [None]:
Vehicles2016_2019.VehicleType.replace(['Taxi', 'Unable to ID','f']
                                    , ['Minibus', 'Other', 'Other'], inplace=True)

In [None]:
### Checking the distibution again
Vehicles2016_2019.groupby(['VehicleType']).size().reset_index(name='Counts')

### 4a. Group data  by  vehicle type

In [None]:
Vehicles_Type_Grouped = Vehicles2016_2019.groupby(['EventID','CreatedLOcalDateTime','VehicleType']).size().reset_index(name='Counts')

In [None]:
Vehicles_Type_Grouped[Vehicles_Type_Grouped.EventID == 131798]

### Reshaping the dataset from "long" to "wide" format

In [None]:
Vehicles_Wide = Vehicles_Type_Grouped.pivot_table(index=['EventID','CreatedLOcalDateTime']
                                                    , columns='VehicleType'
                                                    , values='Counts'    )

In [None]:
Vehicles_Wide[:3]

In [None]:
# Flattening the pivoted dataset
Vehicles_Type_Data = pd.DataFrame(Vehicles_Wide.to_records())

In [None]:
#Replace all NaN values with zero
Vehicles_Type_Data.fillna(0, inplace=True)

In [None]:
Vehicles_Type_Data[Vehicles_Type_Data.EventID == 131798]

### 4a. Group data  by  vehicle colour

In [None]:
Vehicles2016_2019.groupby(['Color']).size().reset_index(name='Counts')

In [None]:
Vehicles_Color_Grouped = Vehicles2016_2019.groupby(['EventID','CreatedLOcalDateTime','Color']).size().reset_index(name='Counts')

In [None]:
Vehicles_Color_Grouped[:3]

In [None]:
Vehicles_Color_Grouped[Vehicles_Color_Grouped.EventID == 131798]

### Reshaping the dataset from "long" to "wide" format

In [None]:
Vehicles_color_Wide = Vehicles_Color_Grouped.pivot_table(index=['EventID','CreatedLOcalDateTime']
                                                    , columns='Color'
                                                    , values='Counts'    )

In [None]:
# Flattening the pivoted dataset
Vehicles_Color_Data = pd.DataFrame(Vehicles_color_Wide.to_records())

In [None]:
#Replace all NaN values with zero
Vehicles_Color_Data.fillna(0, inplace=True)

In [None]:
Vehicles_Color_Data[Vehicles_Color_Data.EventID == 131798]

## 5. Merging the Datasets 

In [None]:
print('Train Dataset = ',len(train))
print('road_segments_data Dataset = ',len(road_segments_data))
print('Injuries2016_2019 Dataset = ',len(Injuries_data))
print('Vehicles_Type_Data Dataset = ',len(Vehicles_Type_Data))
print('Vehicles_Color_Data Dataset = ',len(Vehicles_Color_Data))

### 5a. Merging Train+road_segments_data = Train_Road
Join on common features which is segment_id

In [None]:
# Renaming road_segment_id to segment_id in train dataset
train=train.rename(columns = {'road_segment_id':'segment_id'})

In [None]:
Train_Road = pd.merge(train, road_segments_data, how="left", on="segment_id")
print('Train_Road Dataset = ',len(Train_Road))

In [None]:
Train_Road.shape

In [None]:
Train_Road.head(2)

### 5b. Merging Train_Road+Injuries_data = Train_Road_Injuries
Join on common feature which is Event Id

In [None]:
Train_Road_Injuries = pd.merge(Train_Road, Injuries_data, on=['EventId'], how="left")
len(Train_Road_Injuries)

In [None]:
Train_Road_Injuries.shape

In [None]:
Train_Road_Injuries.head(2)

### 5c. Merging Train_Road_Injuries+Vehicles_Color_Data = Train_Road_Injuries_VType
Join on common feature which is Event Id

In [None]:
# Renaming EventID to EventId
Vehicles_Type_Data=Vehicles_Type_Data.rename(columns = {'EventID':'EventId'})

In [None]:
Train_Road_Injuries_VType = pd.merge(Train_Road_Injuries, Vehicles_Type_Data, on=['EventId'], how="left")
len(Train_Road_Injuries_VType)

In [None]:
Train_Road_Injuries_VType.shape

In [None]:
Train_Road_Injuries_VType.head(2)

### 5d. Merging Train_Road_Injuries_VType+Vehicles_Color_Data = Train_Road_Injuries_VType_VColor
Join on common feature which is Event Id

In [None]:
# Renaming EventID to EventId
Vehicles_Color_Data=Vehicles_Color_Data.rename(columns = {'EventID':'EventId'})

In [None]:
Train_Road_Injuries_VType_VColor = pd.merge(Train_Road_Injuries_VType, Vehicles_Color_Data, on=['EventId'], how="left")
len(Train_Road_Injuries_VType_VColor)

In [None]:
Train_Road_Injuries_VType_VColor.shape

In [None]:
Train_Road_Injuries_VType_VColor.head(2)

In [None]:
Train_Road_Injuries_VType_VColor.info()

### Saving the data as an Excel file

In [None]:
Train_Road_Injuries_VType_VColor.to_excel("Data/Train_Road_Injury_Type_Color.xlsx"
                                          ,sheet_name ='Train_Road_Injury_Type_Color'
                                          ,index=False
                                          ,header=True )

## 6. Adding Date Related Features

<img src="Images/Date_Related_Features.png">

In [None]:
# Short name for the dataset
MergedData = Train_Road_Injuries_VType_VColor

In [None]:
# Date Time
MergedData['DateTime']=MergedData['Occurrence Local Date Time']

### 6i. Datatime_x_segment_id

In [None]:
MergedData['Datatime_x_segment_id'] = MergedData['DateTime'].astype('str')+' x '+MergedData['segment_id']
MergedData[['DateTime','segment_id','Datatime_x_segment_id']].head(2)

### 6ii. Year

In [None]:
MergedData['year'] = MergedData['DateTime'].dt.year
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year']].head(1)

### 6iii. Month

In [None]:
MergedData['month'] = MergedData['DateTime'].dt.month
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month']].head(1)

### 6iv. Week

In [None]:
MergedData['week'] = MergedData['DateTime'].dt.week
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week']].head(1)

### 6v. Day of the month

In [None]:
MergedData['day_of_month'] = MergedData['DateTime'].dt.day
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week','day_of_month']].head(1)

### 6vi. Day of the week

In [None]:
MergedData['day_of_week'] = MergedData['DateTime'].dt.weekday
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week','day_of_month','day_of_week']].head(1)

### 6vii. Day Name

In [None]:
MergedData['DayName'] = MergedData['DateTime'].dt.weekday_name
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week','day_of_month','day_of_week','DayName']].head(1)

### 6viii. Hour

In [None]:
MergedData['hour'] = MergedData['DateTime'].dt.hour
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week'
           ,'day_of_month','day_of_week','DayName','hour']].head(1)

### 6ix. IsWeekend

In [None]:
def IsWeekend(x):
    if x in [4, 5, 6]:
        return 1
    else:
        return 0
    
MergedData['IsWeekend'] = MergedData['day_of_week'].apply(IsWeekend)    
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week'
           ,'day_of_month','day_of_week','DayName','hour','IsWeekend']].head(1)

### 6ix. YearMonthDay

In [None]:
#Convert to string 
MergedData['year'] = MergedData['year'].astype('str')
MergedData['year2Digit'] = MergedData['year'].str[2:]
MergedData['month'] = MergedData['month'].astype('str')
MergedData['day_of_month'] = MergedData['day_of_month'].astype('str')

In [None]:
MergedData[['year','year2Digit']].head(2)

In [None]:
MergedData['YearMonthDay'] = (MergedData['year2Digit']+MergedData['month']+MergedData['day_of_month']).astype('int')
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week'
           ,'day_of_month','day_of_week','DayName','hour','IsWeekend','YearMonthDay']].head(1)

### 6x. IsHoliday

In [None]:
def IsHoliday(x):
    if x in [1611,16321,16325,16328,16427,1652,16616,1683,1689,16924,161216,161224,161225,161226,161227,161228,161229,161230,161231, #2016
             1711,1712,321,17414,17417,17427,1751,17616,1789,17925,171216,171224,171225,171226,171227,171228,171229,171230,171231,  #2017
             1811,18321,18330,1842,18427,1851,18616,1889,18924,181217,181224,181225,181226,181227,181228,181229,181230,181231  #2018
            ]:
        return 1
    else:
        return 0

In [None]:
MergedData['IsHoliday'] = MergedData['YearMonthDay'].apply(IsHoliday)    
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week','day_of_month'
           ,'day_of_week','DayName','hour','IsWeekend','YearMonthDay','IsHoliday']].head(1)

### 6xi. PayDay

In [None]:
def PayDay(x):
    if x >= 25 and x<=31:
        return 1
    else:
        return 0
    
MergedData['PayDay'] = MergedData['day_of_week'].apply(PayDay)    
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week'
           ,'day_of_month','day_of_week','DayName','hour','IsWeekend','PayDay']].head(1)

### 6xi. PayDay Teacher

In [None]:
def PayDayTeacher(x):
    if x == 221:
        return 1
    else:
        return 0
    
MergedData['PayDayTeacher'] = MergedData['day_of_week'].apply(PayDayTeacher)    
MergedData[['DateTime','segment_id','Datatime_x_segment_id','year','month','week'
           ,'day_of_month','day_of_week','DayName','hour','IsWeekend','PayDay','PayDayTeacher']].head(1)

## 7. Geo-Spatial Features

In [None]:
import csv
import codecs
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
geolocator = Nominatim()

In [None]:
GeoData = MergedData[['EventId','latitude', 'longitude']]
GeoData.head(3)

In [None]:
GeoData.info()

In [94]:
#Saving the file
# GeoData.to_csv('Data/geodata/geolocation.csv', index=None)

### Extracting the geo info

In [105]:
location = geolocator.reverse("-33.813054704,18.8162563067")

In [106]:
location.raw

{'place_id': 69315483,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 4315627,
 'lat': '-33.8130572774946',
 'lon': '18.8162570265999',
 'display_name': 'N1, Cape Town Ward 101, Stellenbosch Local Municipality, City of Cape Town, Western Cape, 7570, South Africa',
 'address': {'road': 'N1',
  'suburb': 'Cape Town Ward 101',
  'city': 'Stellenbosch Local Municipality',
  'county': 'City of Cape Town',
  'state': 'Western Cape',
  'postcode': '7570',
  'country': 'South Africa',
  'country_code': 'za'},
 'boundingbox': ['-33.8384734', '-33.798667', '18.7261517', '18.8706849']}

In [102]:
print(location.raw['address']['suburb'])

Montague


In [103]:
import geocoder

In [112]:
geolocation = geocoder.osm([-33.813054704,18.8162563067], method='reverse')
geolocation.geojson

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'properties': {'accuracy': 0.001,
    'address': 'N1, Cape Town Ward 101, Stellenbosch Local Municipality, City of Cape Town, Western Cape, 7570, South Africa',
    'bbox': [18.7261517, -33.8384734, 18.8706849, -33.798667],
    'city': 'Stellenbosch Local Municipality',
    'confidence': 4,
    'country': 'South Africa',
    'country_code': 'za',
    'county': 'City of Cape Town',
    'importance': 0.001,
    'lat': -33.8180056,
    'lng': 18.7985569,
    'ok': True,
    'osm_id': 4315627,
    'osm_type': 'way',
    'place_id': 69315483,
    'place_rank': 26,
    'postal': '7570',
    'quality': 'motorway',
    'raw': {'place_id': 69315483,
     'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
     'osm_type': 'way',
     'osm_id': 4315627,
     'boundingbox': ['-33.8384734', '-33.798667', '18.7261517', '18.8706849'],
     'lat': '-33.8180056',
     'lon': '18.7985569',
     'display_

In [113]:
g = geocoder.osm('Cape Town Ward 101')
g.json

{'accuracy': 0.65,
 'address': 'Cape Town Ward 101, Kraaifontein, City of Cape Town, Western Cape, South Africa',
 'bbox': {'northeast': [-33.8265909, 18.76839],
  'southwest': [-33.8565229, 18.7356129]},
 'confidence': 7,
 'country': 'South Africa',
 'country_code': 'za',
 'county': 'City of Cape Town',
 'icon': 'https://nominatim.openstreetmap.org/images/mapicons/poi_boundary_administrative.p.20.png',
 'importance': 0.65,
 'lat': -33.8416578,
 'lng': 18.7444804254589,
 'ok': True,
 'osm_id': 6580086,
 'osm_type': 'relation',
 'place_id': 199082370,
 'place_rank': 20,
 'quality': 'administrative',
 'raw': {'place_id': 199082370,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'relation',
  'osm_id': 6580086,
  'boundingbox': ['-33.8565229', '-33.8265909', '18.7356129', '18.76839'],
  'lat': '-33.8416578',
  'lon': '18.7444804254589',
  'display_name': 'Cape Town Ward 101, Kraaifontein, City of Cape Town, Western Cape, South Africa',

In [110]:
with open('Data/geodata/geolocation.csv', 'rb') as f:
    reader = csv.DictReader(codecs.iterdecode(f, 'utf-8'))
    for row in reader:
        location = geolocator.reverse(", ".join([row['latitude'], row['longitude']]))
        print(location.raw['address']['suburb'])

Montague
Cape Town Ward 84
Welcome
Cape Town Ward 8
Cape Town Ward 15
Tyger Valley
Cape Town Ward 57
Cape Town Ward 18
Cape Town Ward 18
Mowbray
Cape Town Ward 57
Cape Town Ward 19
Cape Town Ward 15
Maitland
Maitland Garden Village
Cape Town Ward 15
Mowbray
Cape Town Ward 3
Milnerton
Drakenstein Ward 15
Drakenstein Ward 15
Cape Town Ward 36
Cape Town Ward 96
Stellenbosch Ward 18
Cape Town Ward 96
Cape Town Ward 84
Cape Town Ward 84
Maitland
Cape Town Ward 101
Cape Town Ward 12
Cape Town Ward 44
Cape Town Ward 12
Cape Town Ward 18
Mowbray
Cape Town Ward 36
Cape Town Ward 55
Cape Town Ward 36
Cape Town Ward 15
Cape Town Ward 15
Cape Town Ward 84
Cape Town Ward 21
Mowbray
Cape Town Ward 15
Cape Town Ward 109
Stellenbosch Ward 18
Northgate
Cape Town Ward 84
Cape Town Ward 84
Cape Town Ward 12
Cape Town Ward 84
Plattekloof
Cape Town Ward 2
Cape Town Ward 115
Cape Town Ward 89
Drakenstein Ward 15
Cape Town Ward 2
Cape Town Ward 102
Cape Town Ward 19
Cape Town Ward 40
Cape Town Ward 12
Observ

GeocoderTimedOut: Service timed out

In [146]:
GeoFile = open('Data/geodata/geolocation.csv', 'rb')
reader = csv.reader(codecs.iterdecode(GeoFile, 'utf-8'))
for row in reader:
    location = geolocator.reverse(", ".join([row['latitude'], row['longitude']]), timeout=10)
    #print(location.raw['address'])

TypeError: list indices must be integers or slices, not str