# Master Document - Uber Challenge Team 3

## Statement of Purpose
The purpose of this document is to take the inital data frame created by the "starter notebook" and to add additional features that the members of our team have engineered to increase the predictive accuracy of our model. 

### Step 1: Starter Notebook

The first chunk of code is just formatting code borrowed from the starter notebook to set up our data frame in the correct format for submission - nothing else. 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#We'll generate this later, so don't download it if you're in a hurry
sample_sub = pd.read_csv('SampleSubmission.csv');sample_sub.head()

# Make sure train is in the folder where this notebook is running
data = pd.read_csv('train.csv', 
                   parse_dates = ['Occurrence Local Date Time'])
data = data.drop("Status", axis=1)
data["longitude"] = pd.to_numeric(data.longitude, errors='coerce')

# Train on 2017
train = data.loc[data['Occurrence Local Date Time'] < '2018-01-01']
train = train.loc[data['Occurrence Local Date Time'] >= '2017-01-01']

# Test locally on the last part of 2018
local_test = data.loc[data['Occurrence Local Date Time'] < '2019-01-01']
local_test = local_test.loc[local_test['Occurrence Local Date Time'] >= '2018-09-01']

# Create a dataframe with a column for each segment_id (sid)
# Each row represents an hour.
sids = data['road_segment_id'].unique()

dts = pd.date_range('2017-01-01',
                    '2018-01-01',
                    freq="1h")
tr = pd.DataFrame({'datetime':dts})

for sid in sids:
    tr[str(sid)] = 0
    events = train.loc[train['road_segment_id'] == sid]
    dts = events['Occurrence Local Date Time'].dt.round('H')
    dates = dts.astype(str).unique()
    tr.loc[tr['datetime'].isin(dates), sid] = 1


# Reshape this as in sample submission
# I add some extra columns that may be useful
train = pd.DataFrame({
    'datetime x segment_id':np.concatenate([[str(x) + " x " + str(c) 
                                             for c in sids] 
                                            for x in tr['datetime']]),
    'datetime':np.concatenate([[str(x) for c in sids] for x in tr['datetime']]),
    'segment_id':np.concatenate([[str(c) for c in sids] for x in tr['datetime']]),
    'y':tr[sids].values.flatten()
})

# Same for local test (test from now on)
dts = pd.date_range('2018-09-01','2018-12-31',
                    freq="1h")
tr = pd.DataFrame({'datetime':dts})

for sid in sids:
    tr[str(sid)] = 0
    events = local_test.loc[local_test['road_segment_id'] == sid]
    dts = events['Occurrence Local Date Time'].dt.round('H')
    dates = dts.astype(str).unique()
    tr.loc[tr['datetime'].isin(dates), sid] = 1
    
test = pd.DataFrame({
    'datetime x segment_id':np.concatenate([[str(x) + " x " + str(c) 
                                             for c in sids] 
                                            for x in tr['datetime']]),
    'datetime':np.concatenate([[str(x) for c in sids] for x in tr['datetime']]),
    'segment_id':np.concatenate([[str(c) for c in sids] for x in tr['datetime']]),
    'y':tr[sids].values.flatten()
})

train['datetime'] = pd.to_datetime(train['datetime'])
train['day'] = train['datetime'].dt.day_name()
train['min'] = train['datetime'].dt.hour*60+train['datetime'].dt.minute
locations = data.groupby('road_segment_id').mean()[['longitude', 'latitude']]
locations.head(2)
train = pd.merge(train, locations, left_on='segment_id', right_on='road_segment_id')
test['datetime'] = pd.to_datetime(test['datetime'])
test['day'] = train['datetime'].dt.day_name()
test['min'] = train['datetime'].dt.hour*60+train['datetime'].dt.minute
locations = data.groupby('road_segment_id').mean()[['longitude', 'latitude']]
locations.head(2)
test = pd.merge(test, locations, left_on='segment_id', right_on='road_segment_id')

### Step 2: Feature Engineering

Now that our data is in an acceptable format for submission, the next step is to append additional predictors to our data frame to hopefully improve model accuracy.

#### i. Time Data

We will be adding the day of the week that the incident occurred on, as well as the period of the day (e.g. morning, afternoon, night). 

In [2]:
# first examine data frame
train.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y,day,min,longitude,latitude
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01 00:00:00,S0B3CGQ,0,Sunday,0,18.541422,-33.888613
1,2017-01-01 01:00:00 x S0B3CGQ,2017-01-01 01:00:00,S0B3CGQ,0,Sunday,60,18.541422,-33.888613
2,2017-01-01 02:00:00 x S0B3CGQ,2017-01-01 02:00:00,S0B3CGQ,0,Sunday,120,18.541422,-33.888613
3,2017-01-01 03:00:00 x S0B3CGQ,2017-01-01 03:00:00,S0B3CGQ,0,Sunday,180,18.541422,-33.888613
4,2017-01-01 04:00:00 x S0B3CGQ,2017-01-01 04:00:00,S0B3CGQ,0,Sunday,240,18.541422,-33.888613


In [3]:
test.head()

Unnamed: 0,datetime x segment_id,datetime,segment_id,y,day,min,longitude,latitude
0,2018-09-01 00:00:00 x S0B3CGQ,2018-09-01 00:00:00,S0B3CGQ,0,Sunday,0,18.541422,-33.888613
1,2018-09-01 01:00:00 x S0B3CGQ,2018-09-01 01:00:00,S0B3CGQ,0,Monday,1260,18.541422,-33.888613
2,2018-09-01 02:00:00 x S0B3CGQ,2018-09-01 02:00:00,S0B3CGQ,0,Wednesday,1080,18.541422,-33.888613
3,2018-09-01 03:00:00 x S0B3CGQ,2018-09-01 03:00:00,S0B3CGQ,0,Friday,900,18.541422,-33.888613
4,2018-09-01 04:00:00 x S0B3CGQ,2018-09-01 04:00:00,S0B3CGQ,0,Sunday,720,18.541422,-33.888613


In [4]:
# turn variables into usable output

#datetime
from datetime import datetime
train["datetime"] = pd.to_datetime(train['datetime'])
train['period'] = (train['datetime'].dt.hour % 24 + 4) // 4
train['period'].replace({1: 'Late Night',
                      2: 'Early Morning',
                      3: 'Morning',
                      4: 'Noon',
                      5: 'Evening',
                      6: 'Night'}, inplace=True)
train["period"] = train["period"].astype('category')
# longitude                      object
df = train[train.longitude != "Closed"] # there are 5 rows which have 'closed' as the value instead of longitude, in my opinion this is nowhere near enough data lost to warrant imputation or similar
train["longitude"] = train["longitude"].astype(float)
# latitude                      float64
    # fine
# road_segment_id                object
train["segment_id"] = train["segment_id"].astype('category')
train.head()

  res_values = method(rvalues)


Unnamed: 0,datetime x segment_id,datetime,segment_id,y,day,min,longitude,latitude,period
0,2017-01-01 00:00:00 x S0B3CGQ,2017-01-01 00:00:00,S0B3CGQ,0,Sunday,0,18.541422,-33.888613,Late Night
1,2017-01-01 01:00:00 x S0B3CGQ,2017-01-01 01:00:00,S0B3CGQ,0,Sunday,60,18.541422,-33.888613,Late Night
2,2017-01-01 02:00:00 x S0B3CGQ,2017-01-01 02:00:00,S0B3CGQ,0,Sunday,120,18.541422,-33.888613,Late Night
3,2017-01-01 03:00:00 x S0B3CGQ,2017-01-01 03:00:00,S0B3CGQ,0,Sunday,180,18.541422,-33.888613,Late Night
4,2017-01-01 04:00:00 x S0B3CGQ,2017-01-01 04:00:00,S0B3CGQ,0,Sunday,240,18.541422,-33.888613,Early Morning


In [5]:
test["datetime"] = pd.to_datetime(test['datetime'])
test['period'] = (test['datetime'].dt.hour % 24 + 4) // 4
test['period'].replace({1: 'Late Night',
                      2: 'Early Morning',
                      3: 'Morning',
                      4: 'Noon',
                      5: 'Evening',
                      6: 'Night'}, inplace=True)
test["period"] = test["period"].astype('category')
# longitude                      object
df = test[test.longitude != "Closed"] # there are 5 rows which have 'closed' as the value instead of longitude, in my opinion this is nowhere near enough data lost to warrant imputation or similar
test["longitude"] = test["longitude"].astype(float)
# latitude                      float64
    # fine
# road_segment_id                object
test["segment_id"] = test["segment_id"].astype('category')
test.head()

  res_values = method(rvalues)


Unnamed: 0,datetime x segment_id,datetime,segment_id,y,day,min,longitude,latitude,period
0,2018-09-01 00:00:00 x S0B3CGQ,2018-09-01 00:00:00,S0B3CGQ,0,Sunday,0,18.541422,-33.888613,Late Night
1,2018-09-01 01:00:00 x S0B3CGQ,2018-09-01 01:00:00,S0B3CGQ,0,Monday,1260,18.541422,-33.888613,Late Night
2,2018-09-01 02:00:00 x S0B3CGQ,2018-09-01 02:00:00,S0B3CGQ,0,Wednesday,1080,18.541422,-33.888613,Late Night
3,2018-09-01 03:00:00 x S0B3CGQ,2018-09-01 03:00:00,S0B3CGQ,0,Friday,900,18.541422,-33.888613,Late Night
4,2018-09-01 04:00:00 x S0B3CGQ,2018-09-01 04:00:00,S0B3CGQ,0,Sunday,720,18.541422,-33.888613,Early Morning


#### Step 2: Add weather predictors to data frame

In [6]:
weather = pd.read_csv("weather_new.csv", parse_dates = ['Local time in Cape Town / Molteno Reservoir'])
weather = weather.sort_values(by =['Local time in Cape Town / Molteno Reservoir'])
weather.rename(columns = {'Local time in Cape Town / Molteno Reservoir':'datetime'}, inplace = True)
weather.rename(columns = {'air temperature':'airtemp'}, inplace = True)
weather.rename(columns = {'stmospheric pressure':'atmospres'}, inplace = True)
weather.rename(columns = {'relative humidity':'relhumid'}, inplace = True)
weather.rename(columns = {'wind direction':'windirect'}, inplace = True)
weather.rename(columns = {'maximum gust value':'maxgust'}, inplace = True)
weather.rename(columns = {'maximum air temperature':'maxair'}, inplace = True)
weather.rename(columns = {'minimum air temperature':'minair'}, inplace = True)
weather = weather.drop(["atmospres", 'Pa', 'Td', "airtemp", "relhumid", "maxair", "minair"], axis = 1)
weather.head()

Unnamed: 0,datetime,windirect,maxgust
11084,2016-01-08 02:00:00,Wind blowing from the north-west,6.0
11083,2016-01-08 05:00:00,Wind blowing from the east,3.0
11082,2016-01-08 11:00:00,Wind blowing from the south,2.0
11081,2016-01-08 14:00:00,Wind blowing from the north-northwest,4.0
11080,2016-01-08 17:00:00,"Calm, no wind",1.0


In [7]:
# change wind to boolean
a = weather.windirect.value_counts()
print (a)

Calm, no wind                            1534
Wind blowing from the west-southwest     1212
Wind blowing from the west               1149
Wind blowing from the south-east         1025
Wind blowing from the south-southeast     921
Wind blowing from the south               673
Wind blowing from the west-northwest      663
Wind blowing from the north-northeast     572
Wind blowing from the south-west          515
Wind blowing from the north-west          477
Wind blowing from the east-southeast      454
Wind blowing from the north               422
Wind blowing from the north-east          380
Wind blowing from the south-southwest     357
Wind blowing from the east                337
Wind blowing from the north-northwest     293
Wind blowing from the east-northeast      244
Name: windirect, dtype: int64


In [8]:
#get first value of index
vals = a[:1].index
print (vals)

Index(['Calm, no wind'], dtype='object')


In [9]:
weather['windirectnew'] = np.where(weather.windirect.isin(vals), 0,1)
weather = weather.drop(["windirect"], axis = 1)
print (weather)

                 datetime  maxgust  windirectnew
11084 2016-01-08 02:00:00      6.0             1
11083 2016-01-08 05:00:00      3.0             1
11082 2016-01-08 11:00:00      2.0             1
11081 2016-01-08 14:00:00      4.0             1
11080 2016-01-08 17:00:00      1.0             0
...                   ...      ...           ...
80    2020-12-07 11:00:00      4.0             1
79    2020-12-07 14:00:00      6.0             1
78    2020-12-07 17:00:00     11.0             1
77    2020-12-07 20:00:00      8.0             1
76    2020-12-07 23:00:00     11.0             1

[11228 rows x 3 columns]


In [10]:
# combine the weather and test and train data frames
weather = weather.fillna(1.0)
weather["datetime"] = pd.to_datetime(weather['datetime'])
train.sort_values('datetime', inplace=True)
test.sort_values('datetime', inplace=True)
weather.sort_values('datetime', inplace=True)

# create weather train and weather test splits
# Train on 2017
weathertrain = weather.loc[weather['datetime'] < '2018-01-01']
weathertrain = weathertrain.loc[weather['datetime'] >= '2017-01-01']
weathertest = weather.loc[weather['datetime'] < '2019-01-01']
weathertest = weathertest.loc[weather['datetime'] >= '2018-09-01']
weathertest.tail()

Unnamed: 0,datetime,maxgust,windirectnew
4369,2018-12-31 11:00:00,6.0,1
4368,2018-12-31 14:00:00,5.0,1
4367,2018-12-31 17:00:00,7.0,1
4366,2018-12-31 20:00:00,5.0,1
4365,2018-12-31 23:00:00,1.0,0


In [11]:
# impute average gust value via resampling (easiest way to add weather data, more complex ways could capture nuance of data better)
# impute average wind gust for day over data 
weathertrain = weathertrain.resample("1D", on = "datetime").mean()
weathertest = weathertest.resample("1D", on = "datetime").mean()

In [12]:
weathertrain = weathertrain.resample('H').pad()
weathertest = weathertest.resample('H').pad()

In [13]:
train1 = pd.merge_asof(weathertrain, train, on="datetime") 
#windirect new = percentage of time during day that there was wind
test1 = pd.merge_asof(weathertest, test, on="datetime")

In [14]:
train1.head()

Unnamed: 0,datetime,maxgust,windirectnew,datetime x segment_id,segment_id,y,day,min,longitude,latitude,period
0,2017-01-01 00:00:00,3.75,0.75,2017-01-01 00:00:00 x IQEYE3S,IQEYE3S,0,Sunday,0,18.677575,-33.897557,Late Night
1,2017-01-01 01:00:00,3.75,0.75,2017-01-01 01:00:00 x PRRLPFL,PRRLPFL,0,Sunday,60,18.446153,-33.933016,Late Night
2,2017-01-01 02:00:00,3.75,0.75,2017-01-01 02:00:00 x J8FAZRB,J8FAZRB,0,Sunday,120,18.94755,-33.782653,Late Night
3,2017-01-01 03:00:00,3.75,0.75,2017-01-01 03:00:00 x 8YJIRQ2,8YJIRQ2,0,Sunday,180,18.803257,-33.816684,Late Night
4,2017-01-01 04:00:00,3.75,0.75,2017-01-01 04:00:00 x C0RCCDP,C0RCCDP,0,Sunday,240,18.856803,-33.801712,Early Morning


In [15]:
test1.head()

Unnamed: 0,datetime,maxgust,windirectnew,datetime x segment_id,segment_id,y,day,min,longitude,latitude,period
0,2018-09-01 00:00:00,9.625,1.0,2018-09-01 00:00:00 x PV3AVCT,PV3AVCT,0,Wednesday,900,18.937523,-33.78542,Late Night
1,2018-09-01 01:00:00,9.625,1.0,2018-09-01 01:00:00 x GLFV0XV,GLFV0XV,0,Monday,300,18.976542,-33.761882,Late Night
2,2018-09-01 02:00:00,9.625,1.0,2018-09-01 02:00:00 x NCMIGB7,NCMIGB7,0,Saturday,840,18.614003,-34.016828,Late Night
3,2018-09-01 03:00:00,9.625,1.0,2018-09-01 03:00:00 x DZABHQW,DZABHQW,0,Friday,1320,18.635994,-34.002236,Late Night
4,2018-09-01 04:00:00,9.625,1.0,2018-09-01 04:00:00 x X9J910I,X9J910I,0,Monday,1080,18.929649,-34.144853,Early Morning


Modelling. Approach 1.

Use the model from the StarterNotebook - CatBoost

In [16]:
train1.dtypes

datetime                 datetime64[ns]
maxgust                         float64
windirectnew                    float64
datetime x segment_id            object
segment_id                     category
y                                 int64
day                              object
min                               int64
longitude                       float64
latitude                        float64
period                         category
dtype: object

In [17]:
from catboost import CatBoostClassifier

In [18]:
CatBoostClassifier?

In [19]:
model = CatBoostClassifier(iterations=100, 
                           loss_function='Logloss',
                           random_seed = 42,
                           verbose=False) 

x_cols = ['maxgust', 'windirectnew', 'segment_id', 'day', 'min', 'longitude', 'latitude', 'period']
cat_cols = ['segment_id', 'day', 'period']

In [20]:
model.fit(train1[x_cols], train1['y'], cat_features=cat_cols)

<catboost.core.CatBoostClassifier at 0x7fde0f3ac190>

In [21]:
from sklearn.metrics import log_loss
log_loss(train1['y'], model.predict_proba(train1[x_cols])[:, 1])

0.016282812396145496

In [22]:
# What about just predicting 0s?
log_loss(train1['y'], [0 for y in train1['y']])

0.10278221200271104

In [23]:
log_loss(test1['y'], model.predict_proba(test1[x_cols])[:, 1])

# not too bad - only slightly worse than for train

0.02836824004287106

F1 metric

In [24]:
# First, just using .predict
from sklearn.metrics import f1_score
f1_score(test1['y'], model.predict(test1[x_cols]))

0.0

In [25]:
# Let's predict 1 even if the prob is just > 0.005
test1['pred'] = model.predict_proba(test1[x_cols])[:,1]
test1['gt005'] = (test1['pred']>0.005).astype(int)
test1.head()

Unnamed: 0,datetime,maxgust,windirectnew,datetime x segment_id,segment_id,y,day,min,longitude,latitude,period,pred,gt005
0,2018-09-01 00:00:00,9.625,1.0,2018-09-01 00:00:00 x PV3AVCT,PV3AVCT,0,Wednesday,900,18.937523,-33.78542,Late Night,0.000814,0
1,2018-09-01 01:00:00,9.625,1.0,2018-09-01 01:00:00 x GLFV0XV,GLFV0XV,0,Monday,300,18.976542,-33.761882,Late Night,0.000965,0
2,2018-09-01 02:00:00,9.625,1.0,2018-09-01 02:00:00 x NCMIGB7,NCMIGB7,0,Saturday,840,18.614003,-34.016828,Late Night,0.002554,0
3,2018-09-01 03:00:00,9.625,1.0,2018-09-01 03:00:00 x DZABHQW,DZABHQW,0,Friday,1320,18.635994,-34.002236,Late Night,0.003208,0
4,2018-09-01 04:00:00,9.625,1.0,2018-09-01 04:00:00 x X9J910I,X9J910I,0,Monday,1080,18.929649,-34.144853,Early Morning,0.001001,0


In [26]:
f1_score(test1['y'], test1['gt005'])

# Interesting... the value seems to be extremely low, which is not great. We want f1 score to be as close to 1 as possible.

0.01176470588235294

In [27]:
import numpy as np
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [33]:
model_lgb = lgb.LGBMClassifier(learning_rate = 0.003,
                               boosting_type = 'gbdt',
                               objective = 'binary',
                               metric = 'binary_logloss',
                               sub_feature = 0.5,
                               num_leaves = 15,
                               min_data = 50,
                               max_depth = -1)

x_cols = ['maxgust', 'windirectnew', 'segment_id', 'min', 'longitude', 'latitude', 'period']
cat_cols = ['segment_id', 'day', 'period']

In [34]:
model_lgb.fit(train1[x_cols], train1['y'])

LGBMClassifier(learning_rate=0.003, metric='binary_logloss', min_data=50,
               num_leaves=15, objective='binary', sub_feature=0.5)

In [None]:
#that does not look right

In [37]:
y_pred=model_lgb.predict(test1[x_cols])

for i in range(0,1):
    if y_pred[i]>=.005:       # setting threshold to .5
       y_pred[i]=1
    else:  
       y_pred[i]=0

In [39]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test1['y'], y_pred)
#Accuracy
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred, test1['y'])

In [40]:
cm

array([[2892,    0],
       [  13,    0]])

In [41]:
accuracy

#are you sure about that

0.9955249569707401

In [None]:
import imblearn