In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
#Importing Libraries
!pip3 install graphviz
!pip3 install dask
!pip install "dask[complete]" 
!pip3 install toolz
!pip3 install cloudpickle
!pip install scikit-learn -U
# https://www.youtube.com/watch?v=ieW3G7ZzRZ0
# https://github.com/dask/dask-tutorial
# please do go through this python notebook: https://github.com/dask/dask-tutorial/blob/master/07_dataframe.ipynb
import dask.dataframe as dd#similar to pandas

import pandas as pd#pandas to create small dataframes 

# if this doesn't work, refer to install_folium.JPG in drive
import folium #open street map

# unix time: https://www.unixtimestamp.com/
import datetime #Convert to unix time

import time #Convert to unix time

# if numpy is not installed already : pip3 install numpy
import numpy as np#Do aritmetic operations on arrays

# matplotlib: used to plot graphs
import matplotlib
# matplotlib.use('nbagg'): selects the notebook backend, which makes plots more interactive (e.g. zoom in and zoom out)
matplotlib.use('nbagg')
import matplotlib.pylab as plt
import seaborn as sns#Plots
from matplotlib import rcParams#Size of plots  
import json

# this lib is used to calculate the straight-line distance between two (lat,lon) pairs in miles
!pip install gpxpy
import gpxpy.geo #Get the haversine distance

from sklearn.cluster import MiniBatchKMeans, KMeans#Clustering
import math
import pickle
import os
from IPython.display import display

# MinGW is only needed for xgboost on Windows.
# Download it from https://mingw-w64.org/doku.php/download/mingw-builds,
# install it and set mingw_path to the installed `bin` directory.
# Raw string: in a normal literal the "\b" of "\bin" is a backspace escape,
# which silently corrupts the path.
mingw_path = r'C:\Program Files (x86)\mingw-w64\i686-8.1.0-posix-dwarf-rt_v6-rev0\mingw32\bin'
# Only extend PATH on Windows; elsewhere (e.g. Colab) the entry is meaningless
# and ';' is not the PATH separator.
if os.name == 'nt':
  os.environ['PATH'] = mingw_path + os.pathsep + os.environ['PATH']

# to install xgboost: pip3 install xgboost
# if it didnt happen check install_xgboost.JPG
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")


## Clustering

Here we do the clustering and save the resulting data as a .npy file

In [None]:
def unix2datetime(unx):
  """Convert a Unix timestamp to a naive datetime expressed in UTC.

  `unx` is truncated with `int()` first, so fractional seconds are dropped.
  """
  whole_seconds = int(unx)
  return datetime.datetime.utcfromtimestamp(whole_seconds)

def datetime2unix(dt):
  """Convert a date/datetime to a Unix timestamp (float seconds).

  The value is interpreted in the machine's local time zone, because
  `time.mktime` works on local time.
  """
  local_struct = dt.timetuple()
  return time.mktime(local_struct)

def make_clusters(df, n_clusters):
  """Cluster pickup locations with MiniBatchKMeans and label every row.

  The model is fitted on (pickup_latitude, pickup_longitude) pairs, in that
  column order. `df` is modified in place: a `pickup_cluster` column holding
  the predicted cluster index is added (or overwritten).

  Returns the fitted MiniBatchKMeans model.
  """
  pickup_points = df[['pickup_latitude', 'pickup_longitude']].values
  model = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10000, random_state=0)
  model.fit(pickup_points)
  df['pickup_cluster'] = model.predict(pickup_points)
  return model


def make_bins(df, bin_mins):
  """Assign every pickup to a fixed-width time bin (in place).

  Bin 0 starts at midnight on the first day of the month that contains the
  earliest value in `df['pickup_times']` (Unix seconds); each bin is
  `bin_mins` minutes wide. The result is stored in `df['pickup_bins']`.

  Args:
    df: DataFrame with a numeric `pickup_times` column.
    bin_mins: bin width in minutes.
  """
  min_unix = df['pickup_times'].min()
  min_dt = unix2datetime(min_unix)
  start_dt = datetime.date(min_dt.year, min_dt.month, 1)
  # NOTE(review): unix2datetime works in UTC while datetime2unix uses local
  # time; the two agree only when the kernel runs in UTC -- confirm.
  start_unix = datetime2unix(start_dt)

  period = bin_mins * 60
  # Vectorised floor division instead of a per-row Python loop.
  df['pickup_bins'] = ((df['pickup_times'] - start_unix) // period).astype(int)

In [None]:
def visualize_cluster_centers_map(centers):
  """Display a folium map with one marker per cluster centre.

  Args:
    centers: sequence of (first, second) coordinate pairs; each pair is
      passed to folium.Marker as [first, second], i.e. it should be
      (latitude, longitude).
  """
  map_osm = folium.Map(location=[40.734695, -73.990372], tiles='Stamen Toner')
  # Use the argument that was passed in rather than the notebook-level
  # `cluster_centers` variable, so the function works for any set of centres.
  for center in centers:
    first, second = center[0], center[1]
    folium.Marker([first, second], popup=(str(first) + str(second))).add_to(map_osm)
  display(map_osm)

def plot_clusters(df):
  """Scatter-plot the first 100000 pickups, coloured by cluster index.

  Reads the `pickup_longitude`, `pickup_latitude` and `pickup_cluster`
  columns of `df`; the axes are clipped to a fixed longitude/latitude window.
  """
  head = slice(None, 100000)
  lon_window = (-74.03, -73.75)
  lat_window = (40.63, 40.85)

  fig, ax = plt.subplots(figsize=(15,15), ncols=1, nrows=1)
  ax.scatter(
      df.pickup_longitude.values[head],
      df.pickup_latitude.values[head],
      s=10,
      lw=0,
      c=df.pickup_cluster.values[head],
      cmap='tab20',
      alpha=0.2,
  )
  ax.set_xlim(lon_window)
  ax.set_ylim(lat_window)
  ax.set_xlabel('Longitude')
  ax.set_ylabel('Latitude')
  plt.show()

In [None]:
def df2numpy(df, num_clusters):
  """Count pickups per (time bin, cluster).

  Args:
    df: DataFrame with integer `pickup_bins` and `pickup_cluster` columns.
    num_clusters: number of clusters (width of the result).

  Returns:
    Float array of shape (max bin + 1, num_clusters) where entry [b, c] is
    the number of rows with pickup_bins == b and pickup_cluster == c.
  """
  num_bins = df['pickup_bins'].max() + 1
  data = np.zeros((num_bins, num_clusters))
  bins = df['pickup_bins'].to_numpy()
  clusters = df['pickup_cluster'].to_numpy()
  # Rows with a negative bin were never counted by the per-bin scan this
  # replaces; mask them so they cannot wrap around to the last bins.
  counted = bins >= 0
  # One pass over the rows; np.add.at accumulates repeated index pairs.
  np.add.at(data, (bins[counted], clusters[counted]), 1)
  return data

def get_name(year, month, num_clusters, bin_mins):
  """Build the file name "<year>-<MM>_<num_clusters>_<bin_mins>.npy".

  The month is zero-padded to two digits (2 -> "02", 10 -> "10").
  """
  # `{:02d}` pads only single-digit months; the earlier `month <= 10` check
  # turned October into "010".
  return "{}-{:02d}_{}_{}.npy".format(year, int(month), num_clusters, bin_mins)

def save_data(data, year, month, num_clusters, bin_mins, save_dir="/content/gdrive/MyDrive/New_York_Data/Clustered/"):
  """Write `data` to `<save_dir>/<get_name(...)>` with np.save.

  An existing file is never overwritten: a message is printed and the write
  is skipped. Returns the target path in both cases.
  """
  save_path = os.path.join(save_dir, get_name(year, month, num_clusters, bin_mins))
  if not os.path.exists(save_path):
    with open(save_path, "wb") as f:
      np.save(f, data)
  else:
    print("{} already exists, skipping...".format(save_path))
  return save_path

In [None]:
def get_cluster_name(year, month, num_clusters, suffix=''):
  """Build "cluster_centers_<year>-<MM><suffix>_<num_clusters>.npy".

  The month is zero-padded to two digits (2 -> "02", 10 -> "10").
  """
  # `{:02d}` pads only single-digit months; the earlier `month <= 10` check
  # turned October into "010".
  return "cluster_centers_{}-{:02d}{}_{}.npy".format(year, int(month), suffix, num_clusters)

def save_cluster_centers(data, year, month, num_clusters, \
                         save_dir="/content/gdrive/MyDrive/New_York_Data/Clustered/"):
  """Write the cluster-centre array to `<save_dir>/<get_cluster_name(...)>`.

  An existing file is left untouched (a message is printed instead).
  Returns the target path in both cases.
  """
  path = os.path.join(save_dir, get_cluster_name(year, month, num_clusters))
  if os.path.exists(path):
    print("{} already exists, skipping...".format(path))
    return path
  with open(path, 'wb') as f:
    np.save(f, data, allow_pickle=True)
  return path

def get_cluster_grid_name(year, month, grid_resolution, suffix=''):
  """Build "cluster_grid_<year>-<MM><suffix>_<grid_resolution>.npy".

  The month is zero-padded to two digits (2 -> "02", 10 -> "10").
  """
  # `{:02d}` pads only single-digit months; the earlier `month <= 10` check
  # turned October into "010".
  return "cluster_grid_{}-{:02d}{}_{}.npy".format(year, int(month), suffix, grid_resolution)

def save_cluster_grid(
    kmns,
    grid_resolution=200,
    grid_corners=((40.5774, -74.15), (40.9176,-73.7004)),
    save_dir="/content/gdrive/MyDrive/New_York_Data/Clustered/",
    year=None,
    month=None,
    ):
  """Predict the cluster of every point on a regular lat/lon grid and save it.

  Args:
    kmns: fitted clustering model exposing `predict` on (lat, lon) rows.
    grid_resolution: number of grid points along each axis.
    grid_corners: ((lat_min, lon_min), (lat_max, lon_max)) of the grid.
    save_dir: directory the .npy file is written to.
    year, month: used only to build the file name. When omitted, the
      notebook-level variables `year` / `month` are used, which is what
      this function relied on before these parameters existed.

  Returns:
    Path of the grid file. An existing file is not overwritten.

  Raises:
    ValueError: if year/month are neither passed nor defined globally.
  """
  if year is None or month is None:
    notebook_globals = globals()
    try:
      year = notebook_globals['year'] if year is None else year
      month = notebook_globals['month'] if month is None else month
    except KeyError as missing:
      raise ValueError(
          "save_cluster_grid needs `year` and `month` (pass them explicitly)"
      ) from missing

  latitudes = np.linspace(grid_corners[0][0], grid_corners[1][0], grid_resolution)
  longitudes = np.linspace(grid_corners[0][1], grid_corners[1][1], grid_resolution)

  X, Y = np.meshgrid(latitudes, longitudes)
  predicted = kmns.predict(np.vstack([X.flatten(), Y.flatten()]).T)
  # Last axis holds (cluster index, latitude, longitude) for each grid point.
  grid = np.stack((predicted.reshape((grid_resolution, grid_resolution)), X, Y), axis=2)
  # NOTE(review): the file name does not encode the number of clusters, so a
  # grid computed for a different cluster count is reused if it already
  # exists -- confirm whether that is intended.
  name = get_cluster_grid_name(year, month, grid_resolution)
  path = os.path.join(save_dir, name)
  if os.path.exists(path):
    print("{} already exists, skipping...".format(path))
  else:
    with open(path, 'wb') as f:
      np.save(f, grid, allow_pickle=True)
  return path


def load_cleaned_data(year, month, load_dir="/content/gdrive/MyDrive/New_York_Data/clean/"):
  """Read `clean_yellow_tripdata_<year>-<MM>.csv` from `load_dir`.

  Raises:
    FileNotFoundError: if the CSV does not exist.
  """
  # `{:02d}` pads only single-digit months; the earlier `month <= 10` check
  # turned October into "010".
  name = "clean_yellow_tripdata_{}-{:02d}.csv".format(year, int(month))
  path = os.path.join(load_dir, name)
  if not os.path.exists(path):
    raise FileNotFoundError("{} does not exist...".format(path))
  cleaned_data = pd.read_csv(path)
  return cleaned_data

def preprocess2npy(df, year, month, num_clusters, bin_mins, save_dir="/content/gdrive/MyDrive/New_York_Data/Clustered/"):
  """Cluster, bin and count the pickups in `df`, saving every artefact.

  `df` is modified in place (`pickup_cluster` and `pickup_bins` columns are
  added). Cluster centres, the cluster grid and the (bins x clusters) count
  matrix are written to `save_dir`; existing files are kept as they are.

  Returns:
    Path of the count-matrix file.
  """
  print("Making clusters...")
  kmns = make_clusters(df, num_clusters)
  print("Saving cluster centers...")
  path = save_cluster_centers(kmns.cluster_centers_, year, month, num_clusters, save_dir)
  print("Cluster centers saved to {} ...".format(path))
  print("Computing cluster grid...")
  # Forward year/month/save_dir explicitly so the grid lands next to the other
  # artefacts and does not depend on notebook-level variables.
  path = save_cluster_grid(kmns, 200, save_dir=save_dir, year=year, month=month)
  print("Cluster grid saved to {} ...".format(path))
  print("Making bins...")
  make_bins(df, bin_mins)
  data = df2numpy(df, num_clusters)
  print("Saving...")
  save_path = save_data(data, year, month, num_clusters, bin_mins, save_dir)
  print("Saved to", save_path)
  return save_path

In [None]:
cleaned_data = pd.read_csv("/content/gdrive/MyDrive/New_York_Data/clean/clean_yellow_tripdata_2016-02.csv")

In [None]:
preprocess2npy(cleaned_data, 2016, 2, 125, 30)

Making clusters...
Saving cluster centers...
/content/gdrive/MyDrive/New_York_Data/Clustered/cluster_centers_2016-02_125.npy already exists, skipping...
Cluster centers saved to /content/gdrive/MyDrive/New_York_Data/Clustered/cluster_centers_2016-02_125.npy ...
Computing cluster grid...
Cluster grid saved to /content/gdrive/MyDrive/New_York_Data/Clustered/cluster_centers_2016-02_125.npy ...
Making bins...
Saving...
/content/gdrive/MyDrive/New_York_Data/Clustered/2016-02_125_30.npy already exists, skipping...
Saved to /content/gdrive/MyDrive/New_York_Data/Clustered/2016-02_125_30.npy


'/content/gdrive/MyDrive/New_York_Data/Clustered/2016-02_125_30.npy'

# LSTM

We attempted to use an LSTM for time-series analysis. This attempt was abandoned due to time constraints, so the cells in this section are incomplete.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

from sklearn.preprocessing import MinMaxScaler

In [None]:
def split_data(data, window_size=6):
  """Cut a time series into (input window, next step) training pairs.

  Args:
    data: array/tensor whose first axis is time (needs `.shape` and slicing).
    window_size: number of consecutive time steps in each input window.

  Returns:
    List of `(data[i:i+window_size], data[i+window_size])` tuples, one per
    valid start index `i`; empty when the series is not longer than the window.
  """
  num_rows = data.shape[0]
  # The target must slide with the window: data[i + window_size], not the
  # fixed row data[window_size].
  return [
      (data[start:start + window_size], data[start + window_size])
      for start in range(num_rows - window_size)
  ]

In [None]:
train_size = int(len(data) * 0.8)
train_data = data[:train_size]
test_data = data[train_size:]

In [None]:
scaler = MinMaxScaler()
train_norm = scaler.fit_transform(train_data)
test_norm = scaler.transform(test_data)

In [None]:
train_norm = torch.FloatTensor(train_norm)
test_norm = torch.FloatTensor(test_norm)

In [None]:
seq_data = split_data(train_norm)

In [None]:
class LSTM(nn.Module):
  """Single-layer LSTM followed by a linear read-out of the last time step.

  `hidden_cell` holds the (h, c) state for a batch of one sequence and is
  carried over between calls. Reset it before feeding an unrelated sequence,
  and before every training step that calls `backward()`.
  """

  def __init__(self, input_size, hidden_layer_size, output_size):
    super().__init__()
    self.hidden_layer_size = hidden_layer_size
    self.lstm = nn.LSTM(input_size, hidden_layer_size, batch_first=True)
    self.linear = nn.Linear(hidden_layer_size, output_size)
    # Both h_0 and c_0 are (num_layers, batch, hidden_size).
    self.hidden_cell = (torch.zeros(1, 1, hidden_layer_size),
                        torch.zeros(1, 1, hidden_layer_size))

  def forward(self, x):
    """Predict the next step from a window of observations.

    Args:
      x: tensor of shape (1, seq_len, input_size), or (seq_len, input_size)
        for a single un-batched window.

    Returns:
      Tensor of shape (1, output_size), or (output_size,) for un-batched input.
    """
    unbatched = x.dim() == 2
    if unbatched:
      x = x.unsqueeze(0)  # add the batch axis expected with batch_first=True
    lstm_out, self.hidden_cell = self.lstm(x, self.hidden_cell)
    predictions = self.linear(lstm_out[:, -1, :])
    return predictions.squeeze(0) if unbatched else predictions

In [None]:
# One training pass over the windows for the (abandoned) LSTM experiment.
# NOTE(review): the original cell stopped at `loss =` and never defined
# `model`; the hidden size, loss and optimiser below are placeholders.
num_features = train_norm.shape[1]
model = LSTM(input_size=num_features, hidden_layer_size=100, output_size=num_features)
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for inp, targ in seq_data:
  optimizer.zero_grad()
  # Fresh state per window so gradients never flow into a previous graph.
  model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),
                       torch.zeros(1, 1, model.hidden_layer_size))
  y_pred = model(inp.unsqueeze(0))  # (1, window, features)
  loss = loss_function(y_pred, targ.unsqueeze(0))
  loss.backward()
  optimizer.step()

# Preprocess data

Here, the data is preprocessed to make it ready for training. Each sample in the training data has the following features:

- Number of pickups at t-5
- Number of pickups at t-4
- Number of pickups at t-3
- Number of pickups at t-2
- Number of pickups at t-1
- Longitude
- Latitude
- Weekday

The output is the number of pickups at t

In [None]:
def load_data(year, month, num_clusters, bin_mins, \
              load_dir="/content/gdrive/MyDrive/New_York_Data/Clustered"):
  """Load the array stored at `<load_dir>/<get_name(...)>`.

  Raises:
    Exception: if the file does not exist.
  """
  path = os.path.join(load_dir, get_name(year, month, num_clusters, bin_mins))
  if os.path.exists(path):
    # NOTE(review): allow_pickle=True can execute code from a tampered file;
    # only load files this notebook produced.
    with open(path, 'rb') as f:
      return np.load(f, allow_pickle=True)
  raise Exception("Could not find {}".format(path))

In [None]:
def get_weekdays(year, month, data, bin_mins):
  """Return the weekday (0 = Monday ... 6 = Sunday) of every time bin.

  Bin `i` starts `i * bin_mins` minutes after midnight on the first day of
  the given month. Only `len(data)` is used from `data`.
  """
  seconds_per_day = 24 * 60 * 60
  seconds_per_week = 7 * seconds_per_day
  first_weekday = datetime.date(year, month, 1).weekday()
  return [
      (((idx * 60 * bin_mins) % seconds_per_week) // seconds_per_day + first_weekday) % 7
      for idx in range(len(data))
  ]

def load_cluster_centers(year, month, num_clusters, suffix="", \
            load_dir="/content/gdrive/MyDrive/New_York_Data/Clustered",\
      ):
  """Load the cluster-centre array saved for the given year/month/cluster count.

  Args:
    suffix: optional file-name suffix, forwarded to `get_cluster_name`.
    load_dir: directory containing the .npy file.

  Raises:
    Exception: if the file does not exist.
  """
  # Forward `suffix`; it used to be accepted but silently ignored.
  name = get_cluster_name(year, month, num_clusters, suffix)
  path = os.path.join(load_dir, name)
  if not os.path.exists(path):
    raise Exception("Could not find {}".format(path))

  with open(path, 'rb') as f:
    data = np.load(f, allow_pickle=True)
  return data

In [None]:
def full_data(data, cluster_centers, weekdays):
  """Unfinished stub: sets up a few locals and implicitly returns None.

  TODO: either complete this function or remove it; nothing in the visible
  notebook calls it.
  """
  # Shadows the function's own name inside the body.
  full_data = []
  num_bins = len(data)
  num_clusters = len(data[0])
  # NOTE(review): make_clusters fits on [pickup_latitude, pickup_longitude],
  # so column 0 presumably holds latitude and these two names look swapped --
  # confirm against the saved cluster-centre files.
  lat = cluster_centers[:, 1]
  lon = cluster_centers[:, 0]
    

def make_seq(data, cluster_centers, weekdays, window_size=5):
  """Build one supervised sample per (cluster, window start).

  Each row is laid out as
    [count at t-window_size, ..., count at t-1,
     cluster_centers[c, 1], cluster_centers[c, 0],
     weekday of the window's first bin, count at t]
  Rows are ordered cluster by cluster, and by start bin within a cluster.

  Args:
    data: 2-D array of counts, shape (bins, clusters).
    cluster_centers: 2-D array with one row of two coordinates per cluster.
    weekdays: weekday of every bin, aligned with the rows of `data`.
    window_size: number of past bins used as features.

  Returns:
    np.ndarray with window_size + 4 columns; the last column is the target.
  """
  num_bins = len(data)
  num_clusters = len(data[0])
  # NOTE(review): centres fitted by make_clusters are (latitude, longitude),
  # so column 1 presumably is longitude -- it is emitted first.
  second_coord = cluster_centers[:, 1]
  first_coord = cluster_centers[:, 0]

  rows = []
  for cluster in range(num_clusters):
    for start in range(num_bins - window_size):
      target_bin = start + window_size
      history = list(data[start:target_bin, cluster])
      extras = [second_coord[cluster], first_coord[cluster],
                weekdays[start], data[target_bin][cluster]]
      rows.append(history + extras)

  return np.array(rows)

def train_test_split(data, weekdays, train_ratio=0.7):
  """Chronological split: the first `train_ratio` share goes to training.

  `data` and `weekdays` are cut at the same index, so they stay aligned.

  Returns:
    (train_data, test_data, train_weekdays, test_weekdays)
  """
  cut = int(len(data) * train_ratio)
  return data[:cut], data[cut:], weekdays[:cut], weekdays[cut:]

In [None]:
# NOTE(review): `year`, `month`, `num_clusters` and `bin_mins` are not assigned
# anywhere above this cell in the visible notebook (they are set in a later
# cell), so on a fresh kernel this cell presumably needs that configuration
# cell to run first -- confirm the intended execution order.
data = load_data(year, month, num_clusters, bin_mins)
cluster_centers = load_cluster_centers(year, month, num_clusters)
weekdays = get_weekdays(year, month, data, bin_mins)

In [None]:
train_data, test_data, train_weekdays, test_weekdays = train_test_split(data, weekdays)
train_data = make_seq(train_data, cluster_centers, train_weekdays)
test_data = make_seq(test_data, cluster_centers, test_weekdays)

In [None]:
# Dump the last time bin as [coord0, coord1, "count"] triples (one per
# cluster) to a JSON file in the current working directory.
file_name = "dummy_{}_{}.json".format(num_clusters, bin_mins)

dummy = []
for i, ct in enumerate(data[-1]):
  lat, lon = cluster_centers[i][0], cluster_centers[i][1]
  dummy.append([lat, lon, str(int(ct))])

with open(file_name, "w") as f:
  json.dump(dummy, f)

In [None]:
x_train, y_train = train_data[:, :-1], train_data[:, -1]
x_test, y_test = test_data[:, :-1], test_data[:, -1]

In [None]:
def get_full_data(year, month, num_clusters, bin_mins, \
                  load_dir="/content/gdrive/MyDrive/New_York_Data/Clustered"):
  """Load the saved counts and cluster centres and build train/test matrices.

  The count matrix is split chronologically (see `train_test_split`) and each
  part is turned into windowed samples with `make_seq`.

  Args:
    load_dir: directory holding both the count matrix and the cluster centres.

  Returns:
    (x_train, y_train, x_test, y_test); x holds every column but the last of
    the `make_seq` output, y holds the last column.
  """
  # Forward `load_dir`; it used to be accepted but ignored, so the defaults of
  # the loaders were always used.
  data = load_data(year, month, num_clusters, bin_mins, load_dir=load_dir)
  cluster_centers = load_cluster_centers(year, month, num_clusters, load_dir=load_dir)
  weekdays = get_weekdays(year, month, data, bin_mins)
  train_data, test_data, train_weekdays, test_weekdays = train_test_split(data, weekdays)
  train_data = make_seq(train_data, cluster_centers, train_weekdays)
  test_data = make_seq(test_data, cluster_centers, test_weekdays)
  x_train, y_train = train_data[:, :-1], train_data[:, -1]
  x_test, y_test = test_data[:, :-1], test_data[:, -1]

  return x_train, y_train, x_test, y_test

# Linear Regression

In [None]:
year = 2016
month = 2
bin_mins = 60
num_clusters = 1000

In [None]:
cleaned_data = load_cleaned_data(year, month)
preprocess2npy(cleaned_data, year, month, num_clusters, bin_mins)

Making clusters...
Saving cluster centers...
/content/gdrive/MyDrive/New_York_Data/Clustered/cluster_centers_2016-02_1000.npy already exists, skipping...
Cluster centers saved to /content/gdrive/MyDrive/New_York_Data/Clustered/cluster_centers_2016-02_1000.npy ...
Computing cluster grid...
/content/gdrive/MyDrive/New_York_Data/Clustered/cluster_grid_2016-02_200.npy already exists, skipping...
Cluster grid saved to /content/gdrive/MyDrive/New_York_Data/Clustered/cluster_centers_2016-02_1000.npy ...
Making bins...
Saving...
Saved to /content/gdrive/MyDrive/New_York_Data/Clustered/2016-02_1000_60.npy


'/content/gdrive/MyDrive/New_York_Data/Clustered/2016-02_1000_60.npy'

In [None]:
x_train, y_train, x_test, y_test = get_full_data(year, month, num_clusters, bin_mins)

In [None]:
x_train[:15], y_train[:15]

(array([[ 10.        ,   6.        ,   4.        ,   3.        ,
           1.        , -73.97436789,  40.75065413,   0.        ],
        [  6.        ,   4.        ,   3.        ,   1.        ,
           3.        , -73.97436789,  40.75065413,   0.        ],
        [  4.        ,   3.        ,   1.        ,   3.        ,
           9.        , -73.97436789,  40.75065413,   0.        ],
        [  3.        ,   1.        ,   3.        ,   9.        ,
          20.        , -73.97436789,  40.75065413,   0.        ],
        [  1.        ,   3.        ,   9.        ,  20.        ,
          33.        , -73.97436789,  40.75065413,   0.        ],
        [  3.        ,   9.        ,  20.        ,  33.        ,
          36.        , -73.97436789,  40.75065413,   0.        ],
        [  9.        ,  20.        ,  33.        ,  36.        ,
          35.        , -73.97436789,  40.75065413,   0.        ],
        [ 20.        ,  33.        ,  36.        ,  35.        ,
          29.     

In [None]:
y_train.mean()

16.02794398340249

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
def mape(y_true, y_pred):
  """Mean absolute error divided by the mean of the true values.

  Despite the name this is not the per-sample MAPE: it is MAE / mean(y_true),
  which stays finite when individual true values are zero.

  Args:
    y_true, y_pred: array-likes of the same shape.

  Returns:
    The error ratio as a float (0.14 means 14%).

  Raises:
    ValueError: if the inputs are empty or their shapes differ.
  """
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)
  if y_true.shape != y_pred.shape:
    raise ValueError(
        "y_true and y_pred must have the same shape, got {} and {}".format(
            y_true.shape, y_pred.shape))
  if y_true.size == 0:
    raise ValueError("mape() needs at least one sample")
  # Vectorised mean instead of Python-level sum() over the array.
  mean_abs_error = np.mean(np.abs(y_true - y_pred))
  return mean_abs_error / np.mean(y_true)

In [None]:
lr = LinearRegression().fit(x_train, y_train)
pred = lr.predict(x_test)

In [None]:
mape(y_test, np.round(pred))

0.3656852575492794

LR results
- 2016-01_1000_10 first week -> ~52%
- 2016-01_1000_30 first_week -> ~40% 
- 2016-01_40_10 -> ~14%
- 2016-02_500_30 -> ~30%
- 2016-02_1000_30 -> ~38.6%
- 2016-02_250_30 -> ~23.7%
- 2016-02_125_30 -> ~19%
- 2016-02_40_10 -> ~13.3%
- 2016-02_125_10 -> ~21.3%

In [None]:
xg_model = xgb.XGBRegressor(
 learning_rate =0.1,
 n_estimators=600,
 max_depth=2,
 min_child_weight=3,
 gamma=0,
 subsample=0.8,
 reg_alpha=200, reg_lambda=200,
 colsample_bytree=0.8,nthread=4)
xg_model.fit(x_train, y_train)



XGBRegressor(colsample_bytree=0.8, max_depth=2, min_child_weight=3,
             n_estimators=600, nthread=4, reg_alpha=200, reg_lambda=200,
             subsample=0.8)

In [None]:
pred = xg_model.predict(x_test)
mape(y_test, np.round(pred))

0.1400446824136367

In [None]:
# RandomForestRegressor is the estimator the next cell instantiates.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [None]:
regr1 = RandomForestRegressor(max_features='sqrt',min_samples_leaf=9,min_samples_split=7,n_estimators=79, n_jobs=-1)
regr1.fit(x_train, y_train)

RandomForestRegressor(max_features='sqrt', min_samples_leaf=9,
                      min_samples_split=7, n_estimators=79, n_jobs=-1)

In [None]:
def save_prediction(data, year, month, num_clusters, bin_mins, save_dir="/content/gdrive/MyDrive/New_York_Data/Predicted/"):
  """Write predictions to `<save_dir>/<get_name(...)>` with np.save.

  An existing file is left untouched (a message is printed). After a fresh
  write the shape of `data` is printed. Returns the target path either way.
  """
  save_path = os.path.join(save_dir, get_name(year, month, num_clusters, bin_mins))
  if not os.path.exists(save_path):
    with open(save_path, "wb") as f:
      np.save(f, data)
    print(data.shape)
  else:
    print("{} already exists, skipping...".format(save_path))
  return save_path

def save_to_json(data, year, month, num_clusters, bin_mins, save_dir="/content/gdrive/MyDrive/New_York_Data/Predicted/"):
  """Write `data` as JSON next to the .npy predictions.

  The file name is the `get_name(...)` name with its extension replaced by
  ".json". An existing file is left untouched (a message is printed).

  Args:
    data: array-like; converted to nested Python lists before dumping.

  Returns:
    Path of the JSON file.
  """
  base, _ = os.path.splitext(get_name(year, month, num_clusters, bin_mins))
  save_path = os.path.join(save_dir, base + ".json")
  if os.path.exists(save_path):
    print("{} already exists, skipping...".format(save_path))
    return save_path
  # This used to be a copy of save_prediction: it wrote a binary .npy file
  # and returned None on success.
  with open(save_path, "w") as f:
    json.dump(np.asarray(data).tolist(), f)
  return save_path

In [None]:
# x_test is laid out cluster by cluster (make_seq iterates clusters in its
# outer loop), so x_test[:num_clusters] is just the first rows of the first
# few clusters. Take one row per cluster instead: its most recent window.
# NOTE(review): confirm that the latest window is the intended time step.
# save_prediction keeps an existing file, so delete a stale one to refresh it.
rows_per_cluster = len(x_test) // num_clusters
latest_windows = x_test[rows_per_cluster - 1::rows_per_cluster]
pred = lr.predict(latest_windows)
save_prediction(pred, year, month, num_clusters, bin_mins)

(1000,)


'/content/gdrive/MyDrive/New_York_Data/Predicted/2016-02_1000_60.npy'