In [1]:
import datetime
from pathlib import Path
import numpy as np
import pandas as pd
import os
import math
from haversine import haversine

from utils import *

## Q1

### Cab class

In [2]:
class Cab:
    """GPS trace of one cab plus the distance it covered while unoccupied.

    The input is converted with ``np.asarray`` and read positionally:
    column 0 and column 1 are handed to ``haversine`` as the two components
    of each point (latitude, longitude in the loading code of this notebook),
    and column 2 is the occupancy flag, where ``0`` marks a free (empty) cab.

    Attributes:
        datas_np: the trace as a 2-D NumPy array.
        free_time_index: ``np.argwhere`` result, shape ``(k, 1)``, holding the
            row numbers whose occupancy flag is ``0``.
        free_dis: total distance in miles between consecutive free rows.
    """

    # Kilometres -> miles; haversine() is called with its default unit and the
    # result is scaled by this factor.
    KM_TO_MILES = 0.621371

    def __init__(self, datas):
        """Store the trace and pre-compute the free-row indices and distance.

        Args:
            datas: 2-D array-like (e.g. a DataFrame) with at least three
                columns laid out as described in the class docstring.
        """
        self.datas_np = np.asarray(datas)
        self.free_time_index = self.free_time_search()
        self.free_dis = self.free_time_distance(self.free_time_index)

    def free_time_search(self):
        """Return the row indices (shape ``(k, 1)``) where column 2 equals 0."""
        return np.argwhere(self.datas_np[:, 2] == 0)

    def free_time_distance(self, free_time_index):
        """Sum the haversine distance, in miles, over adjacent free rows.

        Only pairs of free rows that are directly adjacent in the trace
        (row numbers differ by at most 1) contribute; a gap means the cab
        was occupied in between, so that hop is skipped.

        Args:
            free_time_index: row numbers of the free samples, either the
                ``(k, 1)`` array produced by ``free_time_search`` or any flat
                integer sequence.

        Returns:
            The accumulated distance in miles (``0`` when no adjacent pair
            of free rows exists).
        """
        # Flatten the (k, 1) argwhere output so that indexing yields scalars.
        # Passing 1-element arrays to haversine relies on implicit
        # array -> scalar conversion, which NumPy has deprecated.
        rows = np.asarray(free_time_index, dtype=np.intp).reshape(-1)
        lat = self.datas_np[rows, 0]
        lon = self.datas_np[rows, 1]

        dis_sum = 0
        for i in range(len(rows) - 1):
            if rows[i + 1] - rows[i] > 1:
                # Not adjacent samples: the cab carried a passenger in between.
                continue
            p1 = (lat[i], lon[i])
            p2 = (lat[i + 1], lon[i + 1])
            dis_sum += haversine(p1, p2) * self.KM_TO_MILES
        return dis_sum

### Read data, preprocess, and convert timestamps to San Francisco local time

In [5]:
path = Path('./cabspottingdata/')
# os.listdir returns entries in arbitrary order; sort the names so that the
# order of cabs_list / df_list (and of anything built from them) is the same
# on every run and every machine.
files = sorted(x for x in os.listdir(path) if x.endswith('.txt'))

df_list = []
cabs_list = []
# Read one trace file per cab and order its rows by local time.
for file in files:
    temp_df = pd.read_csv(path / file, sep=' ', names=['latitude', 'longitude', 'occupancy', 'time'])
    temp_df['taxi_name'] = file.replace('.txt', '')
    # target_tz is provided by utils (not visible here); presumably it turns the
    # raw 'time' value into a timezone-aware datetime -- TODO confirm.
    temp_df['local_time'] = temp_df['time'].apply(target_tz, tz_name='US/Pacific')
    temp_df['local_time'] = temp_df['local_time'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))

    # Build a new frame instead of mutating in place, so re-running the cell
    # never depends on a half-modified temp_df.
    temp_df = temp_df.dropna().sort_values(by=['taxi_name', 'local_time'], ascending=True)

    # For a quick experiment on a small sample, slice here, e.g. temp_df.iloc[0:1000].
    df_list.append(temp_df)
    cabs_list.append(Cab(temp_df))
print(len(cabs_list))

537


### Get the free-time (unoccupied) distance of every cab

In [29]:
# Unoccupied distance of every cab, in the same order as cabs_list.
cabs_free_dis = np.zeros(len(cabs_list))
for idx, cab in enumerate(cabs_list):
    cabs_free_dis[idx] = cab.free_dis

# Cab positions ranked from the largest free distance to the smallest.
cabs_sorted_by_dis = sorted(
    range(len(cabs_free_dis)),
    key=cabs_free_dis.__getitem__,
    reverse=True,
)

# Running totals: cabs still to be replaced, cabs already replaced by electric
# cars, and the accumulated CO2 figure.
num_cabs = len(cabs_list)
num_cabs_replace = 0
sum_cabs_CO2 = 0

# Each of the 12 months, 15% of the cabs that are still left get replaced.
# The cabs with the largest free distance are taken first, and every month the
# free distance of all cabs replaced so far is added, scaled by 404.
for month in range(12):
    replaced_this_month = int(num_cabs * 0.15)
    num_cabs_replace += replaced_this_month
    num_cabs -= replaced_this_month
    replaced_positions = cabs_sorted_by_dis[:num_cabs_replace]
    sum_cabs_CO2 += np.sum(cabs_free_dis[replaced_positions]) * 404

### Final result

In [29]:
# Report the accumulated CO2 figure; the pieces are joined with a single space,
# which is print's default separator.
print('potential:', sum_cabs_CO2, ' grams', sep=' ')

potential: 4311476496.574932  grams


## Save the combined data for Question 2

In [40]:
# Stack every per-cab frame into one table with a fresh 0..n-1 index,
# persist it for the next question, and display its dimensions.
df = pd.concat(df_list, ignore_index=True)
df.to_pickle('full_data.pkl')
df.shape

(11220058, 6)
