In [1]:
# List the contents of the data/ directory (IPython shell escape).
!ls data

chicago_taxi_trips_2016_01.csv	chicago_taxi_trips_2016_09.csv
chicago_taxi_trips_2016_02.csv	chicago_taxi_trips_2016_10.csv
chicago_taxi_trips_2016_03.csv	chicago_taxi_trips_2016_11.csv
chicago_taxi_trips_2016_04.csv	chicago_taxi_trips_2016_12.csv
chicago_taxi_trips_2016_05.csv	column_remapping.json
chicago_taxi_trips_2016_06.csv	company_dedupe_map.p
chicago_taxi_trips_2016_07.csv	data_dictionary.csv
chicago_taxi_trips_2016_08.csv


In [2]:
import pandas as pd
import json
import pickle

In [3]:
# Lookup tables referenced by prepare(): the column remapping table and the
# company de-duplication map.
mapping = pd.read_json('data/column_remapping.json')

# NOTE: unpickling can execute arbitrary code; only load this file if it comes
# from a trusted source.
with open('data/company_dedupe_map.p', 'rb') as f:
    company_dedupe_map = pickle.load(f)

# January trips, loaded on their own.
trips = pd.read_csv('data/chicago_taxi_trips_2016_01.csv')

In [4]:
def prepare(trips, column_mapping=None, dedupe_map=None):
    """Filter one frame of raw taxi trips and decode its remapped columns.

    Rows are removed, in this order, when ``fare`` is missing, when
    ``trip_miles`` is not greater than zero (which also removes missing
    values), or when any pickup/dropoff coordinate is missing. The number of
    rows removed by each step is printed.

    The four coordinate columns and ``company`` hold codes. Each code is
    replaced by the value stored under that code in the same-named column of
    the remapping table. Company codes are first passed through the
    de-duplication map; missing companies stay missing.

    Parameters
    ----------
    trips : pandas.DataFrame
        Must contain ``fare``, ``trip_miles``, ``pickup_latitude``,
        ``pickup_longitude``, ``dropoff_latitude``, ``dropoff_longitude`` and
        ``company``.
    column_mapping : pandas.DataFrame, optional
        Remapping table indexed by code. Defaults to the notebook-level
        ``mapping``.
    dedupe_map : dict-like, optional
        Maps a company code to the code it should be merged into. Defaults to
        the notebook-level ``company_dedupe_map``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If a code is not present in the remapping table's index.
    """
    # Fall back to the lookup tables loaded at notebook level.
    if column_mapping is None:
        column_mapping = mapping
    if dedupe_map is None:
        dedupe_map = company_dedupe_map

    # drop na fares
    # NOTE(review): rows whose fare is exactly 0 are kept by this filter; if
    # they should be removed as well, filter on `fare > 0` instead -- confirm.
    n = trips.shape[0]
    trips = trips.dropna(subset=['fare'])
    print(f'{n - trips.shape[0]} rows with na fare were dropped')

    # drop zero trip_miles (the comparison is False for NaN, so those go too)
    n = trips.shape[0]
    trips = trips[trips['trip_miles'] > 0]
    print(f'{n - trips.shape[0]} rows with 0 trip_miles were dropped')

    # drop na geo coordinates
    geo_columns = ['pickup_latitude', 'pickup_longitude',
                   'dropoff_latitude', 'dropoff_longitude']
    n = trips.shape[0]
    trips = trips.dropna(subset=geo_columns)
    print(f'{n - trips.shape[0]} rows with na geo coordinates were dropped')

    # restore real longitude/latitude
    decoded = {
        col: _decode_column(trips[col], column_mapping[col], col)
        for col in geo_columns
    }

    # remove duplicates in company names
    # (taken from https://www.kaggle.com/sohier/taxi-revenues-eda),
    # then restore company names
    merged_codes = trips['company'].map(dedupe_map, na_action='ignore')
    decoded['company'] = _decode_column(
        merged_codes, column_mapping['company'], 'company')

    # assign() builds a new frame, so nothing is written into a filtered
    # view of the caller's data.
    return trips.assign(**decoded)


def _decode_column(codes, lookup, label):
    """Replace each code with the value stored under it in `lookup`; NaN codes stay NaN."""
    present = codes.dropna()
    unknown = present[~present.isin(lookup.index)]
    if not unknown.empty:
        # Fail loudly on unknown codes instead of silently producing NaN.
        sample = sorted(unknown.unique())[:5]
        raise KeyError(
            f'{label}: codes missing from the remapping table, e.g. {sample}')
    return codes.map(lookup, na_action='ignore')

In [5]:
# Read each monthly file, clean it with prepare(), and collect the results.
# let's keep only the first two months
all_trips = []
for i in (1, 2):
    fname = f'data/chicago_taxi_trips_2016_{i:02d}.csv'
    print(f'Reading {fname}')
    df = pd.read_csv(fname)
    all_trips += [prepare(df)]

Reading data/chicago_taxi_trips_2016_01.csv
33 rows with 0 fare were dropped
450266 rows with 0 trip_miles were dropped
160271 rows with na geo coordinates were dropped
Reading data/chicago_taxi_trips_2016_02.csv
10 rows with 0 fare were dropped
458117 rows with 0 trip_miles were dropped
161625 rows with na geo coordinates were dropped


In [6]:
# Stack the cleaned monthly frames row-wise. Each frame keeps the row labels of
# the file it was read from, so the labels repeat across months; renumber the
# rows so the combined index is unique.
trips_2months = pd.concat(all_trips[:2], axis=0, ignore_index=True)

In [7]:
# Write the combined frame to a gzip-compressed CSV in the working directory.
# No `index=` argument is passed, so pandas' default index handling applies.
trips_2months.to_csv('taxi_trips_2months.csv.gz', compression='gzip')