In [1]:
import pandas as panda

# mount to google drive next
from google.colab import drive
drive.mount('/content/drive')


# read the file into dataframe
df =   panda.read_csv('/content/drive/MyDrive/25manhattenyellowtaxis.csv')




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
df.dtypes

VendorID                 float64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count          float64
trip_distance            float64
RatecodeID               float64
store_and_fwd_flag        object
PULocationID               int64
DOLocationID               int64
payment_type             float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
congestion_surcharge     float64
dtype: object

In [3]:
print(len(df))

74551474


In [4]:
# making sure that the datetimes are in actual datetime format

df['tpep_pickup_datetime']  =    panda.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime']  =     panda.to_datetime(df['tpep_dropoff_datetime'])

In [5]:
# sorting the data

df =    df.sort_values(by='tpep_dropoff_datetime')

In [6]:
print(df.dtypes)

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                    float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
dtype: object


In [7]:
cols1 = ['store_and_fwd_flag',   'mta_tax',  'improvement_surcharge',    'passenger_count',  'RatecodeID',
  'payment_type',  'VendorID',   'extra']

In [8]:
counts1 =   {col: df[col].value_counts().to_dict() for col in cols1}

In [9]:
counts1

{'store_and_fwd_flag': {'N': 73973495, 'Y': 577979},
 'mta_tax': {0.5: 74441668, 0.0: 109806},
 'improvement_surcharge': {0.3: 74535739, 0.0: 15735},
 'passenger_count': {1.0: 53669028,
  2.0: 11318827,
  3.0: 3197137,
  5.0: 3018518,
  6.0: 1820013,
  4.0: 1527951},
 'RatecodeID': {1.0: 72852653,
  2.0: 1536186,
  5.0: 148747,
  3.0: 11537,
  4.0: 2090,
  6.0: 261},
 'payment_type': {1.0: 54370315,
  2.0: 19699564,
  3.0: 335274,
  4.0: 146320,
  5.0: 1},
 'VendorID': {2.0: 47291639, 1.0: 27018103, 4.0: 241732},
 'extra': {0.0: 27805693,
  0.5: 14234433,
  2.5: 12842614,
  1.0: 8711124,
  3.0: 6447181,
  3.5: 4210852,
  4.5: 213903,
  7.0: 78758,
  0.8: 3062,
  1.3: 1435,
  1.8: 834,
  2.0: 559,
  1.7: 232,
  1.2: 113,
  3.73: 104,
  0.3: 91,
  0.7: 66,
  2.8: 61,
  4.96: 43,
  5.5: 36,
  1.5: 32,
  5.0: 26,
  2.95: 19,
  2.05: 18,
  4.0: 15,
  3.45: 11,
  9.5: 11,
  2.1: 9,
  5.3: 8,
  2.2: 8,
  2.15: 7,
  1.75: 6,
  3.8: 5,
  0.45: 5,
  0.2: 5,
  7.5: 5,
  2.61: 4,
  3.3: 4,
  7.22:

In [10]:
df =   df.drop(columns=['store_and_fwd_flag',   'mta_tax',    'improvement_surcharge'])

In [11]:
cols2 = ['trip_distance',  'fare_amount',   'tip_amount',    'total_amount',   'tolls_amount', 'total_amount']

# get the variance for the columns above
var1 =   df[cols2].var()

# then get the correlation matrix
corr1 =   df[cols2].corr()

var1,    corr1

(trip_distance     1.730422
 fare_amount      28.059809
 tip_amount        2.936208
 total_amount     39.701221
 tolls_amount      0.000000
 total_amount     39.701221
 dtype: float64,
                trip_distance  fare_amount  tip_amount  total_amount  \
 trip_distance       1.000000     0.907014    0.468035      0.877926   
 fare_amount         0.907014     1.000000    0.502304      0.965436   
 tip_amount          0.468035     0.502304    1.000000      0.648859   
 total_amount        0.877926     0.965436    0.648859      1.000000   
 tolls_amount             NaN          NaN         NaN           NaN   
 total_amount        0.877926     0.965436    0.648859      1.000000   
 
                tolls_amount  total_amount  
 trip_distance           NaN      0.877926  
 fare_amount             NaN      0.965436  
 tip_amount              NaN      0.648859  
 total_amount            NaN      1.000000  
 tolls_amount            NaN           NaN  
 total_amount            NaN      1.000

In [12]:
# we will later conside dropping fare and total depending on the feature importance as correlated with trip_distance
#df = df.drop(columns=['fare_amount', 'total_amount'])
df =   df.drop(columns=[ 'tolls_amount'])

In [13]:
from sklearn.preprocessing import LabelEncoder

# make the encoder
labelencoder = LabelEncoder()



# new dataframe
df_encoded = df.copy()

# these are the categoical columns we are going to find the correlations for
cat_cols = ['RatecodeID', 'payment_type', 'VendorID']


# call the label encoder on these
for col in cat_cols:
    df_encoded[col] =   labelencoder.fit_transform(df_encoded[col])


# get the correlation matrix
corr_encoded =   df_encoded.corr()

# get the distribution aswell
cat_distribution = {col: df[col].value_counts().to_dict() for col in cat_cols}

cat_distribution,     corr_encoded

  corr_encoded =   df_encoded.corr()


({'RatecodeID': {1.0: 72852653,
   2.0: 1536186,
   5.0: 148747,
   3.0: 11537,
   4.0: 2090,
   6.0: 261},
  'payment_type': {1.0: 54370315,
   2.0: 19699564,
   3.0: 335274,
   4.0: 146320,
   5.0: 1},
  'VendorID': {2.0: 47291639, 1.0: 27018103, 4.0: 241732}},
                       VendorID  passenger_count  trip_distance  RatecodeID  \
 VendorID              1.000000         0.184891       0.025828    0.014670   
 passenger_count       0.184891         1.000000       0.016460   -0.002950   
 trip_distance         0.025828         0.016460       1.000000    0.171839   
 RatecodeID            0.014670        -0.002950       0.171839    1.000000   
 PULocationID         -0.012590        -0.006378      -0.087927   -0.047796   
 DOLocationID         -0.005546        -0.004565      -0.056158   -0.012146   
 payment_type         -0.011685         0.017115      -0.072875    0.001333   
 fare_amount           0.011180         0.011671       0.907014    0.265849   
 extra                -0.

In [14]:
del df_encoded

In [15]:
# we will consider dopping these columns based on the feature importances after training the model.
#df = df.drop(columns=['extra', 'payment_type'])

In [16]:
df.dtypes

VendorID                        float64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
PULocationID                      int64
DOLocationID                      int64
payment_type                    float64
fare_amount                     float64
extra                           float64
tip_amount                      float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

In [17]:
print(len(df))

74551474


In [18]:
# Create time-based features

df['dropoff_day_of_month'] =   df['tpep_dropoff_datetime'].dt.day

df['dropoff_day_of_week'] =    df['tpep_dropoff_datetime'].dt.dayofweek

df['dropoff_hour'] =   df['tpep_dropoff_datetime'].dt.hour



df['dropoff_month'] =   df['tpep_dropoff_datetime'].dt.month


# make a feature for the length of the journeys
df['ride_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# rule out extremely small journeys so they dont affect calculation for average speed


df = df[df['ride_duration'] >=  1]


# Create the average speed feature (mph)


df['average_speed'] = df['trip_distance'] / (df['ride_duration'] / 60)


df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['average_speed'] = df['trip_distance'] / (df['ride_duration'] / 60)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,tip_amount,total_amount,congestion_surcharge,dropoff_day_of_month,dropoff_day_of_week,dropoff_hour,dropoff_month,ride_duration,average_speed
54934270,2.0,2018-12-31 12:50:25,2018-12-31 13:11:34,1.0,3.15,1.0,48,140,1.0,15.0,0.5,3.26,19.56,2.5,31,0,13,12,21.15,8.93617
54930183,2.0,2018-12-31 13:12:28,2018-12-31 13:18:01,5.0,0.83,1.0,243,243,2.0,5.5,0.5,0.0,6.8,2.5,31,0,13,12,5.55,8.972973
54957103,2.0,2018-12-31 14:01:24,2018-12-31 14:07:00,1.0,0.89,1.0,163,162,2.0,5.5,0.5,0.0,6.8,2.5,31,0,14,12,5.6,9.535714
54957104,2.0,2018-12-31 14:05:06,2018-12-31 14:08:55,1.0,1.0,1.0,151,41,2.0,5.0,0.5,0.0,6.3,2.5,31,0,14,12,3.816667,15.720524
54944409,2.0,2018-12-31 14:04:54,2018-12-31 14:11:28,5.0,0.7,1.0,230,162,1.0,6.0,0.5,1.46,8.76,2.5,31,0,14,12,6.566667,6.395939


In [19]:
import numpy as nump
mask = nump.isfinite(df['average_speed'])
df =   df[mask]


In [20]:
# get the variance
variancefor_therideduration =    df['ride_duration'].var()

variancefor_theaveragespeed =   df['average_speed'].var()

# get the correlation matrix incuding the new features
corr3 = df[['ride_duration', 'average_speed', 'trip_distance', 'tip_amount', 'fare_amount', 'total_amount']].corr()

variancefor_therideduration,  variancefor_theaveragespeed,    corr3

(4846.438299960062,
 17.54371599741233,
                ride_duration  average_speed  trip_distance  tip_amount  \
 ride_duration       1.000000      -0.148697       0.121145    0.057643   
 average_speed      -0.148697       1.000000       0.262158   -0.005823   
 trip_distance       0.121145       0.262158       1.000000    0.472667   
 tip_amount          0.057643      -0.005823       0.472667    1.000000   
 fare_amount         0.142486      -0.038139       0.918830    0.501354   
 total_amount        0.134292      -0.036848       0.889022    0.649079   
 
                fare_amount  total_amount  
 ride_duration     0.142486      0.134292  
 average_speed    -0.038139     -0.036848  
 trip_distance     0.918830      0.889022  
 tip_amount        0.501354      0.649079  
 fare_amount       1.000000      0.965298  
 total_amount      0.965298      1.000000  )

In [21]:
df2 =  panda.read_csv('/content/drive/MyDrive/taxi_zones (1).csv')

In [22]:
# check if each location id has one area
the_unique_location_ids =     df2['LocationID'].nunique()
unique_shape_areas =      df2.groupby('LocationID')['Shape_Area'].nunique()

# find which ones have multiple areas


the_invalid_location_ids = unique_shape_areas[unique_shape_areas > 1].index.tolist()

# check if every location id is in df2


the_valid_dropoff_ids =   df['DOLocationID'].isin(df2['LocationID']).all()

the_unique_location_ids,    the_invalid_location_ids,     the_valid_dropoff_ids


(260, [56, 103], True)

In [23]:
df =    df[~df['DOLocationID'].isin(the_invalid_location_ids)]

In [24]:
df2 =     df2[~df2['LocationID'].isin(the_invalid_location_ids)]

In [25]:
not_in_zones_pickup_ids =   set(df['PULocationID'].unique()) - set(df2['LocationID'].unique())
not_in_zones_dropoff_ids =    set(df['DOLocationID'].unique()) - set(df2['LocationID'].unique())

not_in_zones_pickup_ids,      not_in_zones_dropoff_ids

({56, 57, 105, 264, 265}, set())

In [26]:
df =   df[~df['PULocationID'].isin(not_in_zones_pickup_ids)]
df =     df[~df['DOLocationID'].isin(not_in_zones_dropoff_ids)]

In [27]:
print(len(df))

73975745


In [28]:
# map locationid to shape_area
location_to_area = df2.set_index('LocationID')['Shape_Area'].to_dict()

location_to_perimeter = df2.set_index('LocationID')['Shape_Leng'].to_dict()

# new feature in df for the area of a dropoff location
df['DOLocation_area'] = df['DOLocationID'].map(location_to_area)

df['DOLocation_perimeter'] = df['DOLocationID'].map(location_to_perimeter)

df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,...,total_amount,congestion_surcharge,dropoff_day_of_month,dropoff_day_of_week,dropoff_hour,dropoff_month,ride_duration,average_speed,DOLocation_area,DOLocation_perimeter
54934270,2.0,2018-12-31 12:50:25,2018-12-31 13:11:34,1.0,3.15,1.0,48,140,1.0,15.0,...,19.56,2.5,31,0,13,12,21.15,8.93617,0.000114,0.047584
54930183,2.0,2018-12-31 13:12:28,2018-12-31 13:18:01,5.0,0.83,1.0,243,243,2.0,5.5,...,6.8,2.5,31,0,13,12,5.55,8.972973,0.000438,0.094331
54957103,2.0,2018-12-31 14:01:24,2018-12-31 14:07:00,1.0,0.89,1.0,163,162,2.0,5.5,...,6.8,2.5,31,0,14,12,5.6,9.535714,4.8e-05,0.03527
54957104,2.0,2018-12-31 14:05:06,2018-12-31 14:08:55,1.0,1.0,1.0,151,41,2.0,5.0,...,6.3,2.5,31,0,14,12,3.816667,15.720524,0.000143,0.052793
54944409,2.0,2018-12-31 14:04:54,2018-12-31 14:11:28,5.0,0.7,1.0,230,162,1.0,6.0,...,8.76,2.5,31,0,14,12,6.566667,6.395939,4.8e-05,0.03527


In [29]:
# make a feature rounded the datetime to the nearest hour
df['dropoff_datetime_hour'] = df['tpep_dropoff_datetime'].dt.floor('H')

# For dropoff
dropoff_counts_hourly = df.groupby(['DOLocationID', 'dropoff_datetime_hour']).size().reset_index(name='number_taxis_DOLocationID_for_this_hour')

# merge it
df = panda.merge(df, dropoff_counts_hourly, how='left', left_on=['DOLocationID', 'dropoff_datetime_hour'], right_on=['DOLocationID', 'dropoff_datetime_hour'])


df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,...,dropoff_day_of_month,dropoff_day_of_week,dropoff_hour,dropoff_month,ride_duration,average_speed,DOLocation_area,DOLocation_perimeter,dropoff_datetime_hour,number_taxis_DOLocationID_for_this_hour
0,2.0,2018-12-31 12:50:25,2018-12-31 13:11:34,1.0,3.15,1.0,48,140,1.0,15.0,...,31,0,13,12,21.15,8.93617,0.000114,0.047584,2018-12-31 13:00:00,1
1,2.0,2018-12-31 13:12:28,2018-12-31 13:18:01,5.0,0.83,1.0,243,243,2.0,5.5,...,31,0,13,12,5.55,8.972973,0.000438,0.094331,2018-12-31 13:00:00,1
2,2.0,2018-12-31 14:01:24,2018-12-31 14:07:00,1.0,0.89,1.0,163,162,2.0,5.5,...,31,0,14,12,5.6,9.535714,4.8e-05,0.03527,2018-12-31 14:00:00,2
3,2.0,2018-12-31 14:05:06,2018-12-31 14:08:55,1.0,1.0,1.0,151,41,2.0,5.0,...,31,0,14,12,3.816667,15.720524,0.000143,0.052793,2018-12-31 14:00:00,1
4,2.0,2018-12-31 14:04:54,2018-12-31 14:11:28,5.0,0.7,1.0,230,162,1.0,6.0,...,31,0,14,12,6.566667,6.395939,4.8e-05,0.03527,2018-12-31 14:00:00,2


In [30]:
print(len(df))

73975745


In [31]:
x = df['DOLocationID'].nunique()
x

66

In [32]:
# aggregate the dataframe so that theres one row per hour for the prediction
aggs = {
    'VendorID': lambda x: x.mode()[0] if x.mode().size > 0 else None,
    'trip_distance': 'mean',
    'fare_amount': 'mean',
    'total_amount': 'mean',
    'extra': 'mean',
    'RatecodeID': lambda x: x.mode()[0] if x.mode().size > 0 else None,
    'payment_type': lambda x: x.mode()[0] if x.mode().size > 0 else None,
    'tip_amount': 'mean',
    'congestion_surcharge': lambda x: 1 if x.mode()[0] == 2.50 else 0,
    'ride_duration': 'mean',
    'average_speed': 'mean',
    'dropoff_hour': 'first',
    'DOLocation_area': 'first',
    'DOLocation_perimeter': 'first',
    'number_taxis_DOLocationID_for_this_hour': 'first',
    'dropoff_day_of_week': 'first',
    'dropoff_day_of_month': 'first',
    'dropoff_month': 'first',
    'passenger_count': 'mean'
}

# make the aggregated dataframe next
df_aggregated = df.groupby(['dropoff_datetime_hour', 'DOLocationID']).agg(aggs).reset_index()



# Display a few of the first rows
df_aggregated.head()



Unnamed: 0,dropoff_datetime_hour,DOLocationID,VendorID,trip_distance,fare_amount,total_amount,extra,RatecodeID,payment_type,tip_amount,...,ride_duration,average_speed,dropoff_hour,DOLocation_area,DOLocation_perimeter,number_taxis_DOLocationID_for_this_hour,dropoff_day_of_week,dropoff_day_of_month,dropoff_month,passenger_count
0,2018-12-31 13:00:00,140,2.0,3.15,15.0,19.56,0.5,1.0,1.0,3.26,...,21.15,8.93617,13,0.000114,0.047584,1,0,31,12,1.0
1,2018-12-31 13:00:00,243,2.0,0.83,5.5,6.8,0.5,1.0,2.0,0.0,...,5.55,8.972973,13,0.000438,0.094331,1,0,31,12,5.0
2,2018-12-31 14:00:00,41,2.0,1.0,5.0,6.3,0.5,1.0,2.0,0.0,...,3.816667,15.720524,14,0.000143,0.052793,1,0,31,12,1.0
3,2018-12-31 14:00:00,48,2.0,2.32,11.0,12.786667,0.5,1.0,1.0,0.486667,...,13.788889,9.586343,14,9.4e-05,0.043747,3,0,31,12,4.333333
4,2018-12-31 14:00:00,68,2.0,1.46,7.5,11.44,0.5,1.0,1.0,2.64,...,7.85,11.159236,14,0.000111,0.049337,1,0,31,12,1.0


In [33]:
print(len(df_aggregated))

554659


In [34]:
df_aggregated.dtypes

dropoff_datetime_hour                      datetime64[ns]
DOLocationID                                        int64
VendorID                                          float64
trip_distance                                     float64
fare_amount                                       float64
total_amount                                      float64
extra                                             float64
RatecodeID                                        float64
payment_type                                      float64
tip_amount                                        float64
congestion_surcharge                                int64
ride_duration                                     float64
average_speed                                     float64
dropoff_hour                                        int64
DOLocation_area                                   float64
DOLocation_perimeter                              float64
number_taxis_DOLocationID_for_this_hour             int64
dropoff_day_of

In [35]:
df_aggregated.to_csv('/content/drive/MyDrive/320manhattenyellowtaxis.csv', index=False)
