# Importing Required Libraries 

In [1]:
import numpy as np 
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc

# Reading The Data

In [2]:
# Read the first 29,000,000 rows of the competition training file.
train = pd.read_csv(
    '../input/train.csv',
    nrows=29000000,
)

# Data Cleaning

Drop rows that contain any null value

In [3]:
# Row count before any filtering, for comparison with the post-cleaning count.
print("Total Rows before data cleaning: ", len(train))

Total Rows before data cleaning:  29000000


In [4]:
# Discard every row that has at least one missing value.
train = train.dropna(axis=0, how='any')

Remove rows where:
* Fare Amount is less than or equal to $0
* Fare Amount is more than $300
* Any Pickup or Dropoff Latitude or Longitude is 0

In [5]:
# Keep rows whose fare lies in (0, 300] and whose four coordinates are all non-zero.
train = train.loc[
    train.fare_amount.gt(0)
    & train.fare_amount.le(300)
    & train[['pickup_longitude', 'pickup_latitude',
             'dropoff_longitude', 'dropoff_latitude']].ne(0).all(axis=1)
]

In [6]:
# Row count once null rows and out-of-range rows have been removed.
print("Total Rows after data cleaning: ", len(train))

Total Rows after data cleaning:  28419871


# Define a Function That Calculates the Haversine Distance and, Optionally, the Bearing

https://en.wikipedia.org/wiki/Haversine_formula
https://en.wikipedia.org/wiki/Bearing_(navigation)

In [7]:
def havesine_distance_and_bearing(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude,getBearing):
    """Haversine distance and, optionally, initial bearing between two points.

    All four coordinates are given in decimal degrees and may be scalars or
    array-likes (anything ``np.radians`` accepts); the usual numpy
    broadcasting rules apply between them.

    Parameters
    ----------
    pickup_latitude, pickup_longitude : start point, degrees.
    dropoff_latitude, dropoff_longitude : end point, degrees.
    getBearing : when truthy, also return the initial bearing.

    Returns
    -------
    distance : great-circle distance in kilometres (Earth radius 6371 km).
    (distance, bearing) : when ``getBearing`` is truthy; ``bearing`` is the
        ``arctan2`` result in radians, in [-pi, pi], measured from north
        with east positive.
    """
    #Define earth radius (km)
    R_earth = 6371
    #Convert degrees to radians
    pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude = map(
        np.radians,
        [pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude])

    #Compute differences along lat, lon dimensions
    dlatitude = dropoff_latitude - pickup_latitude
    dlongitude = dropoff_longitude - pickup_longitude

    #Haversine term: hav(dlat) + cos(lat1) * cos(lat2) * hav(dlon)
    haversine_term = (np.sin(dlatitude/2.0)**2
                      + np.cos(pickup_latitude) * np.cos(dropoff_latitude) * np.sin(dlongitude/2.0)**2)
    distance = 2 * R_earth * np.arcsin(np.sqrt(haversine_term))

    if getBearing:
        #Initial bearing: atan2(sin(dlon) * cos(lat2),
        #                       cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(dlon))
        #cos(lat2) multiplies sin(dlon); it must not sit inside the sine's argument.
        bearing = np.arctan2(
            np.sin(dlongitude) * np.cos(dropoff_latitude),
            np.cos(pickup_latitude) * np.sin(dropoff_latitude)
            - np.sin(pickup_latitude) * np.cos(dropoff_latitude) * np.cos(dlongitude))
        return distance, bearing

    return distance

# Add Distances to Airports and Other NYC Landmarks

In [8]:
def add_airport_dist(df):
    """Append one trip-to-landmark distance column per reference point.

    For every reference point the value stored is the haversine distance
    from the pickup location to that point plus the haversine distance from
    that point to the dropoff location (kilometres, as returned by
    ``havesine_distance_and_bearing``).

    Reference points and the columns they produce:
        nyc_dist: New York City centre
        jfk_dist: John F. Kennedy International Airport
        ewr_dist: Newark Liberty International Airport
        lga_dist: LaGuardia Airport
        sol_dist: Statue of Liberty

    The columns are written onto ``df`` itself, which is also returned.
    """
    # (column prefix, latitude, longitude); the tuple order is the order in
    # which the new columns are appended to the frame.
    reference_points = (
        ('nyc', 40.7141667, -74.0063889),
        ('jfk', 40.639722, -73.778889),
        ('ewr', 40.6925, -74.168611),
        ('lga', 40.77725, -73.872611),
        ('sol', 40.6892, -74.0445),
    )

    start_lat, start_lon = df['pickup_latitude'], df['pickup_longitude']
    end_lat, end_lon = df['dropoff_latitude'], df['dropoff_longitude']

    for prefix, point_lat, point_lon in reference_points:
        pickup_leg = havesine_distance_and_bearing(start_lat, start_lon, point_lat, point_lon, False)
        dropoff_leg = havesine_distance_and_bearing(point_lat, point_lon, end_lat, end_lon, False)
        df[prefix + '_dist'] = pickup_leg + dropoff_leg

    return df

# Define Function for getting Date, Month, Year, Weekday, Hour from date column

In [9]:
def get_datetimeinfo(df):
    """Parse ``pickup_datetime`` and add its calendar parts as columns.

    ``pickup_datetime`` is expected to hold strings shaped like
    ``"2009-06-15 17:26:21 UTC"``; the column is overwritten with the parsed
    datetimes. The columns ``day``, ``month``, ``year``, ``weekday`` and
    ``hour`` are then added, in that order.

    The frame is modified in place and also returned.
    """
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")

    # Each new column is named after the .dt accessor attribute it is read from.
    stamp_parts = df['pickup_datetime'].dt
    for part in ('day', 'month', 'year', 'weekday', 'hour'):
        df[part] = getattr(stamp_parts, part)

    return df

# Apply all the above functions to the data

In [10]:
# Parse pickup_datetime and add the day / month / year / weekday / hour columns.
train = get_datetimeinfo(train)

In [11]:
## Remove Unwanted Columns
# 'key' and the raw 'pickup_datetime' are not used as model features.
train.drop(['key', 'pickup_datetime'], axis=1, inplace=True)
gc.collect()

98

In [12]:
# Pickup-to-dropoff distance and bearing for every trip.
train['distance'], train['bearing'] = havesine_distance_and_bearing(
    pickup_latitude=train['pickup_latitude'],
    pickup_longitude=train['pickup_longitude'],
    dropoff_latitude=train['dropoff_latitude'],
    dropoff_longitude=train['dropoff_longitude'],
    getBearing=True,
)

In [13]:
# Add the nyc / jfk / ewr / lga / sol distance columns.
train = add_airport_dist(train)

# Our Table after feature engineering

In [14]:
# Preview the first rows of the engineered feature table.
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,month,year,weekday,hour,distance,bearing,nyc_dist,jfk_dist,ewr_dist,lga_dist,sol_dist
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,15,6,2009,0,17,1.030764,2.918897,27.572573,20.26584,55.176046,14.342611,34.543548
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,5,1,2010,1,16,8.450134,0.375217,8.755732,44.667679,31.832358,23.130775,15.125872
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,18,8,2011,3,0,1.389525,-2.599961,9.847344,43.597686,33.712082,19.865289,17.722624
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,21,4,2012,5,4,2.79927,-0.133905,7.703421,42.642965,32.556289,21.063132,15.738963
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,9,3,2010,1,7,1.999157,0.502703,15.600745,43.329953,39.406828,15.219339,23.732406


# Extract the Target Variable into a Separate Series

In [15]:
# Keep the regression target aside and leave only feature columns in `train`.
y = train['fare_amount']
train = train.drop('fare_amount', axis=1)

# Split the data into train & validation set

In [16]:
# Hold out 9% of the rows for validation; the fixed seed keeps the split reproducible.
train, valid, y, y_valid = train_test_split(
    train,
    y,
    test_size=0.09,
    random_state=123,
)
gc.collect()

17

# Apply LightGBM Algorithm

In [17]:
# LightGBM hyper-parameters.
# Three entries of the previous dict were dropped because they never took effect:
#   * 'subsample': 0.8 is an alias of 'bagging_fraction', which is also set here (to 1);
#     LightGBM keeps the primary name and ignores the alias.
#   * 'scale_pos_weight' applies to classification objectives, not to 'regression'.
#   * 'num_rounds': 50000 is an alias of the boosting-round count and took precedence over
#     the num_boost_round argument; it is now passed explicitly to lgbm.train below.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': 4,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    # With bagging_fraction = 1 no rows are subsampled, so bagging_freq is inert.
    'bagging_fraction': 1,
    'max_bin': 4000,
    'bagging_freq': 20,
    'colsample_bytree': 0.6,
    'metric': 'rmse',
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    # NOTE(review): this makes zeros count as missing, which also hits hour == 0 and
    # weekday == 0 -- confirm that is intended.
    'zero_as_missing': True,
    'seed': 0,
}

# Calendar columns that LightGBM should treat as categorical.
categorical_columns = ['year', 'month', 'day', 'weekday']

# Wrap the frames in LightGBM datasets under their own names instead of rebinding
# `train` / `valid` to a different type.
train_set = lgbm.Dataset(train, y, categorical_feature=categorical_columns)
valid_set = lgbm.Dataset(valid, y_valid, categorical_feature=categorical_columns)

# The datasets hold the only references they need; release the notebook-level names
# before the long training run.
del train
del y
del valid
del y_valid
gc.collect()

# num_boost_round is the effective upper bound (previously supplied through
# params['num_rounds']); training stops earlier if the validation RMSE has not
# improved for 500 rounds.
# NOTE(review): newer LightGBM releases take early stopping / logging as callbacks
# rather than as these keyword arguments -- confirm against the installed version.
model = lgbm.train(
    params,
    train_set=train_set,
    num_boost_round=50000,
    early_stopping_rounds=500,
    verbose_eval=500,
    valid_sets=valid_set,
)

del train_set
del valid_set
gc.collect()



Training until validation scores don't improve for 500 rounds.
[500]	valid_0's rmse: 3.57317
[1000]	valid_0's rmse: 3.51153
[1500]	valid_0's rmse: 3.48382
[2000]	valid_0's rmse: 3.46569
[2500]	valid_0's rmse: 3.45255
[3000]	valid_0's rmse: 3.44425
[3500]	valid_0's rmse: 3.43776
[4000]	valid_0's rmse: 3.43124
[4500]	valid_0's rmse: 3.42686
[5000]	valid_0's rmse: 3.42277
[5500]	valid_0's rmse: 3.4193
[6000]	valid_0's rmse: 3.41521
[6500]	valid_0's rmse: 3.41159
[7000]	valid_0's rmse: 3.4093
[7500]	valid_0's rmse: 3.40688
[8000]	valid_0's rmse: 3.40471
[8500]	valid_0's rmse: 3.40312
[9000]	valid_0's rmse: 3.40108
[9500]	valid_0's rmse: 3.3996
[10000]	valid_0's rmse: 3.39767
[10500]	valid_0's rmse: 3.39613
[11000]	valid_0's rmse: 3.39502
[11500]	valid_0's rmse: 3.39379
[12000]	valid_0's rmse: 3.39289
[12500]	valid_0's rmse: 3.39165
[13000]	valid_0's rmse: 3.39086
[13500]	valid_0's rmse: 3.38982
[14000]	valid_0's rmse: 3.38917
[14500]	valid_0's rmse: 3.38797
[15000]	valid_0's rmse: 3.38728


49

# Read Test.CSV

In [18]:
# Load the competition test set.
test =  pd.read_csv('../input/test.csv')

# Perform Feature Engineering on Test.CSV

In [19]:
# Apply the same three feature-engineering steps, in the same order, as for the training data.
test = get_datetimeinfo(test)
test['distance'], test['bearing'] = havesine_distance_and_bearing(
    pickup_latitude=test['pickup_latitude'],
    pickup_longitude=test['pickup_longitude'],
    dropoff_latitude=test['dropoff_latitude'],
    dropoff_longitude=test['dropoff_longitude'],
    getBearing=True,
)
test = add_airport_dist(test)

In [20]:
## Remove Unwanted Columns
# Keep the row keys aside for the submission file before dropping the non-feature columns.
test_key = test['key']
test.drop(['key', 'pickup_datetime'], axis=1, inplace=True)

# Predict on test dataset

In [21]:
# Predict from the feature columns only. This cell stores 'fare_amount' on `test`, and the
# submission cell re-adds 'key'; excluding both here keeps the cell re-runnable instead of
# handing the model extra columns on a second execution.
test_features = test.drop(columns=['fare_amount', 'key'], errors='ignore')
test['fare_amount'] = model.predict(test_features, num_iteration=model.best_iteration)

# Save CSV File, so we can submit it to competition

In [22]:
# Re-attach the row keys and write the two-column submission file.
test['key'] = test_key
test.loc[:, ['key', 'fare_amount']].to_csv('submission.csv', index=False)