In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import AdaBoostRegressor
import time
import datetime
from datetime import date
import re
from sklearn.model_selection import train_test_split

In [2]:
def preProcess(input_numpy, drop_coordinates=False):
    """Filter implausible rows and append absolute latitude/longitude deltas.

    Columns are addressed by position: column 1 is treated as the fare, the
    last column as the passenger count, and columns 3-6 as pickup longitude,
    pickup latitude, dropoff longitude and dropoff latitude.

    Parameters
    ----------
    input_numpy : numpy.ndarray
        2-D array of raw rows.
    drop_coordinates : bool, default False
        When True, the four raw coordinate columns (3, 4, 5, 6) are removed
        after the deltas have been computed. The default keeps them: the
        original code called ``DataFrame.drop`` without using its return
        value, so the columns were never removed, and the rest of the
        notebook relies on that column layout.

    Returns
    -------
    numpy.ndarray
        The surviving rows with ``diff_latitude`` and ``diff_longitude``
        appended as the last two columns.
    """
    print("Initial input shape ",input_numpy.shape)
    # Keep only rows with a strictly positive fare.
    input_numpy = input_numpy[input_numpy[:, 1] > 0]
    # Keep only rows with 1 to 6 passengers.
    input_numpy = input_numpy[input_numpy[:, -1] < 7]
    input_numpy = input_numpy[input_numpy[:, -1] > 0]
    # Go through a DataFrame so rows with any missing value can be dropped.
    input_df = pd.DataFrame(input_numpy).dropna(how='any', axis=0)
    # Absolute coordinate deltas between dropoff and pickup.
    input_df['diff_latitude'] = (input_df[6] - input_df[4]).abs()
    input_df['diff_longitude'] = (input_df[5] - input_df[3]).abs()
    if drop_coordinates:
        # DataFrame.drop returns a new frame; the result must be re-bound.
        input_df = input_df.drop([3, 4, 5, 6], axis=1)
    print(input_df.shape)
    return input_df.values

In [3]:
def getFeatureSetandLabel(input_numpy):
    """Split a 2-D array into features and labels.

    Column 1 is returned as the label vector and every column from index 2
    onward as the feature matrix; column 0 is left out of both.
    """
    return input_numpy[:, 2:], input_numpy[:, 1]


In [4]:
import calendar
# Returns 1 for each date that falls on a weekend and 0 for each weekday.
def getDays(year, month, day):
    """Flag each date as weekend (1) or weekday (0).

    The three sequences are read position by position, and the length of
    ``day`` decides how many dates are examined. ``calendar.weekday`` numbers
    Monday as 0, so results of 5 and 6 are Saturday and Sunday.
    """
    return [
        1 if calendar.weekday(year[idx], month[idx], day[idx]) >= 5 else 0
        for idx in range(len(day))
    ]

In [5]:
# Returns 1 if the cab is booked at night (10pm-6am) and 0 if it is booked during the day.
def getNightShift(hour):
    """Flag each hour as night (1) or day (0).

    An hour counts as night when it lies in 0-5 inclusive or equals 22 or 23.
    """
    return [
        1 if (0 <= hour[pos] < 6 or hour[pos] in (22, 23)) else 0
        for pos in range(len(hour))
    ]

In [6]:
# Split the hour:minute:sec strings and keep only the hour.
def splitTimetoHMS(Time):
    """Return the hour of each ``hour:minute:second`` string as a list of ints."""
    # Splitting on ":" yields one row of string fields per timestamp.
    pieces = np.asarray([stamp.split(":") for stamp in Time])
    return [int(hh) for hh in pieces[:, 0]]

In [7]:
# Return the year (as a NumPy array) and the month and day (as lists).
def splitDatetoYMD(dates):
    """Break ``YYYY-MM-DD`` strings into integer year, month and day.

    The year is returned as a NumPy array; month and day are plain lists.
    """
    fields = np.asarray([stamp.split("-") for stamp in dates])
    y_col, m_col, d_col = fields[:, 0], fields[:, 1], fields[:, 2]
    year = np.asarray([int(v) for v in y_col])
    month = [int(v) for v in m_col]
    day = [int(v) for v in d_col]
    return year, month, day

In [8]:
def splitDateTime(input_features):
    """Split column 0 of ``input_features`` on spaces.

    Returns the first token of every entry (the date) and the second token
    (the time) as two NumPy arrays.
    """
    tokens = np.asarray([stamp.split(' ') for stamp in input_features[:, 0]])
    date_part = tokens[:, 0]
    time_part = tokens[:, 1]
    return date_part, time_part

In [9]:
def getDateandTimeColumnsNumpy(input_features):
    """Derive year, weekend flag and night flag from the timestamps in column 0.

    Parameters
    ----------
    input_features : numpy.ndarray
        2-D array whose column 0 holds timestamp strings; the last three
        characters of each string (the "UTC" suffix) are ignored.

    Returns
    -------
    tuple of numpy.ndarray
        ``(year, weekend_flags, night_flags)``, one entry per input row.

    The caller's array is left untouched. The earlier version wrote the
    stripped strings back into column 0, so running it twice on the same
    array stripped three more characters from every timestamp.
    """
    # Strip the suffix into a private one-column array instead of mutating the input.
    stamps = np.empty((len(input_features), 1), dtype=object)
    stamps[:, 0] = [x[:-3] for x in input_features[:, 0]]
    dates, times = splitDateTime(stamps)
    year, month, day = splitDatetoYMD(dates)
    hour = splitTimetoHMS(times)
    daysCount = np.asarray(getDays(year.tolist(), month, day))
    nightCount = np.asarray(getNightShift(hour))
    return year, daysCount, nightCount


In [10]:
def joinColumnsToNumpy(year,daysCount,secsCount,input_features):
    """Append three 1-D arrays to ``input_features`` as new trailing columns.

    The columns are added in the order ``year``, ``daysCount``, ``secsCount``.
    A new array is returned.
    """
    widened = input_features
    for extra in (year, daysCount, secsCount):
        # [:, None] turns the 1-D vector into a single column for hstack.
        widened = np.hstack((widened, extra[:, None]))
    return widened

In [11]:
# Load the training data; 'train.csv' is resolved relative to the working directory.
df=pd.read_csv('train.csv')
print(df.head())
# Work on the raw array: the helper functions above address columns by position.
input_numpy= df.values
print("Done loading the file")

                             key  fare_amount          pickup_datetime  \
0    2009-06-15 17:26:21.0000001          4.5  2009-06-15 17:26:21 UTC   
1    2010-01-05 16:52:16.0000002         16.9  2010-01-05 16:52:16 UTC   
2   2011-08-18 00:35:00.00000049          5.7  2011-08-18 00:35:00 UTC   
3    2012-04-21 04:30:42.0000001          7.7  2012-04-21 04:30:42 UTC   
4  2010-03-09 07:51:00.000000135          5.3  2010-03-09 07:51:00 UTC   

   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  \
0        -73.844311        40.721319         -73.841610         40.712278   
1        -74.016048        40.711303         -73.979268         40.782004   
2        -73.982738        40.761270         -73.991242         40.750562   
3        -73.987130        40.733143         -73.991567         40.758092   
4        -73.968095        40.768008         -73.956655         40.783762   

   passenger_count  
0                1  
1                1  
2                2  
3       

In [12]:
# Filter the rows and append the coordinate-delta columns.
# NOTE(review): input_numpy is rebound to the processed array, so re-running this
# cell passes already-processed data back into preProcess.
input_numpy= preProcess(input_numpy)
print("After preprocessing the input size is ",input_numpy.shape)

Initial input shape  (1000, 8)
(997, 10)
After preprocessing the input size is  (997, 10)


In [13]:
# Column 1 becomes the label vector; columns 2 onward become the features.
input_features, input_labels= getFeatureSetandLabel(input_numpy)
print("Done with splitting features and labels")


Done with splitting features and labels


In [14]:
# Derive year, weekend flag and night flag from the timestamp in feature column 0.
# The third value holds the night-shift flags despite the "secsCount" name.
input_year,input_daysCount,input_secsCount= getDateandTimeColumnsNumpy(input_features)
print("Obtained the columns to be added to the numpy")

Obtained the columns to be added to the numpy


In [15]:
# Append the three derived columns to the right of the feature matrix.
# NOTE(review): input_features is rebound, so re-running this cell appends them again.
input_features = joinColumnsToNumpy(input_year,input_daysCount,input_secsCount,input_features)
print("Added the columns to the numpy")

Added the columns to the numpy


In [16]:
# Drop column 0 (the timestamp string) now that the date/time features are derived.
# NOTE(review): re-running this cell deletes one more column each time.
input_features=np.delete(input_features,0, 1)
print("Removed the first feature")

Removed the first feature


In [17]:
# Sanity check of the final training-matrix shape before fitting.
print("Input shape ",input_features.shape)

Input shape  (997, 10)


In [18]:
# Model selection: exactly one `clf = ...` assignment should be active.
# To try an alternative, comment out the active pair and uncomment another one.
# NOTE(review): no random_state is passed, so fitted trees may differ between runs;
# consider pinning a seed for reproducibility.
# from sklearn.svm import SVR
# clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor()
# from sklearn.linear_model import HuberRegressor, LinearRegression
# clf= HuberRegressor()
# clf = AdaBoostRegressor()
# from sklearn.linear_model import LinearRegression
# clf=LinearRegression()

In [19]:
# Fit the selected regressor on the full training matrix (no hold-out split is made here).
clf.fit(input_features,input_labels)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [20]:
# Load the test data; 'test.csv' is resolved relative to the working directory.
test_df = pd.read_csv('test.csv')
print(test_df.head())
# Add the same absolute coordinate deltas that preProcess appends to the training rows.
test_df['diff_latitude']= (test_df.dropoff_latitude - test_df.pickup_latitude).abs()
test_df['diff_longitude']= (test_df.dropoff_longitude - test_df.pickup_longitude).abs()
# The raw coordinate columns are kept on purpose. An earlier `test_df.drop([...], axis=1)`
# call here discarded its result (and named 'pickup_latitude' twice instead of
# 'pickup_longitude'), so it never removed anything. The training matrix keeps the
# coordinate columns as well, and predict() needs the same column layout.
test_numpy=  test_df.values

                           key          pickup_datetime  pickup_longitude  \
0  2015-01-27 13:08:24.0000002  2015-01-27 13:08:24 UTC        -73.973320   
1  2015-01-27 13:08:24.0000003  2015-01-27 13:08:24 UTC        -73.986862   
2  2011-10-08 11:53:44.0000002  2011-10-08 11:53:44 UTC        -73.982524   
3  2012-12-01 21:12:12.0000002  2012-12-01 21:12:12 UTC        -73.981160   
4  2012-12-01 21:12:12.0000003  2012-12-01 21:12:12 UTC        -73.966046   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
0        40.763805         -73.981430         40.743835                1  
1        40.719383         -73.998886         40.739201                1  
2        40.751260         -73.979654         40.746139                1  
3        40.767807         -73.990448         40.751635                1  
4        40.789775         -73.988565         40.744427                1  


In [21]:
# Drop the leading `key` column so that column 0 is the pickup timestamp.
# NOTE(review): re-running this cell slices off one more column each time.
test_numpy= test_numpy[:,1:]

In [22]:
# Derive year, weekend flag and night flag for the test rows (secsCount holds the night flags).
year,daysCount,secsCount = getDateandTimeColumnsNumpy(test_numpy)

In [23]:
# Append the derived columns in the same order used for the training matrix.
test_numpy=joinColumnsToNumpy(year,daysCount,secsCount,test_numpy)

In [24]:
# Remove column 0 (the timestamp string), mirroring the training-side step.
# NOTE(review): re-running this cell deletes one more column each time.
test_numpy=np.delete(test_numpy,0, 1)

In [25]:
# Despite the name, test_labels holds the model's predicted fares, not ground truth.
test_labels= clf.predict(test_numpy)

In [26]:
# Attach the predictions to the test frame and write the key/fare pairs to submission.csv.
test_df["fare_amount"]=test_labels
test_df[["key","fare_amount"]].to_csv("submission.csv",index=False)
print("Done")

Done
