In [2]:
# Initialization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import radians, cos, sin, asin, sqrt, pi
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import shap

In [3]:
# Read train data (10%) as dataframes
train_df = pd.read_csv('train.csv', nrows = 5000000)

In [4]:
# Transfer csv file to libsvm file for convenience
import csv
def csv_to_libsvm(file1, file2):
    csv_csv = csv.reader(open(file1,'r') )
    csv_list = []
    for i in csv_csv:
        csv_list.append(i)
    csv_list = csv_list[1:len(csv_list)]
    fl = open(file2,'w')
    length = len(csv_list[0])
    for i in csv_list:
        fl.write(str(i[0]))
        for j in range(1,length):
            fl.write(' '+str(j)+':'+str(i[j]))
        fl.write('\n')

# Add a features 'distance' to dataframe
def add_distance(df):
    distance = []
    lat1_r = df['pickup_latitude'].apply(lambda x: radians(x))
    lat2_r = df['dropoff_latitude'].apply(lambda x: radians(x))
    lon1_r = df['pickup_longitude'].apply(lambda x: radians(x))
    lon2_r = df['dropoff_longitude'].apply(lambda x: radians(x))
    
    dlon = (lon2_r - lon1_r).abs()
    dlat = (lat2_r - lat1_r).abs()
    
    dlat = dlat.apply(lambda x: sin(x/2) ** 2)
    lat1_r = lat1_r.apply(lambda x: cos(x))
    lat2_r = lat2_r.apply(lambda x: cos(x))
    dlon = dlon.apply(lambda x: sin(x/2) ** 2)
    a = dlat + lat1_r * lat2_r * dlon
    c = a.apply(lambda x: 2 * asin(sqrt(x)))
    r = 6371
    df['distance'] = c * r * 1000
    
def compute_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):

    lat1_r, lon1_r, lat2_r, lon2_r = map(np.radians,[pickup_lat, pickup_lon, dropoff_lat, dropoff_lon])
    dlon = (lon2_r - lon1_r).abs()
    dlat = (lat2_r - lat1_r).abs()
    
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2
    r = 6371
    return 2 * r * np.arcsin(np.sqrt(a))

# Add two features 'hour' and 'weekday' to dataframe
def add_datetime_info(dataset):
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'],format="%Y-%m-%d %H:%M:%S UTC")
    dataset['hour'] = dataset.pickup_datetime.dt.hour
    dataset['weekday'] = dataset.pickup_datetime.dt.weekday
    
# Add all time features to dataframe
def add_datetime_info(dataset):
    dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'],format="%Y-%m-%d %H:%M:%S UTC")
    dataset['hour'] = dataset.pickup_datetime.dt.hour
    dataset['day'] = dataset.pickup_datetime.dt.day
    dataset['month'] = dataset.pickup_datetime.dt.month
    dataset['weekday'] = dataset.pickup_datetime.dt.weekday
    dataset['year'] = dataset.pickup_datetime.dt.year
    
# Add the minimum distance to three airports to dataframe
def add_airport_dist(dataset):
    jfk = (40.639722, -73.778889)
    ewr = (40.6925, -74.168611)
    lga = (40.77725, -73.872611)
    
    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']
    
    pickup_jfk = compute_distance(pickup_lat, pickup_lon, jfk[0], jfk[1]) 
    dropoff_jfk = compute_distance(jfk[0], jfk[1], dropoff_lat, dropoff_lon) 
    pickup_ewr = compute_distance(pickup_lat, pickup_lon, ewr[0], ewr[1])
    dropoff_ewr = compute_distance(ewr[0], ewr[1], dropoff_lat, dropoff_lon) 
    pickup_lga = compute_distance(pickup_lat, pickup_lon, lga[0], lga[1]) 
    dropoff_lga = compute_distance(lga[0], lga[1], dropoff_lat, dropoff_lon)
    
    pickup_jfk = pickup_jfk.tolist()
    dropoff_jfk = dropoff_jfk.tolist()
    pickup_ewr = pickup_ewr.tolist()
    dropoff_ewr = dropoff_ewr.tolist()
    pickup_lga = pickup_lga.tolist()
    dropoff_lga = dropoff_lga.tolist()
    
    min_jfk = []
    min_ewr = []
    min_lga = []
    
    for i in range(len(pickup_jfk)):
        min_jfk.append(min(pickup_jfk[i], dropoff_jfk[i]))
        min_ewr.append(min(pickup_ewr[i], dropoff_ewr[i]))
        min_lga.append(min(pickup_lga[i], dropoff_lga[i]))
        
    dataset['jfk_dist'] = min_jfk
    dataset['ewr_dist'] = min_ewr
    dataset['lga_dist'] = min_lga

In [5]:
# Add two features 'diff_lon' and 'diff_lat' to train and test dataframe
def add_diff_lon_lat(df):
    df['diff_lon'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['diff_lat'] = (df.dropoff_latitude - df.pickup_latitude).abs()

add_diff_lon_lat(train_df)

# Remove the data with null value
print('The size of data before processing: ', len(train_df))
print("Removing null data...")
train_df = train_df[(train_df.pickup_longitude > -180) & (train_df.pickup_longitude != 0) & (train_df.pickup_latitude != 0)]
print('The size of data after processing: ', len(train_df))

# Remove the data with strange distance value
print('\nThe size of data before processing: ', len(train_df))
print("Removing strange data...")
train_df = train_df[(train_df.diff_lon < 5.0) & (train_df.diff_lat < 5.0)]
print('The size of data after processing: ', len(train_df))

# Remove the data with strange latitude and longitude
print('\nThe size of data before processing: ', len(train_df))
print("Removing strange data...")
train_df = train_df[(train_df.pickup_longitude < -73) & (train_df.pickup_longitude > -75)]
train_df = train_df[(train_df.pickup_latitude < 42) & (train_df.pickup_latitude > 40)]
train_df = train_df[(train_df.dropoff_longitude < -73) & (train_df.dropoff_longitude > -75)]
train_df = train_df[(train_df.dropoff_latitude < 42) & (train_df.dropoff_latitude > 40)]
print('The size of data after processing: ', len(train_df))

# Add 'distance' to both train and test dataframe
add_distance(train_df)
add_datetime_info(train_df)
add_airport_dist(train_df)

The size of data before processing:  5000000
Removing null data...
The size of data after processing:  4904621

The size of data before processing:  4904621
Removing strange data...
The size of data after processing:  4898989

The size of data before processing:  4898989
Removing strange data...
The size of data after processing:  4894601


In [6]:
train_x_lgbm = train_df[['distance', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year', 'month', 'day', 'hour', 'weekday', 'jfk_dist', 'ewr_dist', 'lga_dist']]
train_y_lgbm = train_df['fare_amount']

params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':20000
    }

x_train,x_test,y_train,y_test = train_test_split(train_x_lgbm, train_y_lgbm, random_state=123, test_size=0.10)

train_set = lgbm.Dataset(x_train, y_train, silent=False)
valid_set = lgbm.Dataset(x_test, y_test, silent=False)
model = lgbm.train(params, train_set = train_set, num_boost_round=10000,early_stopping_rounds=500,verbose_eval=500, valid_sets=valid_set)



Training until validation scores don't improve for 500 rounds.
[500]	valid_0's rmse: 3.7164
[1000]	valid_0's rmse: 3.67149
[1500]	valid_0's rmse: 3.64893
[2000]	valid_0's rmse: 3.63605
[2500]	valid_0's rmse: 3.62797
[3000]	valid_0's rmse: 3.62211
[3500]	valid_0's rmse: 3.61477
[4000]	valid_0's rmse: 3.61102
[4500]	valid_0's rmse: 3.60776
[5000]	valid_0's rmse: 3.60565
[5500]	valid_0's rmse: 3.60189
[6000]	valid_0's rmse: 3.60002
[6500]	valid_0's rmse: 3.59809
[7000]	valid_0's rmse: 3.59736
[7500]	valid_0's rmse: 3.59595
[8000]	valid_0's rmse: 3.59444
[8500]	valid_0's rmse: 3.59332
[9000]	valid_0's rmse: 3.59232
[9500]	valid_0's rmse: 3.59199
Early stopping, best iteration is:
[9145]	valid_0's rmse: 3.59194


In [7]:
import tkinter #导入tkinter模块
import numpy as np

root  = tkinter.Tk()
width = 340
height = 540
root.minsize(width, height)
root.title('NYC Taxi Fare Predictor')

# Author
author = tkinter.StringVar()
author.set("Developed by Di Wu / Shuhao Qiao / Yunfei Wang ")
author_label = tkinter.Label(root, font = ('Helvetica', 12, 'italic'), anchor = 'se', textvariable = author)
author_label.place(height = height, width = width)

fare_title = tkinter.StringVar()
fare_title.set("The taxi fare is predicted as")
fare_title_label = tkinter.Label(root, font = ('Helvetica', 14, 'italic'), fg = "#4F4F4F", anchor = 'w', textvariable = fare_title)
fare_title_label.place(x = 35, y = 50, width = width)

line = tkinter.StringVar()
line.set("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")
line_label = tkinter.Label(root, font = ('Helvetica', 14, 'italic'), fg = "#4F4F4F", anchor = 'center', textvariable = line)
line_label.place(y = 180, width = width)

fare = tkinter.StringVar()
fare.set("$0.00")
fare_label = tkinter.Label(root, font = ('Helvetica', 90, 'italic'), fg = "#4F4F4F", anchor = 'center', textvariable = fare)
fare_label.place(y = 90, width = width)

pu_lat = tkinter.StringVar()
pu_lat.set("Pick-up latitude:")
pu_lat_label = tkinter.Label(root, font = ('Helvetica', 14, 'italic'), anchor = 'w', textvariable = pu_lat)
pu_lat_label.place(y = 280, x = 40)

pu_lon = tkinter.StringVar()
pu_lon.set("Pick-up longitude:")
pu_lon_label = tkinter.Label(root, font = ('Helvetica', 14, 'italic'), anchor = 'w', textvariable = pu_lon)
pu_lon_label.place(y = 310, x = 40)

do_lat = tkinter.StringVar()
do_lat.set("Drop-off latitude:")
do_lat_label = tkinter.Label(root, font = ('Helvetica', 14, 'italic'), anchor = 'w', textvariable = do_lat)
do_lat_label.place(y = 340, x = 40)

do_lon = tkinter.StringVar()
do_lon.set("Drop-off longitude:")
do_lon_label = tkinter.Label(root, font = ('Helvetica', 14, 'italic'), anchor = 'w', textvariable = do_lon)
do_lon_label.place(y = 370, x = 40)

time = tkinter.StringVar()
time.set("Time:")
time_label = tkinter.Label(root, font = ('Helvetica', 14, 'italic'), anchor = 'w', textvariable = time)
time_label.place(y = 400, x = 40)

lists = []

bt_rdm = tkinter.Button(root, text = 'Random', font = ('Helvetica', 18, 'italic'), command = lambda: Random())
bt_rdm.place(x = 50, y = 440, width = 110, height = 50)

bt_pre = tkinter.Button(root, text = 'Predict', font = ('Helvetica', 18, 'italic'), command = lambda: Predict())
bt_pre.place(x = 180, y = 440, width = 110, height = 50)

tx_pu_lat = tkinter.Text(root, font = ('Helvetica', 20))
tx_pu_lat.insert('1.0','40.8090')
tx_pu_lat.place(y = 275, x = 170, width = 130, height = 30)

tx_pu_lon = tkinter.Text(root, font = ('Helvetica', 20))
tx_pu_lon.place(y = 305, x = 170, width = 130, height = 30)
tx_pu_lon.insert('1.0','-73.9613')

tx_do_lat = tkinter.Text(root, font = ('Helvetica', 20))
tx_do_lat.place(y = 335, x = 170, width = 130, height = 30)
tx_do_lat.insert('1.0','40.7581')

tx_do_lon = tkinter.Text(root, font = ('Helvetica', 20))
tx_do_lon.place(y = 365, x = 170, width = 130, height = 30)
tx_do_lon.insert('1.0','-73.9855')

tx_time = tkinter.Text(root, font = ('Helvetica', 20))
tx_time.place(y = 395, x = 80, width = 250, height = 30)
tx_time.insert('1.0', '2018-12-13 20:00:00 UTC')

pick = tkinter.StringVar()
pick.set("PICK")
pick_label = tkinter.Label(root, font = ('Helvetica', 12, 'italic', 'underline'), anchor = 'w', textvariable = pick)
pick_label.place(y = 220, x = 30)

drop = tkinter.StringVar()
drop.set("DROP")
drop_label = tkinter.Label(root, font = ('Helvetica', 12, 'italic', 'underline'), anchor = 'w', textvariable = drop)
drop_label.place(y = 240, x = 30)

# Positions

cu = tkinter.Button(root, text = 'Columbia', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_pick_up(40.8091, -73.9613))
cu.place(x = 70, y = 220, width = 55, height = 20)
ts = tkinter.Button(root, text = 'Time Square', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_pick_up(40.7576, -73.9858))
ts.place(x = 125, y = 220, width = 70, height = 20)
jfk = tkinter.Button(root, text = 'JFK', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_pick_up(40.6455, -73.7854))
jfk.place(x = 195, y = 220, width = 30, height = 20)
un = tkinter.Button(root, text = 'United Nations', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_pick_up(40.7507, -73.9677))
un.place(x = 225, y = 220, width = 80, height = 20)

bp = tkinter.Button(root, text = 'Battery Park', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_drop_off(40.7039, -74.0153))
bp.place(x = 70, y = 240, width = 75, height = 20)
fs = tkinter.Button(root, text = 'Flushing', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_drop_off(40.7591, -73.8301))
fs.place(x = 145, y = 240, width = 55, height = 20)
ct = tkinter.Button(root, text = 'China Town', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_drop_off(40.7164, -73.9961))
ct.place(x = 200, y = 240, width = 70, height = 20)
ikea = tkinter.Button(root, text = 'IKEA', font = ('Helvetica', 12, 'italic'), anchor='center', command = lambda: set_drop_off(40.6721, -74.0112))
ikea.place(x = 270, y = 240, width = 35, height = 20)


def set_pick_up(a,b):
    tx_pu_lat.delete('1.0', '2.0')
    tx_pu_lat.insert('1.0', a)
    tx_pu_lon.delete('1.0', '2.0')
    tx_pu_lon.insert('1.0', b)

def set_drop_off(a,b):
    tx_do_lat.delete('1.0', '2.0')
    tx_do_lat.insert('1.0', a)
    tx_do_lon.delete('1.0', '2.0')
    tx_do_lon.insert('1.0', b)

def Random():
    tx_pu_lat.delete('1.0', '2.0')
    tx_pu_lat.insert('1.0', round(-74.1+0.3*np.random.rand(), 4))

    tx_pu_lon.delete('1.0', '2.0')
    tx_pu_lon.insert('1.0', round(40.6+0.3*np.random.rand(), 4))

    tx_do_lat.delete('1.0', '2.0')
    tx_do_lat.insert('1.0', round(-74.1+0.3*np.random.rand(), 4))

    tx_do_lon.delete('1.0', '2.0')
    tx_do_lon.insert('1.0', round(40.6+0.3*np.random.rand(), 4))

    tx_time.delete('1.0', '2.0')
    tx_time.insert('1.0', '2018-12-'+str(13+round(17*np.random.rand()))+' '+str(round(24*np.random.rand()))+':00:00 UTC')

def Predict():
    pickup_latitude = float(tx_pu_lat.get('1.0', '2.0'))
    pickup_longitude = float(tx_pu_lon.get('1.0', '2.0'))
    dropoff_latitude = float(tx_do_lat.get('1.0', '2.0'))
    dropoff_longitude = float(tx_do_lon.get('1.0', '2.0'))
    pickup_datetime = tx_time.get('1.0', 'end-1c')
    passenger_count = 2
    test = {'pickup_longitude': [pickup_longitude], 'pickup_latitude': [pickup_latitude], 'dropoff_longitude': [dropoff_longitude], 'dropoff_latitude': [dropoff_latitude], 'passenger_count': [passenger_count], 'pickup_datetime': [pickup_datetime]}
    test = pd.DataFrame(data = test)

    add_distance(test)
    add_datetime_info(test)
    add_airport_dist(test)

    test = test[['distance', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year', 'month', 'day', 'hour', 'weekday', 'jfk_dist', 'ewr_dist', 'lga_dist']]
    prediction = model.predict(test, num_iteration = model.best_iteration)

    fare.set('$'+str(round(prediction[0], 2)))

root.mainloop()

KeyboardInterrupt: 