In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, MinMaxScaler
from numpy import loadtxt
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score


pd.set_option('display.max_rows', 100)
SEED = 8

In [30]:
def change_date(date_time_str):
    date_time_obj = datetime.datetime.strptime(date_time_str, '%m/%d/%Y %H:%M')
    d_time = date_time_obj.time()
    return [(d_time.hour * 60 + d_time.minute), date_time_obj.weekday()]


def get_distance(lat1, lon1, lat2, lon2):
    R = 6373.0
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)

    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    distance = R * c
    #     distance = np.where(distance==0.0, -1, distance)
    thresh_distance = np.where(distance < 1, 1, distance)

    #     print("Result:", distance)
    return thresh_distance, distance

In [26]:
def prepare_dataframe(df):
    pickup_time = np.array([change_date(x) for x in df['pickup_time'].values])
    df['pickup_time'], df['week_day'] = pickup_time[:, 0], pickup_time[:, 1]

    drop_time = np.array([change_date(x) for x in df['drop_time'].values])
    df['drop_time'] = drop_time[:, 0]

    df['lat_dif'] = np.abs(df['pick_lat'] - df['drop_lat'])
    df['lon_dif'] = np.abs(df['pick_lon'] - df['drop_lon'])

    df['trip_time'] = df['duration'] - df['meter_waiting_till_pickup'] - df['meter_waiting']
    df['mobile_fare'] = df['fare'] - df['additional_fare'] - df['meter_waiting_fare']

    df['trip_distance'], df['acct_trip_distance'] = get_distance(df['pick_lat'], df['pick_lon'], df['drop_lat'],
                                                                 df['drop_lon'])
    df['per_km'] = df['fare'] / df['trip_distance']
    df['duration'] = df['duration'] / 60

    df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, pd.get_dummies(df['week_day'].values)], axis=1)

    bins = [x * 60 for x in range(24)]
    df['time_bin'] = np.searchsorted(bins, df['pickup_time'].values)

    return df


In [25]:
def get_cleaned_df(train=True):
    if train:
        df = pd.read_csv("train.csv")
        df.dropna(inplace=True)

        df = prepare_dataframe(df)

        labels = df['label'] == 'correct'
        labels = labels.astype('int')
        df = df.drop(
            columns=['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon', 'trip_distance', 'tripid', 'week_day', 'label'])

        scaled_df = scale_data(df)

        return scaled_df, labels

    else:
        df = pd.read_csv("test.csv")
        df = prepare_dataframe(df)
        df = df.drop(
            columns=['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon', 'trip_distance', 'tripid', 'week_day'])
        scaled_df = scale_data(df, train=False)

        return scaled_df

In [27]:
def scale_data(df, train=True):
    global robust_scaler

    if train:
        x_scaled = robust_scaler.fit_transform(df.values)
        scaled = pd.DataFrame(x_scaled, columns=df.columns)

    else:
        x_scaled = robust_scaler.transform(df.values)
        scaled = pd.DataFrame(x_scaled, columns=df.columns)

    return scaled

In [150]:
train_df = pd.read_csv("train.csv")

In [151]:
booster_params = {
    'objective': 'binary:logistic'
    } 

results = {}

In [152]:
train_df['lat']  = abs(train_df['pick_lat']-train_df['drop_lat'])
train_df['lon']  = abs(train_df['pick_lon']-train_df['drop_lon'])

train_df['remaining_fare'] = train_df['fare']-train_df['additional_fare']-train_df['meter_waiting_fare']

train_df['mobile_time'] = train_df['duration']-train_df['meter_waiting']-train_df['meter_waiting_till_pickup']

train_df['fare'] = train_df['fare'].fillna(0)
train_df['duration'] = train_df['duration'] / 60

train_df['trip_distance'], train_df['acct_trip_distance'] = get_distance(train_df['pick_lat'], train_df['pick_lon'], train_df['drop_lat'],
                                                                 train_df['drop_lon'])

train_df['per_km'] = train_df['remaining_fare'] / train_df['trip_distance']

In [153]:
features_df = train_df.drop(['pick_lat','pick_lon','drop_lat','drop_lon',"pickup_time","drop_time", "label",'tripid','trip_distance'], axis = 1)

In [154]:
labels = train_df['label'] == 'correct'

In [155]:
labels = labels.astype('int')

In [156]:
features_df

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,lat,lon,remaining_fare,mobile_time,acct_trip_distance,per_km
0,10.5,13.900000,56.0,0.0000,64.0,270.32,0.04078,0.0210,259.8200,714.0,5.094369,51.001413
1,10.5,13.183333,47.0,0.0000,134.0,197.85,0.02784,0.0061,187.3500,610.0,3.169052,59.118624
2,10.5,18.116667,80.0,0.0000,61.0,301.64,0.02830,0.0495,291.1400,946.0,6.307375,46.158667
3,10.5,9.966667,271.0,15.6638,68.0,82.30,0.00178,0.0076,56.1362,259.0,0.862217,56.136200
4,,,,,,358.39,0.02963,0.0675,,,8.150340,
...,...,...,...,...,...,...,...,...,...,...,...,...
17171,10.5,13.966667,93.0,5.4219,451.0,198.26,0.00182,0.0190,182.3381,294.0,2.106037,86.578792
17172,10.5,35.850000,428.0,0.0000,39.0,581.23,0.04520,0.0873,570.7300,1684.0,10.871789,52.496419
17173,10.5,4.383333,9.0,0.0000,110.0,76.20,0.00925,0.0017,65.7000,144.0,1.045847,62.819915
17174,10.5,14.300000,115.0,0.0000,317.0,133.31,0.00966,0.0242,122.8100,426.0,2.879981,42.642648


In [171]:
imp_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
features = pd.DataFrame(imp_freq.fit_transform(features_df),columns=features_df.columns)

In [172]:
min_max_scalar = MinMaxScaler()
features = pd.DataFrame(robust_scaler.fit_transform(features),columns=features.columns)

In [173]:
features

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,lat,lon,remaining_fare,mobile_time,acct_trip_distance,per_km
0,0.0,-0.129187,-0.201893,0.000000,-0.067227,0.252317,0.718133,0.276265,0.333963,0.016251,0.400948,-0.016327
1,0.0,-0.163477,-0.230284,0.000000,0.521008,-0.048529,0.323289,-0.303502,0.010153,-0.096425,-0.010765,0.369220
2,0.0,0.072568,-0.126183,0.000000,-0.092437,0.382336,0.337326,1.385214,0.473906,0.267606,0.660340,-0.246345
3,0.0,-0.317384,0.476341,1.119835,-0.033613,-0.528213,-0.471890,-0.245136,-0.576134,-0.476706,-0.504063,0.227563
4,0.0,-0.767943,-0.378549,0.000000,-0.605042,0.617923,0.377908,2.085603,-0.603551,-0.321777,1.054443,-0.063891
...,...,...,...,...,...,...,...,...,...,...,...,...
17171,0.0,-0.125997,-0.085174,0.387622,3.184874,-0.046827,-0.470669,0.198444,-0.012241,-0.438787,-0.238082,1.673509
17172,0.0,0.921053,0.971609,0.000000,-0.277311,1.543002,0.853002,2.856031,1.723166,1.067172,1.636403,0.054682
17173,0.0,-0.584530,-0.350158,0.000000,0.319328,-0.553536,-0.243955,-0.474708,-0.533401,-0.601300,-0.464795,0.545022
17174,0.0,-0.110048,-0.015773,0.000000,2.058824,-0.316455,-0.231444,0.400778,-0.278223,-0.295775,-0.072581,-0.413347


In [174]:
robust_scaler = RobustScaler()

In [175]:
X_train, X_eval, y_train, y_eval = train_test_split(features, labels, test_size=0.33, shuffle=True, stratify=labels,random_state=SEED)

trip_train = xgb.DMatrix(X_train, label=y_train)
trip_test = xgb.DMatrix(X_eval, label=y_eval)

In [176]:
trip_model = xgb.train(booster_params,trip_train,num_boost_round=100,evals=[(trip_test, 'val')], early_stopping_rounds=10,evals_result=results,verbose_eval=0)
preds = trip_model.predict(trip_test)
preds = np.round(preds)

f1_score(y_eval,preds)

0.9747396837639799

In [177]:
model = xgb.XGBClassifier()
model.fit(X_train,y_train)

preds = model.predict(X_eval)
f1_score(y_eval,preds)

0.9753086419753086

In [49]:
features, labels = get_cleaned_df()

In [50]:
X_train, X_eval, y_train, y_eval = train_test_split(features, labels, test_size=0.33, shuffle=True, stratify=labels,random_state=SEED)

trip_train = xgb.DMatrix(X_train, label=y_train)
trip_test = xgb.DMatrix(X_eval, label=y_eval)

In [51]:
trip_model = xgb.train(booster_params,trip_train,num_boost_round=100,evals=[(trip_test, 'val')], early_stopping_rounds=10,evals_result=results,verbose_eval=0)
preds = trip_model.predict(trip_test)
preds = np.round(preds)

f1_score(y_eval,preds)

0.9736308316430021

In [52]:
model = xgb.XGBClassifier()
model.fit(X_train,y_train)

preds = model.predict(X_eval)
f1_score(y_eval,preds)

0.9743044822256569

In [None]:
trip_model.predict