In [20]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from numpy import savetxt
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from bayes_opt import BayesianOptimization

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

# Seed passed as random_state to both train_test_split calls so the splits are reproducible.
RANDOM_SEED = 8

In [21]:
# Module-level preprocessors shared by clean_dataframe(): the train=True call fits
# them (fit_transform) and the train=False call reuses them as-is (transform).
simple_imputer = SimpleImputer(strategy='most_frequent')  # missing values -> most frequent value
robust_scaler = RobustScaler()

In [22]:
def clean_dataframe(train=True):
    """Load a trip CSV, engineer features, and impute/scale them.

    Reads ``train.csv`` (``train=True``) or ``test.csv`` (``train=False``),
    indexed by ``tripid``, derives five extra columns and removes the raw
    timestamp, coordinate and ``duration`` columns before preprocessing.

    Derived columns:
        duration_in_minutes: ``drop_time - pickup_time`` expressed in minutes.
        lat, lon: absolute pickup/drop-off differences in latitude/longitude.
        remaining_fare: ``fare - additional_fare - meter_waiting_fare``.
        mobile_time: ``duration - meter_waiting - meter_waiting_till_pickup``.

    Args:
        train: When True, fit the module-level ``simple_imputer`` and
            ``robust_scaler`` on the data and also return the labels. When
            False, only apply the already-fitted preprocessors, so a
            ``train=True`` call has to come first.

    Returns:
        ``(features, labels)`` when ``train`` is True, where ``labels`` is an
        integer Series that is 1 where ``label == 'correct'`` and 0 otherwise;
        just ``features`` when ``train`` is False. ``features`` is the output
        of the imputer followed by the scaler.
    """
    csv_path = "train.csv" if train else "test.csv"
    df = pd.read_csv(csv_path, index_col="tripid")

    # Parse each timestamp column exactly once and keep the parsed values in
    # locals: they are only needed to derive the trip duration, so there is no
    # point in adding them to the frame just to drop them again.
    dropoff_at = pd.to_datetime(df['drop_time'])
    pickup_at = pd.to_datetime(df['pickup_time'])
    df['duration_in_minutes'] = (dropoff_at - pickup_at) / pd.Timedelta(minutes=1)

    df['lat'] = (df['pick_lat'] - df['drop_lat']).abs()
    df['lon'] = (df['pick_lon'] - df['drop_lon']).abs()

    df['remaining_fare'] = df['fare'] - df['additional_fare'] - df['meter_waiting_fare']
    df['mobile_time'] = df['duration'] - df['meter_waiting'] - df['meter_waiting_till_pickup']

    # NOTE(review): missing fares are zero-filled only after remaining_fare has
    # been derived, so remaining_fare stays NaN for those rows and is filled by
    # the imputer instead -- confirm this ordering is intended.
    df['fare'] = df['fare'].fillna(0)

    # Raw inputs that have been replaced by the engineered columns above.
    features = df.drop(
        columns=["duration", 'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon',
                 "pickup_time", "drop_time"])

    if train:
        labels = (df['label'] == 'correct').astype('int')
        features = features.drop(columns=['label'])
        # Fit the shared preprocessors on training data only.
        features = simple_imputer.fit_transform(features)
        features = robust_scaler.fit_transform(features)
        return features, labels

    # Test data: reuse the statistics learned from the training data.
    features = simple_imputer.transform(features)
    features = robust_scaler.transform(features)
    return features

In [23]:
# Order matters: the training call fits the shared imputer/scaler that the
# test call (train=False) then applies without refitting.
features, labels = clean_dataframe()
sub = clean_dataframe(train=False)

In [24]:
# Sanity check: the feature matrix and the label vector must have the same number of rows.
features.shape, labels.shape

((17176, 10), (17176,))

In [25]:
# First split: hold out 20% of the rows as (X_eval, y_eval), stratified on the labels.
X_train, X_eval, y_train, y_eval = train_test_split(
    features,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=RANDOM_SEED,
)

In [26]:
# Number of rows left in the training portion after the first split.
len(X_train)

13740

In [27]:
# Second split: carve another 20% out of the training portion as (X_test, y_test),
# which the model-fitting cell passes as eval_set.
# NOTE(review): this cell overwrites its own inputs (X_train, y_train), so running
# it more than once without re-running the previous split shrinks the training
# set again each time.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    stratify=y_train,
    random_state=RANDOM_SEED,
)

In [28]:
# Inspect the validation labels (a Series indexed by tripid).
y_test

tripid
197286960    1
204184374    1
209714384    1
205031570    1
190273835    1
            ..
193093216    1
208887260    1
208952012    1
195814383    1
189337200    1
Name: label, Length: 2748, dtype: int32

In [29]:
# Row counts of the final training set and of the set used for early stopping.
len(X_train), len(X_test)

(10992, 2748)

In [30]:
def f1_eval(y_pred, dtrain):
    """Custom evaluation metric reporting ``1 - F1`` so that lower is better.

    Args:
        y_pred: Predicted scores; they are rounded with ``np.round`` to obtain
            hard class predictions before scoring.
        dtrain: Object exposing ``get_label()`` that returns the true labels
            (presumably an xgboost ``DMatrix`` -- confirm against the caller).

    Returns:
        Tuple ``('f1_err', error)`` where ``error`` is ``1 - f1_score``.
    """
    true_labels = dtrain.get_label()
    hard_predictions = np.round(y_pred)
    f1 = f1_score(true_labels, hard_predictions)
    return 'f1_err', 1 - f1

In [31]:
# Baseline classifier with library-default hyper-parameters. Training is monitored
# on (X_test, y_test) with the custom f1_err metric and stops once that metric has
# not improved for 10 consecutive rounds.
trip_model = XGBClassifier()
trip_model.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=f1_eval,
    early_stopping_rounds=10,
)

[0]	validation_0-error:0.05968	validation_0-f1_err:0.03223
Multiple eval metrics have been passed: 'validation_0-f1_err' will be used for early stopping.

Will train until validation_0-f1_err hasn't improved in 10 rounds.
[1]	validation_0-error:0.05895	validation_0-f1_err:0.03184
[2]	validation_0-error:0.05859	validation_0-f1_err:0.03164
[3]	validation_0-error:0.05713	validation_0-f1_err:0.03087
[4]	validation_0-error:0.05604	validation_0-f1_err:0.03029
[5]	validation_0-error:0.05640	validation_0-f1_err:0.03053
[6]	validation_0-error:0.05531	validation_0-f1_err:0.02994
[7]	validation_0-error:0.05349	validation_0-f1_err:0.02897
[8]	validation_0-error:0.05459	validation_0-f1_err:0.02957
[9]	validation_0-error:0.05240	validation_0-f1_err:0.02840
[10]	validation_0-error:0.05313	validation_0-f1_err:0.02881
[11]	validation_0-error:0.05095	validation_0-f1_err:0.02765
[12]	validation_0-error:0.05167	validation_0-f1_err:0.02806
[13]	validation_0-error:0.05058	validation_0-f1_err:0.02748
[14]	va

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)