In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.metrics import log_loss, average_precision_score

In [35]:
import pickle
def load_model(filepath):
    with open(filepath, "rb") as file:
        return pickle.load(file)
    
def report_metrics(model, X_test, y_test):
    y_pred = model.predict_proba(X_test)[:, 1]
    pr_auc = average_precision_score(y_score=y_pred, y_true=y_test)
    logloss = log_loss(y_pred=y_pred, y_true=y_test)
    print(f'PR-AUC: {pr_auc}')
    print(f'Log loss: {logloss}')


def save_model(model, filename: str, save_to: str = '../models/') -> None:
    with open(save_to + filename, "wb") as file:
        pickle.dump(model, file)

    print(f"Model saved to {save_to + filename}")


numeric_features = ['lead_response_time', 'name_length', 'model_description_length']
categorical_features = ['leadtype', 'seek', 'engine_size', 'transmission', 'body_type', 'variant', 'drive_type','make', 'city', 'group', 'is_motus_group']
ordinal_features = ['time_of_day', 'month_period',]
passthrough_features = ['customer_leads_count', 'is_weekend_lead', 'is_full_name', 'is_valid_name', 
                        'is_promotional', 'is_valid_cell', 'is_personal_cell', 'is_personal_email', 'is_valid_email', 'is_email_provided']


# 1. CatBoost

In [7]:
from catboost import CatBoostClassifier

In [16]:
# load data
train_df = pd.read_csv('../data/clean_train.csv')
X = train_df[numeric_features + ordinal_features + passthrough_features + categorical_features].copy()
y = train_df['vehiclesold']
X.head()

Unnamed: 0,lead_response_time,name_length,model_description_length,time_of_day,month_period,customer_leads_count,is_weekend_lead,is_full_name,is_valid_name,is_promotional,...,seek,engine_size,transmission,body_type,variant,drive_type,make,city,group,is_motus_group
0,45,20,6,afternoon,Mid Month,2,1,1,1,0,...,New,1.5,Unspecified,Unspeficied,pro,Unspeficied,other,Germiston,other,Yes
1,0,12,1,afternoon,Mid Month,1,1,1,1,0,...,New,Unspeficied,Unspecified,Unspeficied,Unspeficied,Unspeficied,renault,Northcliff,Renault,No
2,42,12,1,afternoon,Mid Month,1,1,1,1,0,...,New,Unspeficied,Unspecified,Unspeficied,Unspeficied,Unspeficied,kia,Durban,General,No
3,0,18,2,afternoon,Mid Month,1,1,1,0,0,...,New,Unspeficied,Unspecified,Unspeficied,Unspeficied,Unspeficied,volkswagen,Germiston,VW,No
4,0,24,1,afternoon,Mid Month,1,1,1,0,0,...,New,Unspeficied,Unspecified,Unspeficied,Unspeficied,Unspeficied,renault,Durban,General,No


In [17]:
cat_features = categorical_features + ordinal_features

for cat_feature in cat_features:
    X[cat_feature] = X[cat_feature].astype(str)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [31]:
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='Logloss',
    scale_pos_weight=20,  
    cat_features=cat_features,
    verbose=100,
    random_state=42
)

cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

0:	learn: 0.6905972	test: 0.6908784	best: 0.6908784 (0)	total: 113ms	remaining: 1m 53s
100:	learn: 0.6325527	test: 0.6505868	best: 0.6504527 (97)	total: 10.5s	remaining: 1m 33s
200:	learn: 0.6171086	test: 0.6491527	best: 0.6489360 (167)	total: 22.8s	remaining: 1m 30s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6478284861
bestIteration = 245

Shrink model to first 246 iterations.


<catboost.core.CatBoostClassifier at 0x1a31eb220a0>

In [32]:
y_proba = cat_model.predict_proba(X_val)[:, 1]
print("PR-AUC:", average_precision_score(y_val, y_proba))
print("Log Loss:", log_loss(y_val, y_proba))

PR-AUC: 0.08226036959740747
Log Loss: 0.5889968609629337


In [33]:
report_metrics(model=cat_model, y_test=y_val, X_test=X_val)

PR-AUC: 0.08226036959740747
Log loss: 0.5889968609629337


In [36]:
save_model(cat_model, 'cat_model.pkl')

Model saved to ../models/cat_model.pkl


# 2. LightGBM


In [44]:
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier

In [None]:
# preprocessor = load_model('../models/preprocessor.pkl')
# X_transformed = load_model('../models/X_transformed.pkl')

# X_train_trans, X_val_trans, y_train, y_val = train_test_split(
#     X_transformed, y, test_size=0.2, stratify=y, random_state=42
# )

In [48]:
X_lgb = X.copy()

encoders = {}
for col in cat_features:
    le = LabelEncoder()
    X_lgb[col] = le.fit_transform(X_lgb[col])
    encoders[col] = le


X_train, X_val, y_train, y_val = train_test_split(
    X_lgb, y, test_size=0.2, stratify=y, random_state=42
)

In [52]:
lgbm_model = LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    scale_pos_weight=20,  
    learning_rate=0.01,
    n_estimators=1500,
    early_stopping_round=50,
    verbosity=-1,
    random_state=42
)


lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='logloss',
)


In [53]:
y_proba = lgbm_model.predict_proba(X_val)[:, 1]
print("PR-AUC:", average_precision_score(y_val, y_proba))
print("Log Loss:", log_loss(y_val, y_proba))

PR-AUC: 0.0682036432994026
Log Loss: 0.18005071476635748


In [50]:
y_proba = lgbm_model.predict_proba(X_val)[:, 1]
print("PR-AUC:", average_precision_score(y_val, y_proba))
print("Log Loss:", log_loss(y_val, y_proba))

PR-AUC: 0.06307507540237606
Log Loss: 0.18333884078438087


In [54]:
save_model(lgbm_model, 'lightgbm.pkl')

Model saved to ../models/lightgbm.pkl


# 3. RandomForestClassifier

In [57]:
preprocessor = load_model('../models/preprocessor.pkl')
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline

In [58]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=.2, random_state=42)

In [59]:
rfc_01 = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', criterion='log_loss'))
])

rfc_01.fit(X_train, y_train)

In [60]:
report_metrics(rfc_01, X_val, y_val)

PR-AUC: 0.060671371287285086
Log loss: 0.4579730409403172


In [62]:
extra_trees_model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', ExtraTreesClassifier(class_weight='balanced', criterion='log_loss'))
])

extra_trees_model.fit(X_train, y_train)

In [63]:
report_metrics(extra_trees_model, X_val, y_val)

PR-AUC: 0.057091357894216356
Log loss: 0.8200673093691315


# XGBoostClassifier

In [64]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [65]:
xgb_pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(random_state=42))
])

In [66]:
xgb_pipe.fit(X_train, y_train)

In [67]:
report_metrics(xgb_pipe, X_val, y_val)

PR-AUC: 0.07029863844054332
Log loss: 0.181280504473307


In [68]:
xgb_pipe2 = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(random_state=42))
])

search_space = {
    'clf__max_depth': Integer(5, 10),
    'clf__learning_rate': Real(.001, 1., prior='log-uniform'),
    'clf__subsample': Real(.5,1.),
    'clf__colsample_bytree': Real(.5, 1.),
    'clf__colsample_bylevel': Real(.5, 1.),
    'clf__colsample_bynode': Real(.5, 1.),
    'clf__reg_alpha': Real(0., 10.),
    'clf__reg_lambda': Real(0., 10.),
    'clf__gamma': Real(0., 10.)
}

opt = BayesSearchCV(xgb_pipe2, search_spaces=search_space, cv=10, n_iter=50, scoring='roc_auc', random_state=42)

In [69]:
opt.fit(X_train, y_train)

In [70]:
report_metrics(opt.best_estimator_, X_val, y_val)

PR-AUC: 0.08640273487515238
Log loss: 0.17325896417078945


In [71]:
opt.best_estimator_

In [72]:
save_model(opt.best_estimator_, 'xgb_clf.pkl')

Model saved to ../models/xgb_clf.pkl


# Prepare submission file

In [73]:
test_df = pd.read_csv('../data/clean_test.csv')
test_df.head()

Unnamed: 0.1,Unnamed: 0,customerid,leadid,leadtype,seek,is_email_provided,is_valid_email,is_personal_email,is_personal_cell,is_valid_cell,...,is_motus_group,name_length,is_valid_name,is_full_name,is_weekend_lead,lead_response_time,customer_leads_count,make,city,group
0,0,4A689672-0844-464F-BA73-EEDF4F4832F4,11187548,Other,New,1,1,1,1,1,...,No,16,1,1,1,45,1,kia,other,other
1,1,2C961B46-7B19-4643-90EF-25208500FDCC,11187684,Facebook,New,1,1,1,1,1,...,No,13,1,1,1,0,1,renault,other,Renault
2,2,596E3072-1C63-473A-99E8-23D01272BD40,11187917,Facebook,New,1,1,1,1,1,...,Yes,6,1,0,1,0,1,other,other,other
3,3,824965E7-A7DA-4C45-A754-D685C58C20F9,11187948,AutoTrader,Used,1,1,1,1,1,...,No,13,1,1,1,42,1,volkswagen,other,VW
4,4,530AF4BD-9080-4E63-A175-30E648715D08,11188151,Other,Used,1,1,1,1,1,...,No,21,1,1,1,0,1,other,other,General


In [74]:
opt.best_estimator_.predict_proba(test_df)[:, 1]

array([0.03374643, 0.03655069, 0.01229892, ..., 0.01262057, 0.03427181,
       0.13559152], dtype=float32)

In [77]:
def save_submission(model):
    cols = ['LeadID','VehicleSoldProbability']
    test_df['VehicleSoldProbability'] = model.predict_proba(test_df)[:, 1]
    submission_df = test_df[['leadid', 'VehicleSoldProbability']]
    submission_df.columns = cols
    submission_df.to_csv('submission.csv', index=False, header=True)
    print("Submission file saved!")

In [78]:
save_submission(opt.best_estimator_)

Submission file saved!
