In [43]:
%%capture

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import ndcg_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

from catboost import CatBoostClassifier

In [58]:
df_train = pd.read_csv("/kaggle/input/vk-contest-mle/train_df.csv")
df_test = pd.read_csv("/kaggle/input/vk-contest-mle/test_df.csv")

X_train = df_train[df_train.columns[1:-1]].to_numpy()
X_test = df_test[df_test.columns[1:-1]].to_numpy()

y_train = df_train["target"].to_numpy()
y_test = df_test["target"].to_numpy()

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(15081, 79) (15081,) (1529, 79) (1529,)


In [79]:
import statsmodels.api as sm

sm_model = sm.Logit(y_train, X_train[:, :-6]).fit(disp=0)

In [80]:
print(sm_model.summary2())

                           Results: Logit
Model:               Logit             Method:            MLE       
Dependent Variable:  y                 Pseudo R-squared:  0.089     
Date:                2024-03-11 13:59  AIC:               2980.4662 
No. Observations:    15081             BIC:               3529.1919 
Df Model:            71                Log-Likelihood:    -1418.2   
Df Residuals:        15009             LL-Null:           -1557.2   
Converged:           1.0000            LLR p-value:       2.9810e-26
No. Iterations:      9.0000            Scale:             1.0000    
--------------------------------------------------------------------
       Coef.    Std.Err.      z    P>|z|      [0.025       0.975]   
--------------------------------------------------------------------
const -0.4504  238498.2329 -0.0000 1.0000  -467448.3973  467447.4965
x1    -0.2170       0.1145 -1.8956 0.0580       -0.4413       0.0074
x2    -0.0180       0.0570 -0.3152 0.7526       -0.1298      

# Log Reg
## Default Log Reg

In [39]:
clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.4735018879797566
ROC AUC score: 0.7487113909108793
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Balanced classes

In [19]:
clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"NDCG score: {ndcg_score([y_test], [y_pred])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.5650785173633501
NDCG score: 0.442441208390112
ROC AUC score: 0.7557348022821168
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.68      0.81      1495
           1       0.05      0.71      0.09        34

    accuracy                           0.68      1529
   macro avg       0.52      0.70      0.45      1529
weighted avg       0.97      0.68      0.79      1529



## SMOTE

In [85]:
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train, y_train)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train_resampled, y_train_resampled)

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.5809900720969906
ROC AUC score: 0.7676372221129255
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.69      0.81      1495
           1       0.05      0.65      0.08        34

    accuracy                           0.69      1529
   macro avg       0.52      0.67      0.45      1529
weighted avg       0.97      0.69      0.80      1529



## Scaler

In [84]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
y_pred_proba = clf.predict_proba(X_test_scaled)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.5817274273783811
ROC AUC score: 0.7664371434192405
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
y_pred_proba = clf.predict_proba(X_test_scaled)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.5772164847823201
ROC AUC score: 0.7486720440684635
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.69      0.82      1495
           1       0.05      0.68      0.09        34

    accuracy                           0.69      1529
   macro avg       0.52      0.69      0.45      1529
weighted avg       0.97      0.69      0.80      1529



In [36]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
y_pred_proba = clf.predict_proba(X_test_scaled)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.5449158334966708
ROC AUC score: 0.766692897894944
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
y_pred_proba = clf.predict_proba(X_test_scaled)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.5745598128654203
ROC AUC score: 0.7477670666928979
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.69      0.81      1495
           1       0.05      0.68      0.09        34

    accuracy                           0.69      1529
   macro avg       0.52      0.68      0.45      1529
weighted avg       0.97      0.69      0.80      1529



## Scaler + SMOTE

In [42]:
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train, y_train)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train_scaled, y_train_resampled)

y_pred = clf.predict(X_test_scaled)
y_pred_proba = clf.predict_proba(X_test_scaled)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.587600637820875
ROC AUC score: 0.7582726736179422
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.70      0.82      1495
           1       0.05      0.65      0.09        34

    accuracy                           0.70      1529
   macro avg       0.52      0.67      0.45      1529
weighted avg       0.97      0.70      0.80      1529



# Cat Boost

In [48]:
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

clf = CatBoostClassifier(
    iterations=1000, 
    learning_rate=0.001, 
#     loss_function='CrossEntropy'
    custom_loss = ["AUC"]
)

clf.fit(
    X_tra, y_tra,
    eval_set=(X_val, y_val),
)

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

NDCG score: 0.44168844506101995
ROC AUC score: 0.657957898878615
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_tra, X_val, y_tra, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

clf = CatBoostClassifier(
    iterations=1000, 
    learning_rate=0.001, 
#     loss_function='CrossEntropy'
    custom_loss = ["AUC"]
)

clf.fit(
    X_tra, y_tra,
    eval_set=(X_val, y_val),
    verbose=True,
    plot=True
)

y_pred = clf.predict(X_test_scaled)
y_pred_proba = clf.predict_proba(X_test_scaled)

print(f"NDCG score: {ndcg_score([y_test], [y_pred_proba[:, 1]])}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_pred_proba[:, 1])}")
print(f"Classification report:\n{classification_report(y_true=y_test, y_pred=y_pred)}")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

NDCG score: 0.441685190423594
ROC AUC score: 0.657938225457407
Classification report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
df_train_new = df_train.copy()
df_test_new = df_test.copy()

In [56]:
for c in df_train_new.columns:
    print(f"{c} : {df_train_new[c].nunique()} : {df_train_new[c].unique()}")

search_id : 1000 : [   758   1569   1899   2545   2863   3095  10639  10646  14061  16711
  17398  21948  24131  24317  24527  26604  27781  29269  31539  32966
  33512  34666  42551  44019  45262  45360  53716  56886  58362  58714
  63435  63754  65894  66898  67623  68329  68403  69053  70958  72079
  73015  74547  76009  76213  79420  81312  83344  89755  89758  90589
  95884  95981  96463  97872  98243  98768 100683 100719 101828 103411
 104266 104317 105302 105375 107106 108424 110120 112973 115697 116598
 118694 119013 121898 122932 123333 124832 127026 127688 131393 133470
 133775 133968 134343 134484 134823 136805 137235 138956 144485 145902
 147165 148574 149589 150727 150865 153284 153351 153533 154838 155433
 157294 158290 158976 160745 163100 163940 164747 164784 172858 173725
 174685 178343 178474 179255 181903 186228 186794 186932 187763 188472
 188633 190518 194373 196472 201445 202237 203319 204055 206739 207078
 211693 213609 214089 214810 216385 217411 218302 218555 2

In [None]:
df_train_new.drop()