# Import

In [29]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from utils import *

# Load data

In [30]:
# Locations of the train / test CSV splits, relative to the notebook's working directory.
PATH_TRAIN = "../data/train_df.csv"
PATH_TEST  = "../data/test_df.csv"

# Read both splits in one pass; the order of the tuple fixes which frame is which.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (PATH_TRAIN, PATH_TEST))

# LogReg

## Raw data

In [31]:
# Baseline: logistic regression on the features exactly as get_training_samples returns them,
# with no class re-weighting.
X_train, y_train = get_training_samples(df_train)
X_test, y_test = get_training_samples(df_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Last expression of the cell: report metrics on the test split.
get_metrics(clf, X_test, y_test)

nDCG score: 0.470543929175376
ROC AUC score: 0.7478457603777297
Classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
# Same features as the baseline, but with class_weight="balanced" so the two
# classes are re-weighted during fitting.
X_train, y_train = get_training_samples(df_train)
X_test, y_test = get_training_samples(df_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=2000, class_weight="balanced").fit(X_train, y_train)

# Last expression of the cell: report metrics on the test split.
get_metrics(clf, X_test, y_test)

nDCG score: 0.5656151217530876
ROC AUC score: 0.7554397009639977
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.68      0.81      1495
           1       0.05      0.71      0.09        34

    accuracy                           0.68      1529
   macro avg       0.52      0.70      0.45      1529
weighted avg       0.97      0.68      0.79      1529



## Reduced datasets

### Drop constant features

In [33]:
# Columns excluded from the model in this experiment (the section heading labels them as constant).
features_to_drop = ["feature_0", "feature_73", "feature_74", "feature_75"]

X_train, y_train = get_training_samples(df_train, features_to_drop=features_to_drop)
X_test, y_test = get_training_samples(df_test, features_to_drop=features_to_drop)

# Sanity check: print (features, target) shapes for the train split, then the test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=2000, class_weight="balanced").fit(X_train, y_train)

# Last expression of the cell: report metrics on the test split.
get_metrics(clf, X_test, y_test)

(15081, 75) (15081,)
(1529, 75) (1529,)
nDCG score: 0.5650238240926239
ROC AUC score: 0.7552036199095022
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.69      0.81      1495
           1       0.05      0.71      0.09        34

    accuracy                           0.69      1529
   macro avg       0.52      0.70      0.45      1529
weighted avg       0.97      0.69      0.79      1529



### One-hot encode the integer-valued features

In [46]:
# One-hot encode a block of columns and fit a class-weighted logistic regression
# on the encoded features only.
#
# Columns at positions 2..16 of df_train are selected; per df_train.info() these are
# feature_1 .. feature_15, all int64.
features_to_cat = df_train.columns[2:17]

X_train = df_train[features_to_cat]
y_train = df_train["target"]
X_test = df_test[features_to_cat]
y_test = df_test["target"]

# The encoder is fitted on the training split only, so nothing about the test split
# reaches the preprocessing step. It is also fitted on the DataFrame itself (not a bare
# array), so the column names seen at fit time match those seen at transform time.
#
# handle_unknown="ignore": a category that occurs only in the test split is encoded as
# all zeros instead of raising. With drop="first" that is the same encoding as the dropped
# reference category, and scikit-learn emits a warning when it happens.
encoder = OneHotEncoder(drop="first", handle_unknown="ignore")

X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)

# Number of rows and of one-hot columns produced from the training split.
print(X_train_encoded.shape)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train_encoded, y_train)

# Last expression of the cell: report metrics on the test split.
get_metrics(clf, X_test_encoded, y_test)

(15081, 69)
nDCG score: 0.47825329472489186
ROC AUC score: 0.5668207751327956
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.71      0.83      1495
           1       0.03      0.44      0.06        34

    accuracy                           0.71      1529
   macro avg       0.51      0.58      0.45      1529
weighted avg       0.96      0.71      0.81      1529





In [47]:
# Print each column's position, name, non-null count and dtype for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15081 entries, 0 to 15080
Data columns (total 81 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   search_id   15081 non-null  int64  
 1   feature_0   15081 non-null  int64  
 2   feature_1   15081 non-null  int64  
 3   feature_2   15081 non-null  int64  
 4   feature_3   15081 non-null  int64  
 5   feature_4   15081 non-null  int64  
 6   feature_5   15081 non-null  int64  
 7   feature_6   15081 non-null  int64  
 8   feature_7   15081 non-null  int64  
 9   feature_8   15081 non-null  int64  
 10  feature_9   15081 non-null  int64  
 11  feature_10  15081 non-null  int64  
 12  feature_11  15081 non-null  int64  
 13  feature_12  15081 non-null  int64  
 14  feature_13  15081 non-null  int64  
 15  feature_14  15081 non-null  int64  
 16  feature_15  15081 non-null  int64  
 17  feature_16  15081 non-null  float64
 18  feature_17  15081 non-null  float64
 19  feature_18  15081 non-nul

In [56]:
# Full-feature pipeline: one-hot encode the integer columns, standardise the float
# columns, then fit a class-weighted logistic regression on the result.
#
# All imports this cell needs are grouped here rather than scattered between statements.
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Everything between the first column (search_id) and the last column is a model input;
# the last column is used as the label (presumably `target` — confirm against the CSV).
feature_columns = df_train.columns[1:-1]
target_column = df_train.columns[-1]
train_features = df_train[feature_columns]

# Split the input columns by dtype: floats are treated as numerical, ints as categorical.
numerical_columns_selector = selector(dtype_include=float)
categorical_columns_selector = selector(dtype_include=int)

numerical_columns = numerical_columns_selector(train_features)
categorical_columns = categorical_columns_selector(train_features)

# handle_unknown="ignore" keeps transform() from raising on categories not seen during fit.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=2000, class_weight="balanced"),
)

# Last expression of the cell: fit the pipeline (and display the fitted estimator).
model.fit(train_features, df_train[target_column])


In [59]:
# Select the test-split inputs using the training frame's column layout
# (everything between the first and the last column), then predict hard class labels.
test_features = df_test[df_train.columns[1:-1]]
model.predict(test_features)

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [63]:
# Evaluate the fitted pipeline on the test split. Inputs and label are picked with the
# training frame's column layout: inputs are all columns between the first and the last,
# and the label is the last column.
test_features = df_test[df_train.columns[1:-1]]
test_target = df_test[df_train.columns[-1]]
get_metrics(model, test_features, test_target)


nDCG score: 0.5632932692123284
ROC AUC score: 0.6749163879598662
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.76      0.86      1495
           1       0.04      0.47      0.08        34

    accuracy                           0.75      1529
   macro avg       0.51      0.61      0.47      1529
weighted avg       0.96      0.75      0.84      1529

