# Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from utils import *

# Load data

In [2]:
# Locations of the pre-split train / test CSV files, relative to the notebook.
PATH_TRAIN = "../data/train_df.csv"
PATH_TEST  = "../data/test_df.csv"

# Read both splits in one pass; the order of the tuple fixes which frame is which.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (PATH_TRAIN, PATH_TEST))

# Logistic Regression
## Raw data

In [3]:
# Baseline: logistic regression on the features exactly as returned by
# get_training_samples, with no class re-weighting.
X_train, y_train = get_training_samples(df_train)
X_test, y_test = get_training_samples(df_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)

get_metrics(clf, X_test, y_test)

nDCG score: 0.470543929175376
ROC AUC score: 0.7478457603777297
Classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
# Same features as the raw baseline, but with class_weight="balanced" so the
# rare positive class is not ignored by the optimiser.
X_train, y_train = get_training_samples(df_train)
X_test, y_test = get_training_samples(df_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(class_weight="balanced", max_iter=2000).fit(X_train, y_train)

get_metrics(clf, X_test, y_test)

nDCG score: 0.5656151217530876
ROC AUC score: 0.7554397009639977
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.68      0.81      1495
           1       0.05      0.71      0.09        34

    accuracy                           0.68      1529
   macro avg       0.52      0.70      0.45      1529
weighted avg       0.97      0.68      0.79      1529



## Drop constant features

In [5]:
# Columns excluded from both splits in this experiment (the section heading
# describes them as constant features).
features_to_drop = ["feature_0", "feature_73", "feature_74", "feature_75"]

X_train, y_train = get_training_samples(df_train, features_to_drop=features_to_drop)
X_test, y_test = get_training_samples(df_test, features_to_drop=features_to_drop)

# Sanity check: report (features, target) shapes for train first, then test.
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    print(X_split.shape, y_split.shape)

clf = LogisticRegression(class_weight="balanced", max_iter=2000).fit(X_train, y_train)

get_metrics(clf, X_test, y_test)

(15081, 75) (15081,)
(1529, 75) (1529,)
nDCG score: 0.5650238240926239
ROC AUC score: 0.7552036199095022
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.69      0.81      1495
           1       0.05      0.71      0.09        34

    accuracy                           0.69      1529
   macro avg       0.52      0.70      0.45      1529
weighted avg       0.97      0.69      0.79      1529



## One-hot encode a subset of features

In [6]:
# One-hot encode a positional slice of columns and train a class-weighted
# logistic regression on the encoded features only.
# NOTE(review): columns[2:17] is a positional slice — presumably these are the
# categorical features; confirm against the CSV schema.
features_to_cat = df_train.columns[2:17]

X_train = df_train[features_to_cat]
y_train = df_train["target"]
X_test = df_test[features_to_cat]
y_test = df_test["target"]

# Fit the encoder on the training split only, so nothing about the test set
# leaks into preprocessing. Fitting on the DataFrame (rather than a NumPy
# concatenation) also keeps the feature names consistent between fit and
# transform. handle_unknown="ignore" encodes categories that appear only in the
# test split as all zeros instead of raising.
encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
encoder.fit(X_train)

X_train_encoded = encoder.transform(X_train)
X_test_encoded = encoder.transform(X_test)

print(X_train_encoded.shape)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train_encoded, y_train)

get_metrics(clf, X_test_encoded, y_test)

(15081, 69)
nDCG score: 0.47825329472489186
ROC AUC score: 0.5668207751327956
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.71      0.83      1495
           1       0.03      0.44      0.06        34

    accuracy                           0.71      1529
   macro avg       0.51      0.58      0.45      1529
weighted avg       0.96      0.71      0.81      1529





## Encode categorical features (int) and scale numerical features (float)

In [7]:
# Build one pipeline that one-hot encodes integer-typed columns, standardises
# float-typed columns, and feeds the result to a class-weighted logistic
# regression. All sklearn names used here are imported in the notebook's
# import cell.

# Columns are addressed by position: everything between the first and the last
# column is treated as a feature, and the last column as the target.
# NOTE(review): presumably column 0 is an id/index column — confirm against the CSV schema.
feature_columns = df_train.columns[1:-1]
target_column = df_train.columns[-1]

# Split the feature columns by dtype: floats are scaled, ints are one-hot encoded.
numerical_columns_selector = selector(dtype_include=float)
categorical_columns_selector = selector(dtype_include=int)

numerical_columns = numerical_columns_selector(df_train[feature_columns])
categorical_columns = categorical_columns_selector(df_train[feature_columns])

# handle_unknown="ignore": categories unseen during fit do not raise at transform time.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

model = make_pipeline(preprocessor, LogisticRegression(max_iter=2000, class_weight="balanced"))

# fit() returns the fitted pipeline, so as the last expression it is also what the cell displays.
model.fit(df_train[feature_columns], df_train[target_column])


In [10]:
# Score the fitted pipeline on the test frame. Columns are picked using the
# training frame's layout (all but the first and last column as features, the
# last column as target), so both splits are sliced identically.
test_features = df_test[df_train.columns[1:-1]]
test_target = df_test[df_train.columns[-1]]
get_metrics(model, test_features, test_target)


nDCG score: 0.5632932692123284
ROC AUC score: 0.6749163879598662
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.76      0.86      1495
           1       0.04      0.47      0.08        34

    accuracy                           0.75      1529
   macro avg       0.51      0.61      0.47      1529
weighted avg       0.96      0.75      0.84      1529

