# Import

In [8]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from utils import *

# Preprocessing

In [9]:
PATH_TRAIN = "../data/train_df.csv"
PATH_TEST  = "../data/test_df.csv"

df_train = pd.read_csv(PATH_TRAIN)
df_test  = pd.read_csv(PATH_TEST)

# LogReg

## Raw data

In [5]:
# raw logreg
X_train, y_train = get_training_samples(df_train)
X_test, y_test = get_training_samples(df_test)

clf = LogisticRegression(max_iter=2000)
clf.fit(X_train, y_train)
get_metrics(clf, y_test, X_test)

nDCG score: 0.470543929175376
ROC AUC score: 0.7478457603777297
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1495
           1       0.00      0.00      0.00        34

    accuracy                           0.98      1529
   macro avg       0.49      0.50      0.49      1529
weighted avg       0.96      0.98      0.97      1529



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# balanced class weights
X_train, y_train = get_training_samples(df_train)
X_test, y_test = get_training_samples(df_test)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train, y_train)
get_metrics(clf, y_test, X_test)

nDCG score: 0.5656151217530876
ROC AUC score: 0.7554397009639977
              precision    recall  f1-score   support

           0       0.99      0.68      0.81      1495
           1       0.05      0.71      0.09        34

    accuracy                           0.68      1529
   macro avg       0.52      0.70      0.45      1529
weighted avg       0.97      0.68      0.79      1529



## Reduced datasets

### Drop constant features

In [73]:
features_to_drop = ["feature_0", "feature_73", "feature_74", "feature_75"]

X_train, y_train = get_training_samples(df_train, features_to_drop=features_to_drop)
X_test, y_test = get_training_samples(df_test, features_to_drop=features_to_drop)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train, y_train)

get_metrics(clf, y_test, X_test)

(15081, 75) (15081,)
(1529, 75) (1529,)
nDCG score: 0.5650238240926239
ROC AUC score: 0.7552036199095022
              precision    recall  f1-score   support

           0       0.99      0.69      0.81      1495
           1       0.05      0.71      0.09        34

    accuracy                           0.69      1529
   macro avg       0.52      0.70      0.45      1529
weighted avg       0.97      0.69      0.79      1529



### Drop correlated features

In [83]:
full_dataset = pd.concat([df_train, df_test])

corr_mtx = full_dataset.corr()

mask = corr_mtx.abs() > 0.5 
pairs = mask[mask == True].stack().index
pairs = list(set((i, j) for i, j in pairs))
pairs = [(item1, item2) for item1, item2 in pairs if item1 != item2]
pairs

[('feature_52', 'feature_62'),
 ('feature_54', 'feature_53'),
 ('feature_41', 'feature_42'),
 ('feature_65', 'feature_41'),
 ('feature_71', 'feature_72'),
 ('feature_41', 'feature_64'),
 ('feature_11', 'feature_12'),
 ('feature_78', 'feature_77'),
 ('feature_31', 'feature_32'),
 ('feature_32', 'feature_30'),
 ('feature_41', 'feature_63'),
 ('feature_38', 'feature_35'),
 ('feature_55', 'feature_53'),
 ('feature_76', 'feature_41'),
 ('feature_4', 'feature_3'),
 ('feature_48', 'feature_42'),
 ('feature_50', 'feature_71'),
 ('feature_65', 'feature_54'),
 ('feature_49', 'feature_46'),
 ('feature_76', 'feature_50'),
 ('feature_48', 'feature_64'),
 ('feature_42', 'feature_41'),
 ('feature_41', 'feature_39'),
 ('feature_62', 'feature_42'),
 ('feature_65', 'feature_62'),
 ('feature_53', 'feature_41'),
 ('feature_63', 'feature_53'),
 ('feature_54', 'feature_65'),
 ('feature_41', 'feature_76'),
 ('feature_11', 'feature_59'),
 ('feature_30', 'feature_32'),
 ('feature_42', 'feature_48'),
 ('feature

In [None]:
features_to_drop = ["feature_0", "feature_73", "feature_74", "feature_75"]

X_train, y_train = get_training_samples(df_train, features_to_drop=features_to_drop)
X_test, y_test = get_training_samples(df_test, features_to_drop=features_to_drop)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_train, y_train)

get_metrics(clf, y_test, X_test)