# Logistic Regression

## Imports

In [1]:
import sys
sys.path.append('/home/apoorva/Desktop/Work/olr')

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import f1_score
from scripts.utils.load import load_pca_anomaly

## Loading Data

In [3]:
pca_x, olr_labels = load_pca_anomaly()
np.save('../../datasets/pca_x', pca_x)

In [4]:
pca_x.shape, olr_labels.shape

((5920, 5920), (40, 134))

In [5]:
pca_x_50 = pca_x[:, :50]
pca_x_50.shape

(5920, 50)

In [None]:
def log_reg(X_train, y_train, X_test, y_test):
    def objective(trial):
        """Define the objective function"""

        params = {
            'solver': trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'saga']),
            'penalty': trial.suggest_categorical('penalty', [None, 'l1', 'l2', 'elasticnet']),
            'C': trial.suggest_loguniform('C', 1e-2, 1e2),
            'l1_ratio': trial.suggest_uniform('l1_ratio', 0, 1.0)
        }

        if (params['solver'] == 'newton-cg' or params['solver'] == 'lbfgs') and (
            params['penalty'] == 'l1' or params['penalty'] == 'elasticnet'):
            return 0

        # Fit the model
        optuna_model = LogisticRegression(**params)
        optuna_model.fit(X_train, y_train)

        # Make predictions
        y_pred = optuna_model.predict(X_test)

        # Evaluate predictions
        accuracy = f1_score(y_test, y_pred, average='macro')
        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50, n_jobs=-1)
    print('Number of finished trials: {}'.format(len(study.trials)))
    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))
    print('  Params: ')

    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    params = trial.params
    classifier = LogisticRegression(**params)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_train)
    print("Training\n")
    print(confusion_matrix(y_pred=y_pred, y_true=y_train))
    print(classification_report(y_pred=y_pred, y_true=y_train))

    print("Testing\n")
    y_pred = classifier.predict(X_test)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_pred=y_pred, y_true=y_test))

In [None]:
def pentad_data(count):
    '''
    count is 0-indexed
    count = 0 corresponds to first leading pentad
    count = 1 corresponds to second leading pentad
    count = 2 corresponds to third leading pentad
    '''
    global olr_labels, pca_x
    assert count == 0 or count == 1 or count == 2
    pca_x_50 = pca_x[:, :50]
    pca_x_50 = np.array([pca_x_50[i*40+j:i*40+j+15, :] for i in range(40) for j in range(134 - (5*count))])
    labels = np.reshape(np.reshape(olr_labels, (40, 134))[:, (5*count):], (-1))
    X_train, X_test, y_train, y_test = train_test_split(pca_x_50, labels, random_state=1337, train_size=0.875, stratify=labels)
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    return X_train, X_test, y_train, y_test

## First Pentad

In [42]:
X_train, X_test, y_train, y_test = pentad_data(0)

In [44]:
log_reg(X_train, y_train, X_test, y_test)

[I 2024-05-25 02:06:57,821] A new study created in memory with name: no-name-9e36e937-b5c3-43ff-ae3e-56ecc94a5a16
[I 2024-05-25 02:07:08,087] Trial 0 finished with value: 0.36453468041839643 and parameters: {'solver': 'saga', 'penalty': 'elasticnet', 'C': 0.03154594406020433, 'l1_ratio': 0.3900815899612645}. Best is trial 0 with value: 0.36453468041839643.
[I 2024-05-25 02:08:00,648] Trial 1 finished with value: 0.37509011232257966 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.012684310063229629, 'l1_ratio': 0.6033460349150164}. Best is trial 1 with value: 0.37509011232257966.
[I 2024-05-25 02:08:01,215] Trial 2 finished with value: 0.36492031789398943 and parameters: {'solver': 'lbfgs', 'penalty': None, 'C': 0.23776959225495936, 'l1_ratio': 0.8731210305385908}. Best is trial 1 with value: 0.37509011232257966.
[I 2024-05-25 02:08:01,218] Trial 3 finished with value: 0.0 and parameters: {'solver': 'lbfgs', 'penalty': 'l1', 'C': 99.29118478987908, 'l1_ratio': 0.69101504

Number of finished trials: 50
Best trial:
  Value: 0.37509011232257966
  Params: 
    solver: newton-cg
    penalty: l2
    C: 0.012684310063229629
    l1_ratio: 0.6033460349150164
Training

[[ 184  408   22]
 [  97 3214   90]
 [  29  496  150]]
              precision    recall  f1-score   support

         0.0       0.59      0.30      0.40       614
         1.0       0.78      0.95      0.85      3401
         2.0       0.57      0.22      0.32       675

    accuracy                           0.76      4690
   macro avg       0.65      0.49      0.52      4690
weighted avg       0.73      0.76      0.72      4690

Testing

[[ 16  66   6]
 [ 40 406  40]
 [  9  77  10]]
              precision    recall  f1-score   support

         0.0       0.25      0.18      0.21        88
         1.0       0.74      0.84      0.78       486
         2.0       0.18      0.10      0.13        96

    accuracy                           0.64       670
   macro avg       0.39      0.37      0.38   

## Second Pentad

In [45]:
X_train, X_test, y_train, y_test = pentad_data(1)

In [46]:
log_reg(X_train, y_train, X_test, y_test)

[I 2024-05-25 02:37:06,912] A new study created in memory with name: no-name-31527724-6874-4343-b812-990a9319b69c
[I 2024-05-25 02:37:07,456] Trial 0 finished with value: 0.3720474300831443 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 10.894072999346664, 'l1_ratio': 0.3765257253537706}. Best is trial 0 with value: 0.3720474300831443.
[I 2024-05-25 02:37:07,458] Trial 1 finished with value: 0.0 and parameters: {'solver': 'newton-cg', 'penalty': 'l1', 'C': 0.5836151466245671, 'l1_ratio': 0.9672368908032913}. Best is trial 0 with value: 0.3720474300831443.
[I 2024-05-25 02:37:11,784] Trial 2 finished with value: 0.37370882603619804 and parameters: {'solver': 'saga', 'penalty': None, 'C': 0.8738566576064879, 'l1_ratio': 0.4647559102289136}. Best is trial 2 with value: 0.37370882603619804.
[I 2024-05-25 02:37:11,787] Trial 3 finished with value: 0.0 and parameters: {'solver': 'lbfgs', 'penalty': 'elasticnet', 'C': 30.372906569160246, 'l1_ratio': 0.05308316583313777}. Best is tr

Number of finished trials: 50
Best trial:
  Value: 0.3799217934796601
  Params: 
    solver: saga
    penalty: l1
    C: 0.010604714105066899
    l1_ratio: 0.6800221709348485
Training

[[ 258  291   41]
 [ 331 2626  293]
 [  61  355  259]]
              precision    recall  f1-score   support

         0.0       0.40      0.44      0.42       590
         1.0       0.80      0.81      0.81      3250
         2.0       0.44      0.38      0.41       675

    accuracy                           0.70      4515
   macro avg       0.55      0.54      0.54      4515
weighted avg       0.69      0.70      0.70      4515

Testing

[[ 22  44  18]
 [ 70 319  76]
 [ 12  64  20]]
              precision    recall  f1-score   support

         0.0       0.21      0.26      0.23        84
         1.0       0.75      0.69      0.72       465
         2.0       0.18      0.21      0.19        96

    accuracy                           0.56       645
   macro avg       0.38      0.39      0.38       64

## Third Pentad

In [47]:
X_train, X_test, y_train, y_test = pentad_data(2)
log_reg(X_train, y_train, X_test, y_test)

[I 2024-05-25 02:44:19,058] A new study created in memory with name: no-name-475d1462-04ad-4dfa-bfe2-48b5b7fe7935
[I 2024-05-25 02:45:08,365] Trial 0 finished with value: 0.37453637979953763 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 10.01237770997742, 'l1_ratio': 0.61418437404981}. Best is trial 0 with value: 0.37453637979953763.
[I 2024-05-25 02:45:08,881] Trial 1 finished with value: 0.3560166324361915 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.0372290479477451, 'l1_ratio': 0.1040555387272406}. Best is trial 0 with value: 0.37453637979953763.
[I 2024-05-25 02:45:18,356] Trial 2 finished with value: 0.3682848474948555 and parameters: {'solver': 'saga', 'penalty': 'elasticnet', 'C': 10.759601690496519, 'l1_ratio': 0.5849265151363039}. Best is trial 0 with value: 0.37453637979953763.
[I 2024-05-25 02:46:11,123] Trial 3 finished with value: 0.3735834407110054 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.7659299961514403, 'l1_ratio':

Number of finished trials: 50
Best trial:
  Value: 0.3813121416379219
  Params: 
    solver: newton-cg
    penalty: None
    C: 20.032631178427625
    l1_ratio: 0.548458785807949
Training

[[ 166  364   38]
 [  84 2915   98]
 [  27  474  174]]
              precision    recall  f1-score   support

         0.0       0.60      0.29      0.39       568
         1.0       0.78      0.94      0.85      3097
         2.0       0.56      0.26      0.35       675

    accuracy                           0.75      4340
   macro avg       0.65      0.50      0.53      4340
weighted avg       0.72      0.75      0.71      4340

Testing

[[ 17  61   3]
 [ 39 361  43]
 [  9  76  11]]
              precision    recall  f1-score   support

         0.0       0.26      0.21      0.23        81
         1.0       0.72      0.81      0.77       443
         2.0       0.19      0.11      0.14        96

    accuracy                           0.63       620
   macro avg       0.39      0.38      0.38     