# Logistic Regression

## Imports

In [19]:
import sys
sys.path.append('/home/apoorva/Desktop/Work/olr')

In [20]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from scripts.utils.load import load_pca_anomaly

## Loading Data

In [21]:
pca_x, olr_labels = load_pca_anomaly()



In [22]:
pca_x.shape, olr_labels.shape

((5960, 5960), (40, 135))

In [23]:
pca_x_50 = pca_x[:, :50]
pca_x_50.shape

(5960, 50)

In [24]:
def log_reg(X_train, y_train, X_test, y_test, solver='lbfgs'):
    classifier = LogisticRegression(multi_class='multinomial', random_state=1337, solver=solver, max_iter=2000)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_train)
    print("Training\n")
    print(confusion_matrix(y_pred=y_pred, y_true=y_train))
    print(classification_report(y_pred=y_pred, y_true=y_train))

    print("Testing\n")
    y_pred = classifier.predict(X_test)
    print(confusion_matrix(y_pred=y_pred, y_true=y_test))
    print(classification_report(y_pred=y_pred, y_true=y_test))

In [25]:
def pentad_data(count):
    '''
    count is 0-indexed
    count = 0 corresponds to first leading pentad
    count = 1 corresponds to second leading pentad
    count = 2 corresponds to third leading pentad
    '''
    global olr_labels, pca_x
    assert count == 0 or count == 1 or count == 2
    pca_x_50 = pca_x[:, :50]
    pca_x_50 = np.array([pca_x_50[i*40+j:i*40+j+15, :] for j in range(134 - (5*count)) for i in range(40)])
    labels = np.reshape(np.reshape(olr_labels, (40, 135))[:, 1+(5*count):], (-1))
    X_train, X_test, y_train, y_test = train_test_split(pca_x_50, labels, random_state=1337, train_size=0.875, stratify=labels)
    X_train = np.reshape(X_train, (X_train.shape[0], -1))
    X_test = np.reshape(X_test, (X_test.shape[0], -1))
    return X_train, X_test, y_train, y_test

## First Pentad

In [8]:
X_train, X_test, y_train, y_test = pentad_data(0)

#### LBFGS

In [9]:
log_reg(X_train, y_train, X_test, y_test, solver='lbfgs')

Training

[[ 162  426   26]
 [  86 3229   86]
 [  36  500  139]]
              precision    recall  f1-score   support

         0.0       0.57      0.26      0.36       614
         1.0       0.78      0.95      0.85      3401
         2.0       0.55      0.21      0.30       675

    accuracy                           0.75      4690
   macro avg       0.63      0.47      0.51      4690
weighted avg       0.72      0.75      0.71      4690

Testing

[[  9  76   3]
 [ 44 401  41]
 [  9  78   9]]
              precision    recall  f1-score   support

         0.0       0.15      0.10      0.12        88
         1.0       0.72      0.83      0.77       486
         2.0       0.17      0.09      0.12        96

    accuracy                           0.63       670
   macro avg       0.35      0.34      0.34       670
weighted avg       0.57      0.63      0.59       670



#### SAGA

In [10]:
log_reg(X_train, y_train, X_test, y_test, solver='saga')

Training

[[ 233  328   53]
 [ 305 2776  320]
 [  61  386  228]]
              precision    recall  f1-score   support

         0.0       0.39      0.38      0.38       614
         1.0       0.80      0.82      0.81      3401
         2.0       0.38      0.34      0.36       675

    accuracy                           0.69      4690
   macro avg       0.52      0.51      0.52      4690
weighted avg       0.68      0.69      0.69      4690

Testing

[[ 11  64  13]
 [ 89 312  85]
 [ 17  62  17]]
              precision    recall  f1-score   support

         0.0       0.09      0.12      0.11        88
         1.0       0.71      0.64      0.68       486
         2.0       0.15      0.18      0.16        96

    accuracy                           0.51       670
   macro avg       0.32      0.31      0.31       670
weighted avg       0.55      0.51      0.53       670



#### Newton-CG

In [14]:
log_reg(X_train, y_train, X_test, y_test, solver='newton-cg')

Training

[[ 163  425   26]
 [  87 3228   86]
 [  37  499  139]]
              precision    recall  f1-score   support

         0.0       0.57      0.27      0.36       614
         1.0       0.78      0.95      0.85      3401
         2.0       0.55      0.21      0.30       675

    accuracy                           0.75      4690
   macro avg       0.63      0.47      0.51      4690
weighted avg       0.72      0.75      0.71      4690

Testing

[[  9  76   3]
 [ 44 401  41]
 [  9  78   9]]
              precision    recall  f1-score   support

         0.0       0.15      0.10      0.12        88
         1.0       0.72      0.83      0.77       486
         2.0       0.17      0.09      0.12        96

    accuracy                           0.63       670
   macro avg       0.35      0.34      0.34       670
weighted avg       0.57      0.63      0.59       670





## Second Pentad

In [9]:
X_train, X_test, y_train, y_test = pentad_data(1)

#### LBFGS

In [22]:
log_reg(X_train, y_train, X_test, y_test, solver='lbfgs')

Training

[[ 160  410   20]
 [  94 3073   83]
 [  29  509  137]]
              precision    recall  f1-score   support

         0.0       0.57      0.27      0.37       590
         1.0       0.77      0.95      0.85      3250
         2.0       0.57      0.20      0.30       675

    accuracy                           0.75      4515
   macro avg       0.64      0.47      0.50      4515
weighted avg       0.71      0.75      0.70      4515

Testing

[[  3  78   3]
 [ 47 385  33]
 [  5  85   6]]
              precision    recall  f1-score   support

         0.0       0.05      0.04      0.04        84
         1.0       0.70      0.83      0.76       465
         2.0       0.14      0.06      0.09        96

    accuracy                           0.61       645
   macro avg       0.30      0.31      0.30       645
weighted avg       0.53      0.61      0.57       645



#### SAGA

In [23]:
log_reg(X_train, y_train, X_test, y_test, solver='saga')

Training

[[ 245  299   46]
 [ 315 2590  345]
 [  55  358  262]]
              precision    recall  f1-score   support

         0.0       0.40      0.42      0.41       590
         1.0       0.80      0.80      0.80      3250
         2.0       0.40      0.39      0.39       675

    accuracy                           0.69      4515
   macro avg       0.53      0.53      0.53      4515
weighted avg       0.69      0.69      0.69      4515

Testing

[[ 12  57  15]
 [ 80 315  70]
 [ 12  71  13]]
              precision    recall  f1-score   support

         0.0       0.12      0.14      0.13        84
         1.0       0.71      0.68      0.69       465
         2.0       0.13      0.14      0.13        96

    accuracy                           0.53       645
   macro avg       0.32      0.32      0.32       645
weighted avg       0.55      0.53      0.54       645



#### Newton-CG

Skipped as first one was very time taking (~30 mins) and didn't converge. 

## Third Pentad

In [26]:
X_train, X_test, y_train, y_test = pentad_data(2)

#### LBFGS

In [27]:
log_reg(X_train, y_train, X_test, y_test, solver='lbfgs')

Training

[[ 111  420   37]
 [  70 2924  103]
 [  15  441  219]]
              precision    recall  f1-score   support

         0.0       0.57      0.20      0.29       568
         1.0       0.77      0.94      0.85      3097
         2.0       0.61      0.32      0.42       675

    accuracy                           0.75      4340
   macro avg       0.65      0.49      0.52      4340
weighted avg       0.72      0.75      0.71      4340

Testing

[[  4  65  12]
 [ 38 362  43]
 [  9  79   8]]
              precision    recall  f1-score   support

         0.0       0.08      0.05      0.06        81
         1.0       0.72      0.82      0.76       443
         2.0       0.13      0.08      0.10        96

    accuracy                           0.60       620
   macro avg       0.31      0.32      0.31       620
weighted avg       0.54      0.60      0.57       620



#### SAGA

In [28]:
log_reg(X_train, y_train, X_test, y_test, solver='saga')

Training

[[ 189  307   72]
 [ 281 2484  332]
 [  46  320  309]]
              precision    recall  f1-score   support

         0.0       0.37      0.33      0.35       568
         1.0       0.80      0.80      0.80      3097
         2.0       0.43      0.46      0.45       675

    accuracy                           0.69      4340
   macro avg       0.53      0.53      0.53      4340
weighted avg       0.69      0.69      0.69      4340

Testing

[[ 14  50  17]
 [ 65 285  93]
 [ 16  63  17]]
              precision    recall  f1-score   support

         0.0       0.15      0.17      0.16        81
         1.0       0.72      0.64      0.68       443
         2.0       0.13      0.18      0.15        96

    accuracy                           0.51       620
   macro avg       0.33      0.33      0.33       620
weighted avg       0.55      0.51      0.53       620

