# Predicting antibody-antigen interactions with Transformer-based machine learning
### Part 3b: Logistic Regression
This notebook contains all of the code for the Logistic Regression baseline, which is used as a point of comparison for our Transformer model.

*Special thanks to Teng Ann, whose work this code is based on.*

### Step 0: Imports

In [3]:
import os
from pathlib import Path
import numpy as np
from numpy import arange, logspace
import pandas as pd

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
from sklearn.linear_model import LogisticRegression

### Step 1: Logistic Regression Function

In [12]:
def run_LR(X_train, X_test, y_train, y_test, test_name, pred_export_name):
    """Train a logistic regression classifier, report test accuracy, and export predictions.

    Parameters
    ----------
    X_train, y_train
        Features and labels handed to ``LogisticRegression.fit``.
    X_test
        Features handed to ``LogisticRegression.predict``.
    y_test
        True labels; scored against the predictions and written to the
        ``true_label`` column of the exported CSV.
    test_name : str
        Text appended to the ">>>Test result: " line that is printed.
    pred_export_name
        Destination passed to ``DataFrame.to_csv`` for the prediction table.

    Returns
    -------
    None
        Results are printed and written to ``pred_export_name``.
    """
    # Fixed seed and iteration cap so every experiment uses the same estimator settings.
    classifier = LogisticRegression(random_state = 1001, max_iter=1000)
    classifier.fit(X_train, y_train)

    # Each predicted value is passed through round() before scoring and export.
    rounded_labels = list(map(round, classifier.predict(X_test)))

    # Accuracy as a percentage: a sample only counts when its rounded
    # prediction equals the corresponding entry of y_test.
    accuracy_pct = accuracy_score(y_test, rounded_labels) * 100
    print(">>>Test result: "+ test_name)
    print("Accuracy: %.2f%%" % accuracy_pct)

    # Persist predictions next to the true labels, one row per test sample.
    column_order = ['pred_label', 'true_label']
    label_table = pd.DataFrame(
        {'pred_label': np.asarray(rounded_labels), 'true_label': y_test},
        columns=column_order,
    )
    label_table.to_csv(pred_export_name, index=False)
    

### Step 2: Perform Logistic Regression and save the predictions

In [13]:
# Directory containing the pre-featurized .npy matrices.
DATA_DIR = "/kaggle/input/1024-dataset-featurized"

# One row per experiment:
#   (feature-matrix file stem, label-matrix file stem, name printed by run_LR, CSV written by run_LR)
# Each stem is expanded to "<stem>(train).npy" and "<stem>(test).npy" inside DATA_DIR.
EXPERIMENTS = [
    ("XMatrix",              "YMatrix",          "Imbalanced (Full Sequence)", "LR_pred_imbalanced_fullseq.csv"),
    ("XMatrixCDR3",          "YMatrix",          "Imbalanced (CDR3)",          "LR_pred_imbalanced_cdr3.csv"),
    ("XMatrix_balanced",     "YMatrix_balanced", "Balanced (Full Sequence)",   "LR_pred_balanced_fullseq.csv"),
    ("XMatrixCDR3_balanced", "YMatrix_balanced", "Balanced (CDR3)",            "LR_pred_balanced_cdr3.csv"),
]


def _load_matrix(stem, split):
    """Load ``<DATA_DIR>/<stem>(<split>).npy`` where ``split`` is "train" or "test"."""
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
    # objects, which can execute code from a malicious file. Kept as-is because
    # the stored dtype is not visible here; switch to allow_pickle=False if the
    # arrays are plain numeric — TODO confirm.
    return np.load("{}/{}({}).npy".format(DATA_DIR, stem, split), allow_pickle=True)


# Run every experiment in the order listed above. The loop replaces four
# copy-pasted load-and-run stanzas; the files read, the calls made, and the
# final values of X_train / X_test / y_train / y_test are the same as before.
for x_stem, y_stem, test_name, pred_export_name in EXPERIMENTS:
    X_train = _load_matrix(x_stem, "train")
    X_test = _load_matrix(x_stem, "test")
    y_train = _load_matrix(y_stem, "train")
    y_test = _load_matrix(y_stem, "test")
    run_LR(X_train, X_test, y_train, y_test, test_name, pred_export_name)



>>>Test result: Imbalanced (Full Sequence)
Accuracy: 60.20%
>>>Test result: Imbalanced (CDR3)
Accuracy: 59.77%
>>>Test result: Balanced (Full Sequence)
Accuracy: 44.50%
>>>Test result: Balanced (CDR3)
Accuracy: 44.74%
