In [13]:
import pandas as pd
import numpy as np

## Load the dataframes

In [15]:
interaction_df = pd.read_csv("dataframes/interaction_data.csv")
filtered_df = pd.read_csv('dataframes/filtered_data.csv')
result_df = pd.read_csv('dataframes/result_data.csv')
protein_pairs_df = pd.read_csv('dataframes/protein_pairs_data.csv')

In [17]:
interaction_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,559,560,561,562,563,564,565,566,567,568
0,CNBP_HUMAN,EBP2_HUMAN,4.376997,0.77117,0.835097,0.866124,0.538495,0.699603,0.73422,0.451208,...,17.306808,12.228814,21.40473,30.915457,6.094113,22.627251,-0.284131,9.42179,3.108872,0.86
1,DTBP1_HUMAN,BL1S1_HUMAN,4.355378,0.771795,0.977514,1.020024,0.61578,0.720567,0.676516,0.444765,...,17.071913,12.099715,20.790263,31.274987,6.148244,23.480201,-0.339447,9.58187,3.242057,0.89
2,PLK1_HUMAN,PSA3_HUMAN,4.350983,0.921458,0.979215,1.043114,0.606984,0.733716,0.684657,0.4417,...,16.864708,12.034104,20.710379,31.093584,6.085997,23.083606,-0.336897,9.525408,3.279343,0.88
3,ATG3_HUMAN,ATG12_HUMAN,4.336443,0.912624,0.937119,0.993354,0.590773,0.731851,0.681708,0.44368,...,16.242498,11.658963,19.775409,30.778781,6.057785,23.317048,-0.548705,9.540261,3.43214,0.9
4,CSN8_HUMAN,CSN2_HUMAN,4.363747,0.942411,1.021622,1.081436,0.61185,0.73512,0.679697,0.439954,...,17.005705,12.355417,20.585439,31.799906,6.121988,23.693373,-0.373932,9.704056,3.273589,0.96


## Test Train Validation Split

- 0.6 Train
- 0.2 Validation
- 0.2 Test

In [19]:
from sklearn.model_selection import train_test_split

In [21]:
def format_dataset(df):
    df = df.dropna()
    X = df.drop(['0', '1', '568'], axis=1)
    y = df['568']
    '''
    0,1 - Protein1, Protein2
    568 - Interaction Probability
    '''
    X_train, X_temp, y_train, y_temp = train_test_split(X.values, y.values, test_size=0.4, random_state=42)
    X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    
    return X_train, X_validation, X_test, y_train, y_validation, y_test

In [24]:
X_train, X_validation, X_test, y_train, y_validation, y_test = format_dataset(interaction_df)
print(X_validation.shape, X_test.shape)

(1333, 566) (1333, 566)


## Classifiers

In [25]:
def make_class_labels(y, upper_threshold, lower_threshold):
    y_modified = [0 for _ in range(len(y))]
    for i in range(len(y)):
        if y[i] > upper_threshold:
            y_modified[i] = 1
        elif y[i] < lower_threshold:
            y_modified[i] = -1
        else:
            y_modified[i] = 0
            
    return y_modified

In [26]:
upper_threshold = 0.85
lower_threshold = 0.5

In [28]:
y_train_labels = make_class_labels(y_train, upper_threshold, lower_threshold)
y_validation_labels = make_class_labels(y_validation, upper_threshold, lower_threshold)
y_test_labels = make_class_labels(y_test, upper_threshold, lower_threshold)

In [30]:
print(np.unique(y_train_labels), np.unique(y_validation_labels), np.unique(y_test_labels))

[-1  1] [-1  1] [-1  1]


In [32]:
print(np.unique(y_train), np.unique(y_validation), np.unique(y_test))
print(y_train.shape, X_train.shape)

[0.   0.06 0.07 0.49 0.86 0.87 0.88 0.89 0.9  0.91 0.92 0.93 0.94 0.95
 0.96 0.97 0.98 0.99 1.  ] [0.   0.06 0.07 0.49 0.86 0.87 0.88 0.89 0.9  0.91 0.92 0.93 0.94 0.95
 0.96 0.97 0.98 0.99 1.  ] [0.   0.03 0.07 0.14 0.19 0.23 0.49 0.86 0.87 0.88 0.89 0.9  0.91 0.92
 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1.  ]
(3997,) (3997, 566)


In [34]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import pickle


## Scaling the features 
- Normalize (IMP)

In [36]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_validation_scaled = scaler.fit_transform(X_validation)
X_test_scaled = scaler.transform(X_test)

## Classifiers

In [38]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

classifiers = {
    "Logistic Regression L2": LogisticRegression(penalty="l2"),
    "Random Forest Classifier Depth = 100": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=100),
    "Random Forest Classifier Depth = 50": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=50),
    "Random Forest Classifier Depth = 20": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=20),
    "Random Forest Classifier Depth = 10": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    "Random Forest Classifier Depth = 5": RandomForestClassifier(n_estimators=100, random_state=42, max_depth=5),
    "Support Vector Classifier C = 1": SVC(kernel='linear'), 
    "Support Vector Classifier C = 5": SVC(kernel='linear'), 
    "Support Vector Classifier C = 10": SVC(kernel='linear'), 
}

for name, clf in classifiers.items():
    
    clf.fit(X_train_scaled, y_train_labels)

    y_pred_train = clf.predict(X_train_scaled)
    
    accuracy_train = accuracy_score(y_train_labels, y_pred_train)
    
    y_pred_validation = clf.predict(X_validation_scaled)
    
    accuracy_validation = accuracy_score(y_validation_labels, y_pred_validation)
    
    y_pred_test = clf.predict(X_test_scaled)
    
    accuracy_test = accuracy_score(y_test_labels, y_pred_test)
    
    print(f"Classifier: {name}")
    print(f"Train Accuracy Score: {accuracy_train:.4f}")
    print(f"Validation Accuracy Score: {accuracy_validation:.4f}")
    print(f"Test Accuracy Score: {accuracy_test:.4f}")
    print("----------------------")


Classifier: Logistic Regression L2
Train Accuracy Score: 0.6495
Validation Accuracy Score: 0.6714
Test Accuracy Score: 0.6587
----------------------
Classifier: Random Forest Classifier Depth = 100
Train Accuracy Score: 0.9997
Validation Accuracy Score: 0.7007
Test Accuracy Score: 0.7262
----------------------
Classifier: Random Forest Classifier Depth = 50
Train Accuracy Score: 0.9997
Validation Accuracy Score: 0.7007
Test Accuracy Score: 0.7262
----------------------
Classifier: Random Forest Classifier Depth = 20
Train Accuracy Score: 0.9997
Validation Accuracy Score: 0.7104
Test Accuracy Score: 0.7239
----------------------
Classifier: Random Forest Classifier Depth = 10
Train Accuracy Score: 0.9189
Validation Accuracy Score: 0.7074
Test Accuracy Score: 0.7097
----------------------
Classifier: Random Forest Classifier Depth = 5
Train Accuracy Score: 0.7508
Validation Accuracy Score: 0.6999
Test Accuracy Score: 0.7022
----------------------
Classifier: Support Vector Classifier C =

### More Classifiers 

In [40]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

classifiers = {
    "k-NN Classifier with 5 Neighbors": KNeighborsClassifier(n_neighbors=5),
    "k-NN Classifier with 10 Neighbors": KNeighborsClassifier(n_neighbors=10),
    "k-NN Classifier with 20 Neighbors": KNeighborsClassifier(n_neighbors=20),
    "Gradient Boosting Classifier (learning_rate=0.1)": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "Gradient Boosting Classifier (learning_rate=0.05)": GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, random_state=42),
    "Gradient Boosting Classifier (learning_rate=0.01)": GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, random_state=42),
    "AdaBoost Classifier with n_estimators=50": AdaBoostClassifier(n_estimators=50, random_state=42),
    "AdaBoost Classifier with n_estimators=100": AdaBoostClassifier(n_estimators=100, random_state=42),
    "AdaBoost Classifier with n_estimators=200": AdaBoostClassifier(n_estimators=200, random_state=42),
    "Decision Tree Classifier with max_depth=3": DecisionTreeClassifier(max_depth=3, random_state=42),
    "Decision Tree Classifier with max_depth=5": DecisionTreeClassifier(max_depth=5, random_state=42),
    "Decision Tree Classifier with max_depth=10": DecisionTreeClassifier(max_depth=10, random_state=42)
}


for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train_labels)
    y_pred_train = clf.predict(X_train_scaled)
    accuracy_train = accuracy_score(y_train_labels, y_pred_train)
    y_pred_validation = clf.predict(X_validation_scaled)
    accuracy_validation = accuracy_score(y_validation_labels, y_pred_validation)
    y_pred_test = clf.predict(X_test_scaled)
    accuracy_test = accuracy_score(y_test_labels, y_pred_test)
    
    print(f"Classifier: {name}")
    print(f"Train Accuracy Score: {accuracy_train:.4f}")
    print(f"Validation Accuracy Score: {accuracy_validation:.4f}")
    print(f"Test Accuracy Score: {accuracy_test:.4f}")
    print("----------------------")

Classifier: k-NN Classifier with 5 Neighbors
Train Accuracy Score: 0.7933
Validation Accuracy Score: 0.6932
Test Accuracy Score: 0.6579
----------------------
Classifier: k-NN Classifier with 10 Neighbors
Train Accuracy Score: 0.7516
Validation Accuracy Score: 0.6744
Test Accuracy Score: 0.6399
----------------------
Classifier: k-NN Classifier with 20 Neighbors
Train Accuracy Score: 0.7300
Validation Accuracy Score: 0.6804
Test Accuracy Score: 0.6624
----------------------
Classifier: Gradient Boosting Classifier (learning_rate=0.1)
Train Accuracy Score: 0.8249
Validation Accuracy Score: 0.7164
Test Accuracy Score: 0.7314
----------------------
Classifier: Gradient Boosting Classifier (learning_rate=0.05)
Train Accuracy Score: 0.7671
Validation Accuracy Score: 0.7097
Test Accuracy Score: 0.7112
----------------------
Classifier: Gradient Boosting Classifier (learning_rate=0.01)
Train Accuracy Score: 0.7135
Validation Accuracy Score: 0.6917
Test Accuracy Score: 0.6842
-----------------

### Code here

In [None]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RANSACRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Initialize regressors
regressors = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(),
    "Polynomial Regression": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    "Elastic Net Regression": ElasticNet(),
    "RANSAC Regression": RANSACRegressor(),
}

# Loop over regressors
for name, reg in regressors.items():
    # Train the regressor
    reg.fit(X_train, y_train)
    
    # Make predictions on the training set
    y_pred_train = reg.predict(X_train)
    
    # Calculate R^2 score for training data
    r2_train = r2_score(y_train, y_pred_train)
    
    # Make predictions on the validation set
    y_pred_validation = reg.predict(X_validation)
    
    # Calculate R^2 score for validation data
    r2_validation = r2_score(y_validation, y_pred_validation)
    
    # Make predictions on the test set
    y_pred_test = reg.predict(X_test)
    
    # Calculate R^2 score for testing data
    r2_test = r2_score(y_test, y_pred_test)
    
    # Print results
    print(f"Regressor: {name}")
    print(f"Train R^2 Score: {r2_train:.4f}")
    print(f"Validation R^2 Score: {r2_validation:.4f}")
    print(f"Test R^2 Score: {r2_test:.4f}")
    print("----------------------")


Regressor: Linear Regression
Train R^2 Score: 0.0127
Validation R^2 Score: -0.0236
Test R^2 Score: -0.0179
----------------------
Regressor: Lasso Regression
Train R^2 Score: 0.0000
Validation R^2 Score: -0.0041
Test R^2 Score: -0.0013
----------------------
Regressor: Ridge Regression
Train R^2 Score: 0.0575
Validation R^2 Score: 0.0511
Test R^2 Score: 0.0587
----------------------
Regressor: Random Forest Regressor
Train R^2 Score: 0.8777
Validation R^2 Score: 0.1238
Test R^2 Score: 0.1420
----------------------
Regressor: SVR
Train R^2 Score: -0.0018
Validation R^2 Score: 0.0257
Test R^2 Score: 0.0255
----------------------
