In [1]:
# If you don't have the moduel please install them for this notebook
# !pip install Bio

In [2]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import itertools
from Bio.SeqUtils.ProtParam import ProteinAnalysis

from scipy.stats import randint, loguniform

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
import random
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns

# Read in data

In [3]:
# Read in data
data = pd.read_csv("metadata_org_w_features.csv")
data_evaluation = pd.read_csv("testing_data_w_features.csv")

# Information and sample data

In [4]:
# information on data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681 entries, 0 to 680
Columns: 589 entries, Entry to Gravy
dtypes: float64(584), int64(1), object(4)
memory usage: 3.1+ MB


In [5]:
# describe on data
data.describe()

Unnamed: 0,SequenceLength,A,C,D,E,F,G,H,I,K,...,DDD,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
count,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,...,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0
mean,888.660793,0.071173,0.023348,0.044993,0.055845,0.046743,0.065586,0.022028,0.056255,0.047262,...,0.002206,99103.867857,7.097584,0.096863,42.16057,0.995021,0.304995,0.279123,0.389817,-0.051027
std,827.859592,0.022832,0.0123,0.013231,0.017925,0.015308,0.01884,0.00775,0.018204,0.017596,...,0.003108,92308.45646,1.551349,0.021673,7.412366,0.006859,0.029081,0.032611,0.043874,0.325872
min,103.0,0.024283,0.0,0.007605,0.005714,0.007782,0.016807,0.004065,0.00984,0.002915,...,0.0,11710.27,4.383673,0.033074,15.359896,0.974438,0.222539,0.18315,0.24863,-0.757339
25%,384.0,0.056695,0.015251,0.037879,0.04428,0.035479,0.052764,0.016825,0.044171,0.034237,...,0.0,42771.0835,5.723255,0.082587,37.259593,0.991538,0.284024,0.257261,0.361333,-0.284269
50%,676.0,0.06746,0.021417,0.046046,0.058594,0.044747,0.065963,0.021544,0.056726,0.048017,...,0.001377,74697.6635,6.556855,0.094744,42.218182,0.996686,0.302326,0.277,0.382857,-0.12035
75%,1015.0,0.081784,0.030753,0.053254,0.067669,0.055838,0.078212,0.026769,0.068,0.058943,...,0.003229,112207.0978,8.660054,0.109974,46.958534,0.999503,0.323615,0.300946,0.416667,0.136484
max,7096.0,0.164,0.074873,0.116598,0.123102,0.110701,0.153509,0.056093,0.125461,0.104956,...,0.031637,794048.9258,11.114366,0.180812,66.024733,1.013136,0.403005,0.419863,0.546125,1.008772


In [6]:
data.head()

Unnamed: 0,Entry,CleanSequence,Selected_PDB,ProteinClass,SequenceLength,A,C,D,E,F,...,DDD,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,P21611,MGKAAAVVLVTLVALLGLAQADLTPKVQVYSRFPASAGTKNVLNCF...,3p73,MHC,119,0.109244,0.016807,0.058824,0.042017,0.058824,...,0.0,13041.7928,5.834204,0.109244,22.404202,0.996347,0.336134,0.277311,0.369748,-0.047059
1,Q66GT5,MAASAWLEAGLARVLFYPTLLYTVFRGRVRGPAHRDWYHRIDHTVL...,3rgo,Phosphatase,193,0.108808,0.015544,0.020725,0.067358,0.025907,...,0.0,21942.2575,9.739448,0.082902,37.915078,0.992205,0.367876,0.186528,0.38342,-0.16114
2,Q9Y006,MNLTIKEEDFTNTFMKNEESFNTFRVTKVKRWNAKRLFKILFVTVF...,3qvc,Protease,451,0.028825,0.008869,0.04878,0.068736,0.079823,...,0.002227,51692.5641,8.043414,0.137472,37.618204,1.002078,0.305987,0.290466,0.43459,-0.249667
3,P05622,MGLPGVIPALVLRGQLLLSVLWLLGPQTSRGLVITPPGPEFVLNIS...,1aya,RTK,1098,0.043716,0.017304,0.056466,0.069217,0.030055,...,0.00365,122788.2434,4.99401,0.083789,47.564763,0.999045,0.281421,0.311475,0.385246,-0.203097
4,P06343,MALQIPSLLLLAAVVVLTVLSSPGTEGGNSERHFVHQFQPFCYFTN...,1d9k,MHC,263,0.041825,0.019011,0.022814,0.076046,0.038023,...,0.0,29966.8989,8.99168,0.091255,46.750989,0.99569,0.262357,0.254753,0.39924,-0.303042


# Extract the relevant fields from the dataset

In [7]:
def extract_fields_from_data(data):
    column_start = data.columns.get_loc("SequenceLength")+1
    # Entry and ProteinClass
    df = data.loc[:,['Entry', 'ProteinClass']]
    # PDB
    selected_PDB = data.loc[:,['Selected_PDB']]
    # Sequence
    seq = data.loc[:,['CleanSequence']]
    # Sequence Length
    seq_L = data.loc[:,['SequenceLength']]
    # Amino Acid Frequencies
    freq = data.iloc[:, column_start : column_start+20]
    # All possible dipeptide frequencies
    dipep = data.iloc[:, column_start+20 : column_start+20 + 400]
    # Reduced Amino Acid Alphabet Frequencies
    red_freq = data.iloc[:, column_start+20 + 400 : column_start+20 + 400 + 5]
    # N-Gram Profiles of Reduced Amino Acid Alphabet
    red_ngram = data.iloc[:, column_start+20 + 400 + 5 : column_start+20 + 400 + 5 + 150]
    # Protein Properties
    prop = data.iloc[:, column_start+20 + 400 + 5 + 150 :]
    
    return df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop

# For data with no ProteinClass
def extract_fields_from_data_evaluation(data):
    column_start = data.columns.get_loc("SequenceLength")+1
    # Entry and ProteinClass
    df = data.loc[:,['Entry']]
    # PDB
    selected_PDB = data.loc[:,['Selected_PDB']]
    # Sequence
    seq = data.loc[:,['CleanSequence']]
    # Sequence Length
    seq_L = data.loc[:,['SequenceLength']]
    # Amino Acid Frequencies
    freq = data.iloc[:, column_start : column_start+20]
    # All possible dipeptide frequencies
    dipep = data.iloc[:, column_start+20 : column_start+20 + 400]
    # Reduced Amino Acid Alphabet Frequencies
    red_freq = data.iloc[:, column_start+20 + 400 : column_start+20 + 400 + 5]
    # N-Gram Profiles of Reduced Amino Acid Alphabet
    red_ngram = data.iloc[:, column_start+20 + 400 + 5 : column_start+20 + 400 + 5 + 150]
    # Protein Properties
    prop = data.iloc[:, column_start+20 + 400 + 5 + 150 :]
    
    return df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop

In [8]:
(df, selected_PDB, seq, seq_L, freq, dipep,
 red_freq, red_ngram, prop) = extract_fields_from_data(data)

# Evaluation data
(df_evaluation, selected_PDB_evaluation, seq_evaluation, seq_L_evaluation,
  freq_evaluation, dipep_evaluation, red_freq_evaluation, red_ngram_evaluation, prop_evaluation) = extract_fields_from_data_evaluation(data_evaluation)

# You can use this section if you wanted to add more fetures to the data

In [9]:
# Feel free to add other feature to this data

# Construct Dataset with any of the extracted relevant fields

In [10]:
# This is an example of construction of the data with freq and red_freq, but you can add and make any field
Dataset = pd.concat([df, freq, red_freq, prop], axis=1)
Dataset_evaluation = pd.concat([df_evaluation, freq_evaluation, red_freq_evaluation, prop_evaluation], axis=1)

In [11]:
# Training Data with ProteinClass
Dataset.head()

Unnamed: 0,Entry,ProteinClass,A,C,D,E,F,G,H,I,...,D.1,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,P21611,MHC,0.109244,0.016807,0.058824,0.042017,0.058824,0.058824,0.02521,0.016807,...,0.10084,13041.7928,5.834204,0.109244,22.404202,0.996347,0.336134,0.277311,0.369748,-0.047059
1,Q66GT5,Phosphatase,0.108808,0.015544,0.020725,0.067358,0.025907,0.051813,0.041451,0.046632,...,0.088083,21942.2575,9.739448,0.082902,37.915078,0.992205,0.367876,0.186528,0.38342,-0.16114
2,Q9Y006,Protease,0.028825,0.008869,0.04878,0.068736,0.079823,0.053215,0.015521,0.062084,...,0.117517,51692.5641,8.043414,0.137472,37.618204,1.002078,0.305987,0.290466,0.43459,-0.249667
3,P05622,RTK,0.043716,0.017304,0.056466,0.069217,0.030055,0.06102,0.021858,0.050091,...,0.125683,122788.2434,4.99401,0.083789,47.564763,0.999045,0.281421,0.311475,0.385246,-0.203097
4,P06343,MHC,0.041825,0.019011,0.022814,0.076046,0.038023,0.072243,0.030418,0.041825,...,0.098859,29966.8989,8.99168,0.091255,46.750989,0.99569,0.262357,0.254753,0.39924,-0.303042


In [12]:
# Evaluation Data without ProteinClass
Dataset_evaluation.head()

Unnamed: 0,Entry,A,C,D,E,F,G,H,I,K,...,D.1,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,Q9LF79,0.081006,0.010242,0.054935,0.053073,0.031657,0.089385,0.01676,0.069832,0.065177,...,0.108007,116172.6688,7.86561,0.05959,33.680642,0.999017,0.323091,0.292365,0.384544,0.028026
1,P9WI81,0.108626,0.003195,0.073482,0.043131,0.023962,0.08147,0.019169,0.052716,0.028754,...,0.116613,66509.0117,5.223413,0.049521,28.408163,1.001449,0.263578,0.332268,0.34984,-0.173163
2,P04439,0.09863,0.013699,0.065753,0.063014,0.021918,0.079452,0.024658,0.035616,0.030137,...,0.128767,40840.2477,5.655275,0.090411,36.957014,1.000302,0.293151,0.265753,0.345205,-0.49863
3,Q16581,0.056017,0.03112,0.045643,0.03112,0.070539,0.045643,0.020747,0.051867,0.026971,...,0.076763,53863.6435,6.20218,0.114108,40.862697,0.989447,0.255187,0.307054,0.425311,0.204979
4,Q6QNK2,0.072082,0.020595,0.026316,0.040046,0.050343,0.064073,0.04119,0.050343,0.038902,...,0.066362,96528.7218,8.021753,0.110984,33.635481,0.990173,0.283753,0.28833,0.416476,0.15595


In [13]:
## To Do ##
# Construct your own dataset with these field and any other field that you see fit
# Make sure you do both the training and evaluation data as haveing the same fields

# Processing Steps

In [14]:
def processing_data(df):
    # Define X and y
    y = df['ProteinClass']
    X = df.drop(['Entry', 'ProteinClass'], axis=1)
    if 0:
        print("Feature sample:")
        print(X.head())
        print("\nTarget sample:")
        print(y.head())

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Feature scaling (important for logistic regression, gradient boosting, MLP, etc.)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)


    # Defined labels
    labels = np.array(['ATPase', 'Aquaporin', 'Channel', 'GPCR', 'Integrin', 'MHC',
       'Phosphatase', 'Protease', 'RTK', 'Ser:Thr'])
    # Create a Label Encoder
    label_encoder = LabelEncoder()
    # Fit the label encoder on all possible labels, will make the labels [0, 1, 2, 3 ...]
    label_encoder.fit(labels)
    # Transform both training and testing labels
    y_label_encoded = label_encoder.transform(y)
    y_train_label_encoded = label_encoder.transform(y_train)
    y_test_label_encoded = label_encoder.transform(y_test)
    # print('first sample label encoded:', y_train_label_encoded[0])

    return (label_encoder, X.values, X_scaled, X_train, X_train_scaled, X_test, X_test_scaled,
            y, y_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)


def processing_data_evaluation(df,scaler):
    # Define X
    X = df.drop(['Entry'], axis=1)
    
    # Feature scaling (important for logistic regression, gradient boosting, MLP, etc.)
    X_scaled = scaler.fit_transform(X)

    return (X.values, X_scaled)
    

In [15]:
(label_encoder, X, X_scaled, X_train, X_train_scaled, X_test, X_test_scaled, y, y_label_encoded,
y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)  = processing_data(Dataset)

# Evaluation data
X_evaluation, X_scaled_evaluation = processing_data_evaluation(Dataset_evaluation, scaler)

# Creating Model

In [16]:
# random model
class random_model:
    def __init__(self):
        pass
    def fit(self,X, y):
        pass
    def predict(self,X):
        pred = []
        for x in X:
            pred.append(np.random.randint(10))
        return np.array(pred)
    def get_params(self, deep=True):
        return {}
    
# Initialize my random model
my_model = random_model()
# Initialize Logistic Regression model
lr_model = LogisticRegression()

## To Do ##
# Create 2 more models here like random forest, leanear model, etc and initialize it
# you can use sklear, torch, etc 

# Train Model

In [17]:
# Fit My Model to the training data
my_model.fit(X_train, y_train)

# Fit the model to the training data without tuning
lr_model.fit(X_train_scaled, y_train)

## To Do ##
# Fit your models to the training data you can use
# raw, scaled, or other forms

# Training and Tuning

In [18]:
# Define the parameter distribution for 'C'
param_dist_lr = {
    'C': loguniform(1e-3, 1e3),
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'multi_class': ['multinomial'],
    'max_iter': [500, 1000, 2000],
    'random_state': [42]
}

# Set up RandomizedSearchCV
random_search_lr = RandomizedSearchCV(
    estimator=lr_model,
    param_distributions=param_dist_lr,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# Fit the model to the training data with tuning
random_search_lr.fit(X_train_scaled, y_train)
# Get the best estimator
best_lr_model = random_search_lr.best_estimator_
print("Best parameters found for Logistic Regression:")
print(random_search_lr.best_params_)

## To Do ##
# Fit your models to the training data you can use
# raw, scaled, or other forms
# Make sure to do tuning this time

Best parameters found for Logistic Regression:
{'C': 0.0895327624764271, 'max_iter': 500, 'multi_class': 'multinomial', 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs'}




# Evaluating model

In [19]:
def evaluate_model(model, X_test, y_test, model_name):

    y_pred = model.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n=== {model_name} ===")
    print(f"Accuracy for testing data: {accuracy:.4f}")
    return y_pred

def evaluate_model_testing(model, X_test, model_name):

    y_pred = model.predict(X_test)

    print(f"\n=== {model_name} ===")
    print("Output Evaluated")
    return y_pred

In [20]:
# Evaluate my model
y_pred_my_model = evaluate_model(my_model, X_test_scaled, y_test_label_encoded, model_name="My Random Model")
# Convert back to labels
y_pred_my_model = label_encoder.inverse_transform(y_pred_my_model)

# Evaluate Logistic Regression
y_pred_lr_model = evaluate_model(lr_model, X_test_scaled, y_test, model_name="Logistic Regression")


# Evaluate Logistic Regression Best estimator
y_pred_best_lr_model = evaluate_model(best_lr_model, X_test_scaled, y_test, model_name="Logistic Regression Best")



## To Do ##
# Evaluate all models 



=== My Random Model ===
Accuracy for testing data: 0.0949

=== Logistic Regression ===
Accuracy for testing data: 0.6642

=== Logistic Regression Best ===
Accuracy for testing data: 0.6715


## Evaluate the provided data for the competition

In [21]:
# Evaluate my model
y_pred_my_model = evaluate_model_testing(my_model, X_evaluation, model_name="My Random Model")
# Convert back to labels
y_pred_my_model_evaluation = label_encoder.inverse_transform(y_pred_my_model)


# Evaluate Logistic Regression
y_pred_lr_model_evaluation = evaluate_model_testing(lr_model, X_scaled_evaluation, model_name="Logistic Regression")


# Evaluate Logistic Regression Best Model
y_pred_lr_best_model_evaluation = evaluate_model_testing(best_lr_model, X_scaled_evaluation, model_name="Logistic Regression Best")

## To Do ##
# Evaluate all the models with the competition data



=== My Random Model ===
Output Evaluated

=== Logistic Regression ===
Output Evaluated

=== Logistic Regression Best ===
Output Evaluated


In [22]:
# Prediction of my model on testing data
print("\n=== My Model ===")
print(y_pred_my_model_evaluation[:5])

# Prediction of my LR on testing data
print("\n=== Logistic Regression ===")
print(y_pred_lr_model_evaluation[:5])

# Prediction of my LR on testing data
print("\n=== Logistic Regression Best ===")
print(y_pred_lr_best_model_evaluation[:5])

## To Do ##
# Print the first 5 examples for all model output for the competition data
# Make sure that the output is converted back into a list of string for the Protien class


=== My Model ===
['Integrin' 'RTK' 'Aquaporin' 'ATPase' 'Protease']

=== Logistic Regression ===
['ATPase' 'Phosphatase' 'MHC' 'GPCR' 'GPCR']

=== Logistic Regression Best ===
['ATPase' 'Phosphatase' 'MHC' 'GPCR' 'GPCR']


# Creat csv file for Evaluation

In [23]:
# Function to creat csv output for uploading
def save_predictions(_fn, _y_pred, _df):
    import csv
    with open(_fn, 'w') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['Entry', 'ProteinClass'])
        for y, Entry in zip(_df['Entry'], _y_pred):
            writer.writerow([y, Entry])

In [24]:
# Saving My random model output 
save_predictions('Student_name_attempt_1.csv', y_pred_my_model_evaluation, Dataset_evaluation )
# Saving LR model output
save_predictions('Student_name_attempt_2.csv', y_pred_lr_model_evaluation, Dataset_evaluation )
# Saving Best LR model output
save_predictions('Student_name_attempt_3.csv', y_pred_lr_best_model_evaluation, Dataset_evaluation )

## To Do ##
# Save the output of your models

In [25]:
## To Do ##
# Make sure to upload at least 3 attemps to the website
# Good Luck