In [None]:
# If you don't have these modules, please install them for this notebook
!pip install -q Bio transformers sentencepiece torch  torchvision tensorflow tqdm fair-esm scikit-learn scipy peptides protlearn skrebate shap bayesian-optimization torch-geometric

In [None]:
import re
import numpy as np
import pandas as pd
from collections import Counter
import itertools
from Bio.SeqUtils.ProtParam import ProteinAnalysis

from scipy.stats import randint, loguniform

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
import random
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive so later cells can read the project files.
# The google.colab module is Colab-specific; this cell is not expected to run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

# Change into the ProtClassify folder on the mounted Drive.
# Every relative path used later (the CSV inputs, the saved .npy embeddings,
# the submission CSVs) is resolved against this working directory.
import os
os.chdir('/content/drive/MyDrive/ProtClassify')
print("PWD:", os.getcwd())
# Should list all your notebooks, data/, combined_with_pfam.parquet, etc.
!ls -l

# Read in data

In [None]:
# Read in data
# `data` is the labelled training table; `data_evaluation` is the competition
# table, which the extraction helper below treats as having no ProteinClass column.
data = pd.read_csv("metadata_org_w_features.csv")
data_evaluation = pd.read_csv("testing_data_w_features.csv")

# Information and sample data

In [None]:
# information on data
data.info()

In [None]:
# describe on data
data.describe()

In [None]:
data.head()

# Extract the relevant fields from the dataset

In [None]:
# Widths of the consecutive hand-crafted feature groups that start right after
# the "SequenceLength" column, listed in column order. Everything after the
# last group is treated as the protein-property block.
_FEATURE_GROUP_WIDTHS = (
    ("freq", 20),        # amino acid frequencies
    ("dipep", 400),      # all possible dipeptide frequencies
    ("red_freq", 5),     # reduced amino acid alphabet frequencies
    ("red_ngram", 150),  # n-gram profiles of the reduced alphabet
)


def _extract_fields(data, id_columns):
    """Slice a feature table into its identifier columns and feature groups.

    Args:
        data: DataFrame containing `id_columns`, 'Selected_PDB',
            'CleanSequence', 'SequenceLength', and, immediately after
            'SequenceLength', the positional feature groups described by
            `_FEATURE_GROUP_WIDTHS` followed by the protein-property columns.
        id_columns: Column names that make up the first returned frame.

    Returns:
        Tuple of nine DataFrames:
        (df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop).
    """
    column_start = data.columns.get_loc("SequenceLength") + 1

    df = data.loc[:, list(id_columns)]
    selected_PDB = data.loc[:, ['Selected_PDB']]
    seq = data.loc[:, ['CleanSequence']]
    seq_L = data.loc[:, ['SequenceLength']]

    # Walk the fixed-width groups left to right so no offset is spelled twice.
    groups = []
    start = column_start
    for _, width in _FEATURE_GROUP_WIDTHS:
        groups.append(data.iloc[:, start:start + width])
        start += width
    freq, dipep, red_freq, red_ngram = groups

    # Whatever remains to the right is the protein-property block.
    prop = data.iloc[:, start:]

    return df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop


def extract_fields_from_data(data):
    """Split the labelled training table into its field groups.

    The first returned frame holds 'Entry' and 'ProteinClass'; see
    `_extract_fields` for the remaining eight frames.
    """
    return _extract_fields(data, ['Entry', 'ProteinClass'])


# For data with no ProteinClass
def extract_fields_from_data_evaluation(data):
    """Split the unlabelled evaluation table into its field groups.

    Identical to `extract_fields_from_data` except that the first returned
    frame holds only 'Entry', because the table has no 'ProteinClass' column.
    """
    return _extract_fields(data, ['Entry'])

In [None]:
# Training data: unpack the nine field groups (ids + label, PDB id, sequence,
# sequence length, and the five hand-crafted feature groups) as DataFrames.
(df, selected_PDB, seq, seq_L, freq, dipep,
 red_freq, red_ngram, prop) = extract_fields_from_data(data)

# Evaluation data
# Same nine groups with an `_evaluation` suffix; `df_evaluation` holds only 'Entry'.
(df_evaluation, selected_PDB_evaluation, seq_evaluation, seq_L_evaluation,
  freq_evaluation, dipep_evaluation, red_freq_evaluation, red_ngram_evaluation, prop_evaluation) = extract_fields_from_data_evaluation(data_evaluation)

# You can use this section if you want to add more features to the data

In [None]:
# Feel free to add other features to this data

In [None]:
# Part 2: Imports and Drive Mount
# -------------------------------
# Standard libraries and GPU/device handling are imported.
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch

# Hugging Face transformers (slow tokenizer fallback is enforced)
from transformers import AutoTokenizer, T5EncoderModel

# ESM2 from FAIR
import esm

In [None]:
# Part 3: Load ProtT5 and ESM2 models
# -------------------------------
# ProtT5 tokenizer and model are loaded, with slow Python tokenizer ensuring Unigram compatibility.
# `device` is shared by both embedding helpers defined in the next cell.
device = "cuda" if torch.cuda.is_available() else "cpu"

pt5_tokenizer = AutoTokenizer.from_pretrained(
    "Rostlab/prot_t5_xl_uniref50",
    do_lower_case=False,
    use_fast=False
)
# Encoder-only model, moved to `device` and switched to eval mode.
# NOTE: this cell binds the encoder to `pt5`, while the later "CELL 3" binds a
# second copy to `pt5_encoder`; each embed_with_prott5 definition reads its own name.
pt5 = T5EncoderModel.from_pretrained(
    "Rostlab/prot_t5_xl_uniref50"
).to(device).eval()

# ESM2 650M is loaded along with its batch converter
esm_model, esm_alphabet = esm.pretrained.esm2_t33_650M_UR50D()
esm_model = esm_model.to(device).eval()
# The batch converter turns (label, sequence) pairs into a padded token tensor.
esm_batch_converter = esm_alphabet.get_batch_converter()

print("ProtT5 and ESM2 models successfully loaded on", device)


In [None]:
# Part 4: Embedding helper functions
# -------------------------------
# Functions are defined to produce mean-pooled embeddings from raw sequences.

def embed_with_prott5(sequences, batch_size=16):
    """Return mean-pooled ProtT5 embeddings, one row per input sequence.

    ProtT5's tokenizer expects residues separated by single spaces and the
    rare residues U, Z, O and B replaced by X (see the Rostlab model card).
    Feeding it a raw, unspaced string does not yield per-residue tokens, so
    the sequences are normalised here before tokenisation.

    Args:
        sequences: List of amino-acid strings (with or without whitespace).
        batch_size: Number of sequences per forward pass.

    Returns:
        numpy array of shape (len(sequences), hidden_dim).

    Raises:
        ValueError: from np.vstack when `sequences` is empty.

    Relies on the notebook globals `pt5_tokenizer`, `pt5` and `device`.
    """
    import re  # local import keeps the cell runnable on its own

    prepared = [
        " ".join(re.sub(r"[UZOB]", "X", re.sub(r"\s+", "", s)))
        for s in sequences
    ]

    all_embeds = []
    for i in tqdm(range(0, len(prepared), batch_size), desc="ProtT5"):
        batch = pt5_tokenizer(
            prepared[i:i+batch_size],
            return_tensors="pt",
            padding=True
        )
        inputs = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            out = pt5(**inputs).last_hidden_state
        # (B, L) -> (B, L, 1) so padded positions contribute zero to the sum.
        mask = inputs["attention_mask"].unsqueeze(-1)
        summed = (out * mask).sum(1)
        counts = mask.sum(1)
        all_embeds.append((summed / counts).cpu().numpy())
    return np.vstack(all_embeds)


def embed_with_esm2(sequences, batch_size=1):
    """Return mean-pooled ESM2 (layer 33) embeddings, one row per sequence.

    For each sequence the non-padding token count is measured, and the
    representations at positions 1 .. count-2 are averaged, i.e. the first
    and last non-padding positions are left out of the mean.

    Relies on the notebook globals `esm_batch_converter`, `esm_model`,
    `esm_alphabet` and `device`.
    """
    total = len(sequences)
    chunks = []
    for start in tqdm(range(0, total, batch_size), desc="ESM2"):
        stop = min(start + batch_size, total)
        named = [(str(idx), sequences[idx]) for idx in range(start, stop)]

        # Only the token tensor is needed; labels and raw strings are ignored.
        _, _, tokens = esm_batch_converter(named)
        tokens = tokens.to(device)

        with torch.no_grad():
            output = esm_model(
                tokens, repr_layers=[33], return_contacts=False
            )
        hidden = output["representations"][33]

        # Number of non-padding tokens per row.
        n_real = (tokens != esm_alphabet.padding_idx).sum(1)
        pooled = [
            hidden[row, 1: n_tok - 1].mean(0).cpu().numpy()
            for row, n_tok in enumerate(n_real)
        ]
        chunks.append(np.stack(pooled))
    return np.vstack(chunks)


In [None]:
# Part 5: Generate and concatenate all features
# -------------------------------
# Raw feature columns (freq, dipep, reduced, n-gram, prop) are loaded and embedded sequences are computed.

# 1) Metadata CSVs are read
data       = pd.read_csv("metadata_org_w_features.csv")
eval_data  = pd.read_csv("testing_data_w_features.csv")

# 2) Sequence lists are extracted
train_seqs = data["CleanSequence"].tolist()
eval_seqs  = eval_data["CleanSequence"].tolist()

# 3) Transformer embeddings are computed
print("Embedding training sequences…")
pt5_train = embed_with_prott5(train_seqs)
esm_train = embed_with_esm2(train_seqs)

print("Embedding evaluation sequences…")
pt5_eval = embed_with_prott5(eval_seqs)
esm_eval = embed_with_esm2(eval_seqs)

# 4) Hand-crafted features are sliced out as NumPy arrays.
#    The training arrays carry a `_train` suffix so they do NOT overwrite the
#    DataFrames `freq`, `dipep`, `red_freq`, `red_ngram`, `prop` extracted
#    earlier; the "Construct Dataset" cell still passes those to pd.concat,
#    which rejects plain ndarrays.
#    Each table gets its own base offset because the column position of
#    "SequenceLength" is looked up per table.
train_base = data.columns.get_loc("SequenceLength") + 1
eval_base  = eval_data.columns.get_loc("SequenceLength") + 1

freq_train      = data.iloc[:, train_base       : train_base + 20 ].values
dipep_train     = data.iloc[:, train_base + 20  : train_base + 420].values
red_freq_train  = data.iloc[:, train_base + 420 : train_base + 425].values
red_ngram_train = data.iloc[:, train_base + 425 : train_base + 575].values
prop_train      = data.iloc[:, train_base + 575 : ].values

freq_eval      = eval_data.iloc[:, eval_base       : eval_base + 20 ].values
dipep_eval     = eval_data.iloc[:, eval_base + 20  : eval_base + 420].values
red_freq_eval  = eval_data.iloc[:, eval_base + 420 : eval_base + 425].values
red_ngram_eval = eval_data.iloc[:, eval_base + 425 : eval_base + 575].values
prop_eval      = eval_data.iloc[:, eval_base + 575 : ].values

# 5) Final feature matrices are assembled
X_train = np.hstack([pt5_train, esm_train, freq_train, dipep_train, red_freq_train, red_ngram_train, prop_train])
X_eval  = np.hstack([pt5_eval,  esm_eval,  freq_eval, dipep_eval, red_freq_eval, red_ngram_eval, prop_eval])

print("Final training matrix shape:", X_train.shape)
print("Final evaluation matrix shape:", X_eval.shape)


In [None]:
# ─── CELL 1: CLEAR GPU MEMORY ────────────────────────────────────────────────
import torch, gc

# The Part 3 cell left a ProtT5 encoder bound to `pt5` and an ESM2 model bound
# to `esm_model`. As long as those names exist their weights stay allocated and
# empty_cache() cannot return that memory, so the stale references are dropped
# first. CELL 3 below reloads both models (`pt5_encoder`, `esm_model`).
for _stale_name in ("pt5", "esm_model"):
    globals().pop(_stale_name, None)
del _stale_name

# Collect garbage BEFORE emptying the cache: tensors freed by the collector go
# back to PyTorch's caching allocator, and only empty_cache() hands those
# unused blocks back to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("GPU cache and Python garbage collector have been cleared.")

In [None]:
# ─── CELL 2: STANDARD IMPORTS & DEVICE SETUP ────────────────────────────────
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Hugging Face transformers for ProtT5
from transformers import AutoTokenizer, T5EncoderModel

# FAIR ESM for ESM2 embeddings
import esm

# Re-check that GPU is available (optional)
# `torch` itself is imported in the CELL 1 cell above, not in this cell.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
# ─── CELL 3: LOAD TRANSFORMER MODELS (FULL PRECISION) ───────────────────────

# ProtT5-XL tokenizer & encoder are loaded in full precision.
pt5_tokenizer = AutoTokenizer.from_pretrained(
    "Rostlab/prot_t5_xl_uniref50",
    do_lower_case=False,
    use_fast=False    # use_fast=False ensures compatibility with the original Unigram tokenizer
)
# Bound to `pt5_encoder` (the earlier Part 3 cell used the name `pt5`); the
# embed_with_prott5 definition in CELL 4 reads this name.
pt5_encoder = T5EncoderModel.from_pretrained(
    "Rostlab/prot_t5_xl_uniref50"
).to(device).eval()  # full precision by default

# ESM2 (650M) model and alphabet are loaded for fair-esm
esm_model, esm_alphabet = esm.pretrained.esm2_t33_650M_UR50D()
esm_model = esm_model.to(device).eval()
# Converts (label, sequence) pairs into the padded token tensor ESM2 consumes.
esm_batch_converter = esm_alphabet.get_batch_converter()

print("ProtT5 and ESM2 models have been loaded in full precision.")

In [None]:






# ─── CELL 4: EMBEDDING FUNCTIONS ──────────

def embed_with_prott5(
    sequences: list[str],
    batch_size: int = 8
) -> np.ndarray:
    """
    Generates mean-pooled ProtT5 embeddings for a list of protein sequences.

    Each sequence is first normalised to the format the ProtT5 tokenizer
    expects (whitespace removed, rare residues U/Z/O/B mapped to X, residues
    separated by single spaces; see the Rostlab model card). Passing raw,
    unspaced strings does not produce per-residue tokens. The prepared
    sequences are tokenized, passed through the T5 encoder, and the last
    hidden states are mean-pooled over the positions marked by the attention
    mask (i.e. excluding padding). Results are returned as an (N × D) array.

    Relies on the notebook globals `pt5_tokenizer`, `pt5_encoder` and `device`.

    Args:
        sequences: List of amino-acid strings.
        batch_size: Number of sequences processed per forward pass.

    Returns:
        A numpy array of shape (len(sequences), embedding_dim).

    Raises:
        ValueError: from np.vstack when `sequences` is empty.
    """
    import re  # local import keeps the cell runnable on its own

    prepared = [
        " ".join(re.sub(r"[UZOB]", "X", re.sub(r"\s+", "", s)))
        for s in sequences
    ]

    all_embeds = []
    for i in tqdm(range(0, len(prepared), batch_size), desc="ProtT5"):
        batch = pt5_tokenizer(
            prepared[i : i + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True
        )
        # Move token tensors to GPU/CPU
        inputs = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            # last_hidden_state: (B, L, D)
            out = pt5_encoder(**inputs).last_hidden_state
        # attention_mask shape (B, L) → (B, L, 1)
        mask = inputs["attention_mask"].unsqueeze(-1)
        # sum token embeddings, then divide by counts
        summed = (out * mask).sum(dim=1)
        counts = mask.sum(dim=1)
        all_embeds.append((summed / counts).cpu().numpy())

    return np.vstack(all_embeds)


def embed_with_esm2(
    sequences: list[str],
    batch_size: int = 1
) -> np.ndarray:
    """
    Mean-pooled ESM2 (layer 33) embeddings for a list of protein sequences.

    Sequences are processed `batch_size` at a time (default 1 to keep peak
    GPU usage low). After the forward pass, each row is averaged over
    positions 1 .. n-2, where n is that row's count of non-padding tokens,
    so the start/end tokens and any padding are excluded from the mean.
    The CUDA cache is emptied after every batch.

    Relies on the notebook globals `esm_batch_converter`, `esm_model`,
    `esm_alphabet` and `device`.

    Args:
        sequences: List of amino-acid strings.
        batch_size: Batch size (default 1 for memory safety).

    Returns:
        A numpy array of shape (len(sequences), embedding_dim).
    """
    total = len(sequences)
    chunks = []
    for start in tqdm(range(0, total, batch_size), desc="ESM2"):
        stop = min(start + batch_size, total)
        named = [(str(idx), sequences[idx]) for idx in range(start, stop)]

        # Only the token tensor is used; labels and raw strings are discarded.
        _, _, tokens = esm_batch_converter(named)
        tokens = tokens.to(device)

        with torch.no_grad():
            output = esm_model(tokens, repr_layers=[33], return_contacts=False)
        hidden = output["representations"][33]  # (B, L, D)

        # Per-row count of non-padding tokens.
        n_real = (tokens != esm_alphabet.padding_idx).sum(dim=1)

        pooled = []
        for row in range(hidden.size(0)):
            last = n_real[row] - 1
            pooled.append(hidden[row, 1:last].mean(dim=0).cpu().numpy())
        chunks.append(np.stack(pooled))

        # release cached blocks between batches
        torch.cuda.empty_cache()

    return np.vstack(chunks)

# ─── CELL 5: EMBED ALL SEQUENCES & SAVE TO DISK ─────────────────────────────
# Embeddings are written to .npy files in the current working directory so the
# assembly step below can reload them without the models being resident.

# 1) Read in the CSVs
data       = pd.read_csv("metadata_org_w_features.csv")
eval_data  = pd.read_csv("testing_data_w_features.csv")

# 2) Extract sequences
train_seqs = data["CleanSequence"].tolist()
eval_seqs  = eval_data["CleanSequence"].tolist()

# 3) Compute & save ProtT5 embeddings
print("▶ Embedding training sequences with ProtT5…")
pt5_train = embed_with_prott5(train_seqs)
print("▶ Embedding eval sequences with ProtT5…")
pt5_eval  = embed_with_prott5(eval_seqs)
np.save("pt5_train.npy", pt5_train)
np.save("pt5_eval.npy",  pt5_eval)

# 4) Clear models + caches before ESM2
# After this `del`, embed_with_prott5 raises NameError until CELL 3 is re-run,
# so this cell cannot simply be executed twice in a row.
# `gc` comes from the `import torch, gc` line in CELL 1.
del pt5_encoder, pt5_tokenizer
torch.cuda.empty_cache()
gc.collect()

# 5) Compute & save ESM2 embeddings
print("▶ Embedding training sequences with ESM2…")
esm_train = embed_with_esm2(train_seqs)
print("▶ Embedding eval sequences with ESM2…")
esm_eval  = embed_with_esm2(eval_seqs)
np.save("esm_train.npy", esm_train)
np.save("esm_eval.npy",  esm_eval)

# 6) Unload ESM2 model and clear caches again
# Same caveat as step 4: embed_with_esm2 needs CELL 3 re-run after this point.
del esm_model
torch.cuda.empty_cache()
gc.collect()

print("✅ All embeddings computed and saved to disk.")

# ─── CELL 6: ASSEMBLE FINAL FEATURE MATRICES ────────────────────────────────

# 1) Reload saved embeddings (into CPU RAM)
pt5_train = np.load("pt5_train.npy")
esm_train = np.load("esm_train.npy")
pt5_eval  = np.load("pt5_eval.npy")
esm_eval  = np.load("esm_eval.npy")

# 2) Hand-crafted features are extracted from the original DataFrames.
#    The offset is looked up separately for each table: the evaluation table
#    is the one without ProteinClass, so its "SequenceLength" column may sit
#    at a different position than in the training table. Reusing the training
#    offset there would shift every evaluation feature group.
base_idx      = data.columns.get_loc("SequenceLength") + 1
eval_base_idx = eval_data.columns.get_loc("SequenceLength") + 1

#    Training arrays carry a `_train` suffix so they do not overwrite the
#    DataFrames `freq`, `dipep`, `red_freq`, `red_ngram`, `prop` extracted
#    earlier, which the "Construct Dataset" cell still passes to pd.concat.
freq_train      = data.iloc[:, base_idx : base_idx + 20].values
dipep_train     = data.iloc[:, base_idx + 20 : base_idx + 420].values
red_freq_train  = data.iloc[:, base_idx + 420 : base_idx + 425].values
red_ngram_train = data.iloc[:, base_idx + 425 : base_idx + 575].values
prop_train      = data.iloc[:, base_idx + 575 : ].values

freq_eval      = eval_data.iloc[:, eval_base_idx : eval_base_idx + 20].values
dipep_eval     = eval_data.iloc[:, eval_base_idx + 20 : eval_base_idx + 420].values
red_freq_eval  = eval_data.iloc[:, eval_base_idx + 420 : eval_base_idx + 425].values
red_ngram_eval = eval_data.iloc[:, eval_base_idx + 425 : eval_base_idx + 575].values
prop_eval      = eval_data.iloc[:, eval_base_idx + 575 : ].values

# 3) Final feature stacks are formed
X_train = np.hstack([pt5_train, esm_train, freq_train, dipep_train, red_freq_train, red_ngram_train, prop_train])
X_eval  = np.hstack([pt5_eval,  esm_eval,  freq_eval, dipep_eval, red_freq_eval, red_ngram_eval, prop_eval])

print("Final training matrix shape:", X_train.shape)
print("Final evaluation matrix shape:", X_eval.shape)

# ─── NEXT STEPS ───────────────────────────────────────────────────────────────
# • A downstream classifier (e.g. MLP, XGBoost) may now be trained on X_train, y_train.
# • Final predictions on X_eval can be generated and saved for submission.


# Construct Dataset with any of the extracted relevant fields

In [None]:
# This is an example of construction of the data with freq and red_freq, but you can add and make any field
# The embedding cells above rebind `freq`, `red_freq` and `prop` to NumPy
# arrays, and pd.concat only accepts Series/DataFrame objects. The DataFrame
# groups are therefore re-extracted here so this cell does not depend on which
# of the earlier cells ran last.
# Tuple positions: 0=df, 4=freq, 6=red_freq, 8=prop.
_train_fields = extract_fields_from_data(data)
df, freq, red_freq, prop = (_train_fields[i] for i in (0, 4, 6, 8))

_eval_fields = extract_fields_from_data_evaluation(data_evaluation)
df_evaluation, freq_evaluation, red_freq_evaluation, prop_evaluation = (
    _eval_fields[i] for i in (0, 4, 6, 8)
)

Dataset = pd.concat([df, freq, red_freq, prop], axis=1)
Dataset_evaluation = pd.concat([df_evaluation, freq_evaluation, red_freq_evaluation, prop_evaluation], axis=1)

In [None]:
# Training Data with ProteinClass
Dataset.head()

In [None]:
# Evaluation Data without ProteinClass
Dataset_evaluation.head()

In [None]:
## To Do ##
# Construct your own dataset with these fields and any other fields that you see fit
# Make sure both the training and evaluation data have the same fields

# Processing Steps

In [None]:
def processing_data(df):
    """Split, scale and label-encode the labelled dataset.

    The scaler is fit on the training split only and then applied to the
    test split and to the full feature table, so no test-set statistics
    leak into the scaling used for training.

    Args:
        df: DataFrame with 'Entry', 'ProteinClass' and numeric feature columns.

    Returns:
        A 14-tuple, in this order:
        (label_encoder, X.values, X_scaled, X_train, X_train_scaled,
         X_test, X_test_scaled, y, y_label_encoded, y_train,
         y_train_label_encoded, y_test, y_test_label_encoded, scaler)
        where the `*_scaled` entries are arrays produced by `scaler`, and the
        `*_label_encoded` entries are integer codes from `label_encoder`.

    Raises:
        ValueError: from LabelEncoder.transform if `df['ProteinClass']`
            contains a class outside the fixed label list below.
    """
    # Define X and y
    y = df['ProteinClass']
    X = df.drop(['Entry', 'ProteinClass'], axis=1)

    # Split the data (fixed seed so the split is reproducible)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Feature scaling (important for logistic regression, gradient boosting, MLP, etc.)
    # Fit on the training split ONLY; everything else is merely transformed.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    X_scaled = scaler.transform(X)

    # Defined labels
    labels = np.array(['ATPase', 'Aquaporin', 'Channel', 'GPCR', 'Integrin', 'MHC',
       'Phosphatase', 'Protease', 'RTK', 'Ser:Thr'])
    # Fit the encoder on the full, fixed label list so the integer codes do not
    # depend on which classes happen to appear in a given split.
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)
    # Transform the full, training and testing labels
    y_label_encoded = label_encoder.transform(y)
    y_train_label_encoded = label_encoder.transform(y_train)
    y_test_label_encoded = label_encoder.transform(y_test)

    return (label_encoder, X.values, X_scaled, X_train, X_train_scaled, X_test, X_test_scaled,
            y, y_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)


def processing_data_evaluation(df,scaler):
    """Prepare the unlabelled evaluation dataset for prediction.

    Args:
        df: DataFrame with an 'Entry' column plus the same feature columns
            the scaler was fitted on.
        scaler: An ALREADY FITTED scaler (the one returned by
            `processing_data`). It is only applied here, never refitted:
            refitting on the evaluation data would scale it with different
            statistics than the training data and would also overwrite the
            fitted state of the shared scaler object.

    Returns:
        (X.values, X_scaled): the raw feature array and its scaled version.
    """
    # Define X
    X = df.drop(['Entry'], axis=1)

    # Feature scaling with the training-time statistics
    X_scaled = scaler.transform(X)

    return (X.values, X_scaled)


In [None]:
# Unpack the 14 values returned by processing_data.
# NOTE: `X_train` is rebound here to the training split of `Dataset`; it no
# longer refers to the embedding matrix that the earlier cells also named X_train.
(label_encoder, X, X_scaled, X_train, X_train_scaled, X_test, X_test_scaled, y, y_label_encoded,
y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)  = processing_data(Dataset)

# Evaluation data
# The scaler returned above is passed in so both datasets go through the same scaler object.
X_evaluation, X_scaled_evaluation = processing_data_evaluation(Dataset_evaluation, scaler)

# Creating Model

In [None]:
# random model
class random_model:
    """Baseline that ignores its input and predicts a random class index.

    Args:
        n_classes: Number of classes; predictions are integers drawn
            uniformly from [0, n_classes). Defaults to 10, the number of
            protein classes used in this notebook.
    """

    def __init__(self, n_classes=10):
        self.n_classes = n_classes

    def fit(self, X, y):
        """No-op: there is nothing to learn. Returns self, as sklearn estimators do."""
        return self

    def predict(self, X):
        """Return one random class index per sample.

        The sample count is taken from len(X), which is the number of rows
        for arrays, lists and DataFrames alike. Iterating `for x in X` would
        walk over column labels for a DataFrame and return one prediction per
        column instead of one per row.
        """
        return np.random.randint(self.n_classes, size=len(X))

    def get_params(self, deep=True):
        """Constructor parameters, so sklearn utilities can clone the model."""
        return {"n_classes": self.n_classes}

    def set_params(self, **params):
        """Set constructor parameters (counterpart of get_params)."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

# Initialize my random model (baseline)
my_model = random_model()
# Initialize Logistic Regression model with default hyper-parameters
lr_model = LogisticRegression()

## To Do ##
# Create 2 more models here, like random forest, linear model, etc., and initialize them
# You can use sklearn, torch, etc.

# Train Model

In [None]:
# Fit My Model to the training data (the random baseline ignores its inputs)
my_model.fit(X_train, y_train)

# Fit the model to the training data without tuning
# Trained on the string labels in y_train, so its predictions are class-name strings.
lr_model.fit(X_train_scaled, y_train)

## To Do ##
# Fit your models to the training data; you can use
# raw, scaled, or other forms

# Training and Tuning

In [None]:
# Define the search space: the regularisation strength 'C' is sampled
# log-uniformly; the remaining entries are fixed or small discrete choices.
# NOTE: the `multi_class` option is intentionally not set. It is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multi-class
# problem is already fitted with the multinomial loss by default.
param_dist_lr = {
    'C': loguniform(1e-3, 1e3),
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [500, 1000, 2000],
    'random_state': [42]
}

# Set up RandomizedSearchCV: 50 sampled configurations, 5-fold CV, accuracy scoring.
random_search_lr = RandomizedSearchCV(
    estimator=lr_model,
    param_distributions=param_dist_lr,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# Fit the model to the training data with tuning
random_search_lr.fit(X_train_scaled, y_train)
# Get the best estimator
best_lr_model = random_search_lr.best_estimator_
print("Best parameters found for Logistic Regression:")
print(random_search_lr.best_params_)

## To Do ##
# Fit your models to the training data you can use
# raw, scaled, or other forms
# Make sure to do tuning this time

# Evaluating model

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Predict on a labelled test set, print the accuracy, return the predictions.

    Args:
        model: Object exposing `predict(X)`.
        X_test: Features passed to `model.predict`.
        y_test: Ground-truth labels, in the same encoding the model predicts.
        model_name: Name shown in the printed header.

    Returns:
        Whatever `model.predict(X_test)` returned.
    """
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)

    header = f"\n=== {model_name} ==="
    summary = f"Accuracy for testing data: {score:.4f}"
    print(header, summary, sep="\n")

    return predictions

def evaluate_model_testing(model, X_test, model_name):
    """Predict on unlabelled data and return the predictions.

    No metric is computed because there are no labels; only a header and a
    confirmation line are printed.

    Args:
        model: Object exposing `predict(X)`.
        X_test: Features passed to `model.predict`.
        model_name: Name shown in the printed header.

    Returns:
        Whatever `model.predict(X_test)` returned.
    """
    predictions = model.predict(X_test)
    print(f"\n=== {model_name} ===", "Output Evaluated", sep="\n")
    return predictions

In [None]:
# Evaluate my model
# The random baseline emits integer class indices, so it is scored against the
# label-encoded targets.
y_pred_my_model = evaluate_model(my_model, X_test_scaled, y_test_label_encoded, model_name="My Random Model")
# Convert back to labels
y_pred_my_model = label_encoder.inverse_transform(y_pred_my_model)

# Evaluate Logistic Regression
# Fitted on the string labels (y_train), so it is scored against the string y_test.
y_pred_lr_model = evaluate_model(lr_model, X_test_scaled, y_test, model_name="Logistic Regression")


# Evaluate Logistic Regression Best estimator
y_pred_best_lr_model = evaluate_model(best_lr_model, X_test_scaled, y_test, model_name="Logistic Regression Best")



## To Do ##
# Evaluate all models


## Evaluate the provided data for the competition

In [None]:
# Evaluate my model
# NOTE: this rebinds `y_pred_my_model` (previously the test-split predictions)
# to integer class indices for the competition data.
y_pred_my_model = evaluate_model_testing(my_model, X_evaluation, model_name="My Random Model")
# Convert back to labels
y_pred_my_model_evaluation = label_encoder.inverse_transform(y_pred_my_model)


# Evaluate Logistic Regression
# The logistic-regression models receive the scaled evaluation features; their
# outputs are already class-name strings, so no inverse_transform is applied.
y_pred_lr_model_evaluation = evaluate_model_testing(lr_model, X_scaled_evaluation, model_name="Logistic Regression")


# Evaluate Logistic Regression Best Model
y_pred_lr_best_model_evaluation = evaluate_model_testing(best_lr_model, X_scaled_evaluation, model_name="Logistic Regression Best")

## To Do ##
# Evaluate all the models with the competition data


In [None]:
# First five predictions of the random baseline on the competition data
print("\n=== My Model ===")
print(y_pred_my_model_evaluation[:5])

# First five predictions of the untuned Logistic Regression on the competition data
print("\n=== Logistic Regression ===")
print(y_pred_lr_model_evaluation[:5])

# First five predictions of the tuned (best) Logistic Regression on the competition data
print("\n=== Logistic Regression Best ===")
print(y_pred_lr_best_model_evaluation[:5])

## To Do ##
# Print the first 5 examples of every model's output for the competition data
# Make sure that the output is converted back into a list of strings for the protein class

# Create CSV file for Evaluation

In [None]:
# Function to create the CSV output for uploading
def save_predictions(_fn, _y_pred, _df):
    """Write predictions to a two-column CSV file (Entry, ProteinClass).

    Args:
        _fn: Path of the CSV file to create (overwritten if it exists).
        _y_pred: Sequence of predicted class labels, one per row of `_df`.
        _df: DataFrame with an 'Entry' column, in the same row order as `_y_pred`.

    Raises:
        ValueError: if the number of predictions differs from the number of
            entries. `zip` would otherwise silently drop the surplus rows and
            produce an incomplete submission file.
    """
    import csv

    entries = list(_df['Entry'])
    predictions = list(_y_pred)
    if len(entries) != len(predictions):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(entries)} entries"
        )

    # newline='' is what the csv module asks for, so the writer alone controls
    # the line endings on every platform.
    with open(_fn, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['Entry', 'ProteinClass'])
        writer.writerows(zip(entries, predictions))

In [None]:
# Saving My random model output
# Dataset_evaluation supplies the 'Entry' column written next to each prediction.
save_predictions('Student_name_attempt_1.csv', y_pred_my_model_evaluation, Dataset_evaluation )
# Saving LR model output
save_predictions('Student_name_attempt_2.csv', y_pred_lr_model_evaluation, Dataset_evaluation )
# Saving Best LR model output
save_predictions('Student_name_attempt_3.csv', y_pred_lr_best_model_evaluation, Dataset_evaluation )

## To Do ##
# Save the output of your models

In [None]:
## To Do ##
# Make sure to upload at least 3 attempts to the website
# Good Luck