In [None]:
import pandas as pd

df = pd.read_csv("protein.csv")
print("Original shape:", df.shape)

# Keep only needed columns
df = df[["seq", "sst3"]].dropna()

# Ensure lengths match
df = df[df["seq"].str.len() == df["sst3"].str.len()]

# Have used only  the first N proteins to save RAM
N_PROTEINS = 400   # you can adjust: 200, 300, 400 depending on speed
df_small = df.iloc[:N_PROTEINS].reset_index(drop=True)

print("After cleaning, total proteins:", df.shape[0])
print("Subsampled proteins used:", df_small.shape[0])
print(df_small.head())


Original shape: (15079, 12)
After cleaning, total proteins: 15079
Subsampled proteins used: 400
                                                 seq  \
0  AAPANAVTADDPTAIALKYNQDATKSERVAAARPGLPPEEQHCANC...   
1     TTCCPSIVARSNFNVCRLPGTPEALCATYTGCIIIPGATCPGDYAN   
2  MAKWVCKICGYIYDEDAGDPDNGISPGTKFEELPDDWVCPICGAPK...   
3  NKASVVANQLIPINTALTLIMMKAEVVTPMGIPAEEIPKLVGMQVN...   
4  ATGGYVQQATGQASFTMYSGCGSPACGKAASGFTAAINQLAFGSAP...   

                                                sst3  
0  CCCCCECCCCCHHHHHHCCECCHHHCCHHHHCCCCCCHHHCCHHHE...  
1     CEECCCHHHHHHHHHHHCCCCCHHHHHHHHCCEECCCCCCCCCCCC  
2  CCEEEECCCCCEEECCCCEHHHCECCCCCHHHCCCCCECCCCCCEH...  
3  CCCEEEECCCECCCCECCHHHEEEECCCCCCCEHHHHHHHCCCEEC...  
4  CHHHCCCCCEEEEEEEEECCCCCCCCCCCECCCEEEEEHHHHCCCC...  


In [3]:
import numpy as np

AA_LIST = "ACDEFGHIKLMNPQRSTVWY"
WINDOW_SIZE = 7   # smaller than 11 to reduce feature size
HALF = WINDOW_SIZE // 2

def seq_to_onehot(seq):
    aa_to_idx = {aa: i for i, aa in enumerate(AA_LIST)}
    mat = np.zeros((len(seq), len(AA_LIST)), dtype=np.float32)
    for i, aa in enumerate(seq):
        if aa in aa_to_idx:
            mat[i, aa_to_idx[aa]] = 1.0
    return mat

def create_windows_and_labels(seq, ss, window=WINDOW_SIZE, step=2):
    """
    step: use every 'step' residue to lower number of samples
    e.g., step=2 takes every second residue
    """
    X, y = [], []
    onehot = seq_to_onehot(seq)
    L = len(seq)
    for i in range(HALF, L - HALF, step):
        w = onehot[i - HALF : i + HALF + 1].flatten()
        X.append(w)
        y.append(ss[i])
    return np.array(X), np.array(y)

X_list, y_list = [], []

for _, row in df_small.iterrows():
    seq = row["seq"]
    ss  = row["sst3"]
    if len(seq) < WINDOW_SIZE:
        continue
    X_w, y_w = create_windows_and_labels(seq, ss, WINDOW_SIZE, step=2)  # step=2 reduces samples
    X_list.append(X_w)
    y_list.append(y_w)

X = np.vstack(X_list)
y = np.hstack(y_list)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (36936, 140)
y shape: (36936,)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])
print("Classes:", le.classes_)


Train size: 29548 Test size: 7388
Classes: ['C' 'E' 'H']


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr = LogisticRegression(
    max_iter=300,    # you can increase later if needed
    n_jobs=-1
)

lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
print("Logistic Regression accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification report:")
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))


Logistic Regression accuracy: 0.5916350839198701

Classification report:
              precision    recall  f1-score   support

           C       0.63      0.68      0.65      3035
           E       0.54      0.47      0.50      1952
           H       0.58      0.58      0.58      2401

    accuracy                           0.59      7388
   macro avg       0.58      0.58      0.58      7388
weighted avg       0.59      0.59      0.59      7388



In [6]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(
    hidden_layer_sizes=(64,),
    activation='relu',
    solver='adam',
    max_iter=15,    # keep small to avoid long training
    random_state=42
)

mlp.fit(X_train, y_train)

y_pred_mlp = mlp.predict(X_test)
print("MLP accuracy:", accuracy_score(y_test, y_pred_mlp))
print("\nMLP classification report:")
print(classification_report(y_test, y_pred_mlp, target_names=le.classes_))


MLP accuracy: 0.6111261505143476

MLP classification report:
              precision    recall  f1-score   support

           C       0.62      0.71      0.66      3035
           E       0.58      0.48      0.53      1952
           H       0.62      0.60      0.61      2401

    accuracy                           0.61      7388
   macro avg       0.61      0.59      0.60      7388
weighted avg       0.61      0.61      0.61      7388



