<a href="https://colab.research.google.com/github/NurbolotAlt/notesapp/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import subprocess

# Clone the project repository once and work from inside it.
# The guards make this cell safe to re-run: executing `git clone` + `%cd`
# a second time from inside the checkout creates a nested clone
# (.../Bio_informatics_Final_project/Bio_informatics_Final_project).
REPO_URL = "https://github.com/Hantz509/Bio_informatics_Final_project"
REPO_DIR = "Bio_informatics_Final_project"

if os.path.basename(os.getcwd()) != REPO_DIR:
    if not os.path.isdir(REPO_DIR):
        # check=True surfaces a failed clone instead of silently continuing.
        subprocess.run(["git", "clone", REPO_URL], check=True)
    os.chdir(REPO_DIR)
print(os.getcwd())

Cloning into 'Bio_informatics_Final_project'...
remote: Enumerating objects: 10374, done.[K
remote: Counting objects:  33% (1/3)[Kremote: Counting objects:  66% (2/3)[Kremote: Counting objects: 100% (3/3)[Kremote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 10374 (delta 0), reused 0 (delta 0), pack-reused 10371 (from 1)[K
Receiving objects: 100% (10374/10374), 88.56 MiB | 15.87 MiB/s, done.
Resolving deltas: 100% (1243/1243), done.
Updating files: 100% (10319/10319), done.
/content/Bio_informatics_Final_project/Bio_informatics_Final_project


**Loading the raw PSSM files**

In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def parse_pssm_file(file_path, max_len = 200):
    """Extract the 20 score columns of every data row in a PSSM text file.

    A line is treated as a data row when its first non-blank character is a
    digit and it has at least 22 whitespace-separated tokens. The first two
    tokens are skipped and the following 20 tokens are parsed as floats.
    Rows that start with a digit but are too short or contain non-numeric
    score tokens are reported with ``print`` and skipped.

    Args:
        file_path: Path of the PSSM file to read.
        max_len: Currently unused; kept so existing callers that pass it
            keep working. No truncation or padding happens here.

    Returns:
        A NumPy array of shape ``(n_rows, 20)``. If no valid row is found a
        warning is printed and ``np.zeros((1, 20))`` is returned instead.
    """
    pssm_matrix = []
    # Stream the file line by line instead of materialising it with readlines().
    with open(file_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines and lines not starting with a digit (headers, column
            # titles, ...) are ignored silently, as before.
            if not line or not line[0].isdigit():
                continue
            parts = line.split()
            if len(parts) < 22:
                print(f"Skipping short line: {line}")
                continue
            try:
                # With >= 22 tokens this slice always holds exactly 20 items,
                # so no separate length check is needed.
                scores = [float(x) for x in parts[2:22]]
            except ValueError:
                print(f"Skipping line with non-numeric entries: {line}")
                continue
            pssm_matrix.append(scores)
    if not pssm_matrix:
        print(f"Warning: Empty PSSM matrix for {file_path}")
        # Placeholder row so downstream padding still receives a 2-D array.
        return np.zeros((1, 20))
    return np.array(pssm_matrix)

# Collect the path of every *.pssm file in the output folder of the repository.
pssm_dir = "pssm_outputs/"
pssm_files = [
    os.path.join(pssm_dir, entry)
    for entry in os.listdir(pssm_dir)
    if entry.endswith(".pssm")
]
print(f"Found {len(pssm_files)} PSSM files.")

# Parse each file into an (n_rows, 20) score matrix, keeping the listing order.
X_matrices = [parse_pssm_file(pssm_path) for pssm_path in pssm_files]

print(f"Processed {len(X_matrices)} PSSM files.")

Found 1063 PSSM files.
Skipping short line: 410     0.2670
Skipping short line: 70
Skipping short line: 70
Skipping short line: 0   0   0   0   0   0   0   0   0   0   0   0  0.00     0.00
Skipping short line: 70
Skipping short line: 0.2670
Skipping short line: 0.2670
Skipping short line: 410     0.2670
Skipping short line: 70
Skipping short line: 70
Skipping short line: 0.2670
Skipping short line: 0.0410     0.2670
Skipping short line: 0.2670
Skipping short line: 0.1303     0.3153
Skipping short line: 0.2670
Skipping short line: 0.3187
Skipping short line: 70
Skipping short line: 0   0   0  0.75 0.00
Skipping short line: 0.0410     0.2670
Skipping short line: 0.2670
Skipping short line: 0.2670
Skipping short line: 70
Skipping short line: 410     0.2670
Skipping short line: 0   0   0   0   0   0   0   0   0   0   0   0   0   0  0.00     0.00
Skipping short line: 0   0   0   0  14  0.35 0.02
Skipping short line: 0   0  58  23   0   0   0   0   0   0   0   0  19  0.39 0.02
Skipping short

**Padding**

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every matrix to the row count of the longest one; shorter matrices are
# padded at the end ('post') so all samples share one shape.
row_counts = [m.shape[0] for m in X_matrices]
max_len = max(row_counts)
X_padded = pad_sequences(
    X_matrices,
    maxlen=max_len,
    dtype='float32',
    padding='post',
    truncating='post',
)
print(f"Padded input shape: {X_padded.shape}")

Padded input shape: (1063, 4660, 20)


**Getting labels (y)**

In [None]:
import os

# Derive one label per PSSM file from its file name (1 = SNARE, 0 = non-SNARE).
#
# The labels are built from the same `pssm_files` list that produced
# `X_matrices` in the PSSM-loading cell instead of from a second os.listdir()
# call: os.listdir() returns entries in arbitrary order, so reusing the list is
# what guarantees that y[i] belongs to X_matrices[i].
NON_SNARE_TAGS = ('nonsnare', 'non-snare', 'non_snare')

y = []
for file_path in pssm_files:
    file_name = os.path.basename(file_path).lower()
    if any(tag in file_name for tag in NON_SNARE_TAGS):
        # Checked first: these names also contain the substring 'snare' and
        # would otherwise be labelled as positives.
        label = 0
    elif 'snare' in file_name:
        label = 1
    elif 'non' in file_name:
        label = 0
    else:
        raise ValueError(f"Cannot determine label for file: {file_name}")
    y.append(label)

# Fail early if features and labels ever get out of sync.
assert len(y) == len(X_matrices), "labels and PSSM matrices are out of sync"

print(f"Labels generated: {sum(y)} SNARE, {len(y)-sum(y)} non-SNARE")
y = np.array(y)

Labels generated: 540 SNARE, 523 non-SNARE


**SVM and kNN baselines on the flattened raw PSSM matrices (lower accuracy; exploratory, can be skipped)**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Flatten each padded 2-D matrix into a single feature vector per sample.
X_flat = X_padded.reshape(X_padded.shape[0], -1)

X_train, X_test, y_train, y_test = train_test_split(
    X_flat, y, test_size=0.2, random_state=42
)

# Baseline 1: linear-kernel support vector machine.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_preds)
print("SVM Accuracy:", svm_accuracy)

# Baseline 2: 3-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_preds)
print("kNN Accuracy:", knn_accuracy)

SVM Accuracy: 0.8403755868544601
kNN Accuracy: 0.7417840375586855


**SVM and kNN on the precomputed feature CSV (`final_features.csv`)**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the precomputed feature table; `id` and `label` are not features.
csv_file = "final_features.csv"
data = pd.read_csv(csv_file)

# CSV-specific names on purpose: the CNN cells further down train on `X_padded`
# together with the PSSM labels stored in `y`. Rebinding `y` to the CSV labels
# here would silently hand those cells labels from a different dataset (or make
# them fail on a length mismatch) when the notebook is run top to bottom.
X_csv = data.drop(columns=['id', 'label']).values
y_csv = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X_csv, y_csv, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only so no test statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel SVM on the standardised features.
svm = SVC(kernel='rbf', C=1, gamma='scale')
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)
print("=== SVM Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# 5-nearest-neighbours on the same standardised features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)
print("\n=== kNN Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

=== SVM Results ===
Accuracy: 0.9409571162212554
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1462
           1       0.98      0.36      0.53       147

    accuracy                           0.94      1609
   macro avg       0.96      0.68      0.75      1609
weighted avg       0.94      0.94      0.93      1609


=== kNN Results ===
Accuracy: 0.930391547545059
              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1462
           1       0.83      0.30      0.44       147

    accuracy                           0.93      1609
   macro avg       0.88      0.65      0.70      1609
weighted avg       0.92      0.93      0.92      1609



**CNN**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input

# Guard against stale kernel state: `y` must hold the PSSM labels that belong
# to `X_padded`, one label per padded matrix.
assert len(X_padded) == len(y), (
    f"X_padded has {len(X_padded)} samples but y has {len(y)} labels; "
    "re-run the PSSM label cell before training."
)

# Hold out a shuffled, class-stratified 20% for validation.
# Keras' `validation_split` takes the *last* 20% of the arrays before any
# shuffling, and the samples here follow directory-listing order, so that tail
# slice is not guaranteed to contain both classes.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_padded, y, test_size=0.2, stratify=y, random_state=42
)

# Single-branch 1-D CNN over the (positions, 20) score matrix.
# An explicit Input layer replaces `input_shape=` on the first Conv1D, which
# current Keras versions warn about in Sequential models.
model = Sequential([
    Input(shape=(X_padded.shape[1], X_padded.shape[2])),
    Conv1D(64, 3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # probability of the positive class
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_fit, y_fit, epochs=10, batch_size=32, validation_data=(X_val, y_val))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 447ms/step - accuracy: 0.5949 - loss: 3.4423 - val_accuracy: 0.8732 - val_loss: 0.6860
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 400ms/step - accuracy: 0.8903 - loss: 0.2903 - val_accuracy: 0.8638 - val_loss: 0.3779
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 420ms/step - accuracy: 0.9577 - loss: 0.1118 - val_accuracy: 0.9061 - val_loss: 0.2806
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 419ms/step - accuracy: 0.9987 - loss: 0.0358 - val_accuracy: 0.9108 - val_loss: 0.2657
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 372ms/step - accuracy: 1.0000 - loss: 0.0195 - val_accuracy: 0.9014 - val_loss: 0.2642
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 604ms/step - accuracy: 1.0000 - loss: 0.0096 - val_accuracy: 0.9014 - val_loss: 0.2652
Epoch 7/10
[1m27/27[

<keras.src.callbacks.history.History at 0x7dbead44a450>

**mCNN**

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Concatenate, Dropout

from sklearn.model_selection import train_test_split

# Guard against stale kernel state: `y` must hold the PSSM labels that belong
# to `X_padded`, one label per padded matrix.
assert len(X_padded) == len(y), (
    f"X_padded has {len(X_padded)} samples but y has {len(y)} labels; "
    "re-run the PSSM label cell before training."
)

# Hold out a shuffled, class-stratified 20% for validation.
# Keras' `validation_split` takes the *last* 20% of the arrays before any
# shuffling; with samples in directory-listing order that tail can be dominated
# by a single class, which makes the reported validation accuracy meaningless.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_padded, y, test_size=0.2, stratify=y, random_state=42
)


def _conv_branch(inputs, kernel_size):
    """Conv1D(64, kernel_size, 'same' padding) -> MaxPooling1D(2) -> Flatten."""
    conv = Conv1D(64, kernel_size, activation='relu', padding='same')(inputs)
    pooled = MaxPooling1D(2)(conv)
    return Flatten()(pooled)


input_layer = Input(shape=(X_padded.shape[1], X_padded.shape[2]))  # (max_len, 20)

# Three parallel branches that differ only in kernel width (3, 5 and 7),
# concatenated into one feature vector.
merged = Concatenate()([_conv_branch(input_layer, k) for k in (3, 5, 7)])

dense = Dense(128, activation='relu')(merged)
dropout = Dropout(0.5)(dense)
output = Dense(1, activation='sigmoid')(dropout)  # probability of the positive class

model = Model(inputs=input_layer, outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_fit, y_fit, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 2s/step - accuracy: 0.7033 - loss: 17.2546 - val_accuracy: 0.0000e+00 - val_loss: 4.3995
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 2s/step - accuracy: 0.7551 - loss: 0.5901 - val_accuracy: 0.1268 - val_loss: 1.3326
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 2s/step - accuracy: 0.7922 - loss: 0.6956 - val_accuracy: 0.0000e+00 - val_loss: 2.1421
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2s/step - accuracy: 0.8141 - loss: 0.5143 - val_accuracy: 0.0000e+00 - val_loss: 2.2587
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step - accuracy: 0.8087 - loss: 0.5063 - val_accuracy: 0.0047 - val_loss: 1.8805
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2s/step - accuracy: 0.8301 - loss: 0.4049 - val_accuracy: 0.0047 - val_loss: 1.8362
Epoch 7/10
[1m27/27[0m [

<keras.src.callbacks.history.History at 0x7dbe94018250>