In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset.
# The path is a raw string: in a normal literal the backslashes form invalid
# escape sequences (\G, \H, \R), which Python flags with a warning and plans
# to reject outright. The resulting path value is unchanged.
file_path = r'D:\Github portfolio\Hemophilia\Hemophilia-Severity-Predictor\RNN_dataset_creation\RNN_dataset.csv'
data = pd.read_csv(file_path)

# Inspect the first rows to confirm the expected columns were parsed.
print(data.head())


    Type    Effect  cDNA  Amino acid (HGVS)  Amino acid (Legacy)  \
0  Point  Missense   216                 72                   53   
1  Point  Missense     1                  1                  -19   
2  Point  Missense     1                  1                  -19   
3  Point  Missense     1                  1                  -19   
4  Point  Missense     1                  1                  -19   

           Domain Locationingene Severity nitroBaseBef nitroBaseAft  pos  \
0              A1         Exon 2     Mild            A            C   72   
1  Signal Peptide         Exon 1   Severe            A            G    1   
2  Signal Peptide         Exon 1   Severe            A            G    1   
3  Signal Peptide         Exon 1   Severe            A            G    1   
4  Signal Peptide         Exon 1   Severe            A            G    1   

   wrf_encoded  new_encoded    p  f_encoded  l_encoded seqBefore seqAfter  
0            2            2  216          0          1    

In [3]:
# Pull the three columns this notebook works with out of the frame as arrays:
# 'seqBefore' and 'seqAfter' are the model inputs, 'Severity' is the target.
seq_before, seq_after, severity = (
    data[column].values for column in ('seqBefore', 'seqAfter', 'Severity')
)

# Encode the sequences (you might need to tokenize them properly based on your data)
# For simplicity, here we assume each character represents an element in the sequence
def encode_sequence(seq):
    """Turn a sequence string into a list of integer code points.

    Args:
        seq: The sequence text. ``None`` or a float NaN (what pandas yields
            for an empty CSV cell) is treated as an empty sequence.

    Returns:
        A list with ``ord(char)`` for every character of ``seq``; an empty
        list when ``seq`` is empty or missing.
    """
    # NaN is the only float that is not equal to itself; iterating over it
    # would otherwise raise "TypeError: 'float' object is not iterable".
    if seq is None or (isinstance(seq, float) and seq != seq):
        return []
    return [ord(char) for char in seq]

# Convert every raw sequence into its list of integer codes.
seq_before_encoded = list(map(encode_sequence, seq_before))
seq_after_encoded = list(map(encode_sequence, seq_after))

# Right-pad both sets to one common length: the longest sequence found in
# either column.
max_len = max(
    max(map(len, seq_before_encoded)),
    max(map(len, seq_after_encoded)),
)
seq_before_padded = pad_sequences(seq_before_encoded, maxlen=max_len, padding='post')
seq_after_padded = pad_sequences(seq_after_encoded, maxlen=max_len, padding='post')

# Encode the severity labels as integer class ids
# (label_encoder.classes_ holds the id -> label mapping).
label_encoder = LabelEncoder()
severity_encoded = label_encoder.fit_transform(severity)
# OneHotEncoder expects a 2-D (n_samples, n_features) array, so add a
# trailing feature axis to the 1-D id vector.
severity_encoded = np.expand_dims(severity_encoded, axis=-1)

# One-hot encode the severity labels into a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older installations.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
severity_one_hot = one_hot_encoder.fit_transform(severity_encoded)

# Place each sample's padded "before" and "after" sequences side by side so
# the model receives them as one long sequence (they could also be fed
# separately).
combined_sequences = np.hstack((seq_before_padded, seq_after_padded))

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    combined_sequences,
    severity_one_hot,
    test_size=0.2,
    random_state=42,
)


KeyError: 'severity'

In [None]:
# Build the RNN model

# Token ids are ord() code points, so the embedding table must have a row for
# the largest id present. The floor of 256 keeps the original table size for
# plain ASCII data while preventing out-of-range lookups for wider characters.
vocab_size = max(256, int(combined_sequences.max()) + 1)
# One output unit per severity class, taken from the one-hot label width
# instead of being hard-coded.
num_classes = y_train.shape[1]

model = Sequential()
# Declare the fixed sequence length with an explicit Input layer; the
# Embedding `input_length` argument is deprecated in Keras 3.
model.add(tf.keras.Input(shape=(combined_sequences.shape[1],)))
model.add(Embedding(input_dim=vocab_size, output_dim=64))
# return_sequences=False: only the final hidden state feeds the classifier.
model.add(SimpleRNN(units=128, return_sequences=False))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model (categorical cross-entropy matches the one-hot labels)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 20% of the training data for validation
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

In [None]:
# Score the trained network on the held-out test split. The model was
# compiled with a single 'accuracy' metric, so evaluate() yields the loss
# followed by the accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}', f'Test Accuracy: {accuracy}', sep='\n')

# Write the trained model to disk.
model.save('rnn_model.h5')
