In [18]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow_text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, GRU, Embedding, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

In [56]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [38]:
data=pd.read_csv("/content/drugs _genestype dataset.csv")
data.head(20)

Unnamed: 0,Drug ID,Drug Name,Gene Type,Adverse Effect,label
0,D001,Warfarin,CYP2C9*3,Increased bleeding risk,0.0
1,D002,Clopidogrel,CYP2C19*2,Reduced drug effectiveness,0.0
2,D003,Codeine,CYP2D6 PM,Lack of pain relief,0.0
3,D004,Abacavir,HLA-B*57:01,Severe hypersensitivity,0.0
4,D005,Fluorouracil,DPYD*2A,Severe toxicity,0.0
5,P001,Atorvastatin,SLCO1B1*1A,Improved cholesterol clearance,1.0
6,P002,Losartan,AGTR1 A1166C,Enhanced blood pressure control,1.0
7,P003,Metoprolol,CYP2D6 UM,Increased drug effectiveness,1.0
8,P004,Simvastatin,HMGCR Variant,Improved lipid response,1.0
9,P005,Aspirin,ITGB3 Leu33Pro,Enhanced antiplatelet effect,1.0


In [20]:
data.shape

(476, 5)

In [25]:
data['processed_text'] = data['Drug Name'].astype(str) + " " + data['Gene Type'].astype(str)
stop_words = set(stopwords.words('english'))
data['processed_text'] = data['processed_text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

In [26]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_text'])

In [27]:
sequences = tokenizer.texts_to_sequences(data['processed_text'])

In [29]:
max_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_length, padding='post')

In [30]:
y = np.array(data['label'])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(units=256, activation='relu', return_sequences=False)))
model.add(Dense(units=1, activation="sigmoid"))



In [33]:
model.compile(optimizer=RMSprop(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
model=model.fit(X_train, y_train, epochs=20, batch_size=8, validation_data=(X_test, y_test))

Epoch 1/20
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 83ms/step - accuracy: 0.6531 - loss: 0.6775 - val_accuracy: 0.7188 - val_loss: nan
Epoch 2/20
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.6133 - loss: 0.6115 - val_accuracy: 0.8646 - val_loss: nan
Epoch 3/20
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.8965 - loss: 0.3339 - val_accuracy: 0.8750 - val_loss: nan
Epoch 4/20
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.9704 - loss: 0.1066 - val_accuracy: 0.8750 - val_loss: nan
Epoch 5/20
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 74ms/step - accuracy: 0.9766 - loss: 0.0866 - val_accuracy: 0.8854 - val_loss: nan
Epoch 6/20
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step - accuracy: 0.9784 - loss: 0.0742 - val_accuracy: 0.8854 - val_loss: nan
Epoch 7/20
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7ae569bcbc50>

In [35]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8490 - loss: nan 
Test Accuracy: 0.8542


In [40]:
model.save('drug_classification_model.h5')



In [54]:
def preprocess_text(text, tokenizer, max_length):
    """ Preprocess input text for prediction. """
    stop_words = set(stopwords.words('english'))

    words = word_tokenize(text)
    processed_text = ' '.join([word for word in words if word.lower() not in stop_words])

    sequence = tokenizer.texts_to_sequences([processed_text])

    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')

    return padded_sequence

new_text = "Warfarin	CYP2C9*3"

processed_input = preprocess_text(new_text, tokenizer, max_length)
prediction = model.predict(processed_input)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


In [55]:
if prediction[0][0]>=0.5:
  print("Can be recommended")
else:
    print("Can't be recommended")

Can't be recommended
