In [1]:
import pandas as pd
import string
import nltk
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve, accuracy_score
from sklearn.metrics import roc_auc_score

import warnings

warnings.filterwarnings('ignore')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
test_data = pd.read_csv('/content/drive/MyDrive/Agent_LLM_paper/drugsComTest_raw.tsv',sep = '\t')
train_data = pd.read_csv('/content/drive/MyDrive/Agent_LLM_paper/drugsComTrain_raw.tsv',sep = '\t')

In [5]:
test_data['label'] = test_data['rating'].apply(lambda x: 0 if x <= 5 else 1)
train_data['label'] = train_data['rating'].apply(lambda x: 0 if x <= 5 else 1)

In [6]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,label
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,1
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,0
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,1


In [7]:
test_data['clean_review'] = test_data['review'].str.lower()
train_data['clean_review'] = train_data['review'].str.lower()

In [8]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

train_data['clean_review'] = train_data['clean_review'].apply(lambda x: remove_stopwords(x))
train_data['clean_review'] = train_data['clean_review'].apply(lambda x: remove_spl_chars(x))

test_data['clean_review'] = test_data['clean_review'].apply(lambda x: remove_stopwords(x))
test_data['clean_review'] = test_data['clean_review'].apply(lambda x: remove_spl_chars(x))

In [9]:
df_total = pd.concat([train_data, test_data], axis=0)

In [10]:
max_length = df_total.clean_review.apply(lambda x: len(x.split())).max()

t = Tokenizer()
t.fit_on_texts(df_total.clean_review)
vocab_size = len(t.word_index) + 1

In [11]:
test_encoded = t.texts_to_sequences(test_data.clean_review)
test_padded = pad_sequences(test_encoded, maxlen=max_length, padding='post')

train_encoded = t.texts_to_sequences(train_data.clean_review)
train_padded = pad_sequences(train_encoded, maxlen=max_length, padding='post')

# GLOBAL VECTOR

In [12]:
embeddings_index = dict()
f = open('/content/drive/MyDrive/Agent_LLM_paper/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [13]:
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [14]:
x_train, x_test, y_train, y_test = train_padded,test_padded, train_data['label'], test_data['label']

## 1. Bi-directional LSTM

In [17]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 300, input_length=max_length, weights=[embedding_matrix]))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(80)))
model.add(tf.keras.layers.Dense(80, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
model.fit(x_train, y_train, epochs = 5)

Epoch 1/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m712s[0m 140ms/step - accuracy: 0.8059 - loss: 0.4232
Epoch 2/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m709s[0m 141ms/step - accuracy: 0.9052 - loss: 0.2311
Epoch 3/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m708s[0m 140ms/step - accuracy: 0.9514 - loss: 0.1295
Epoch 4/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m708s[0m 141ms/step - accuracy: 0.9758 - loss: 0.0682
Epoch 5/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m709s[0m 141ms/step - accuracy: 0.9875 - loss: 0.0367


<keras.src.callbacks.history.History at 0x7e473014b220>

In [19]:
y_pred = model.predict(x_test)
target_names = ['negative', 'positive']
y_predictions = (y_pred > 0.5).astype(int)
predictions = pd.DataFrame(y_predictions, columns=['Prediction'])
predictions.reset_index(inplace=True)
predictions.columns = ['Index', 'Prediction']
auc = roc_auc_score(y_test, y_pred)
print(f"L'AUC est de : {auc}")
print(classification_report(y_test, predictions['Prediction'], target_names=target_names))

[1m1681/1681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 62ms/step
L'AUC est de : 0.9578571424663923
              precision    recall  f1-score   support

    negative       0.88      0.87      0.87     16207
    positive       0.94      0.95      0.95     37559

    accuracy                           0.92     53766
   macro avg       0.91      0.91      0.91     53766
weighted avg       0.92      0.92      0.92     53766



In [23]:
x_total = np.concatenate((x_train, x_test), axis=0)

In [24]:
y_pred_glove_lstm = model.predict(x_total)

[1m6721/6721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m413s[0m 61ms/step


In [25]:
import pickle
with open('/content/drive/MyDrive/Agent_LLM_paper/y_pred_glove_bilstm.pkl', 'wb') as fichier:
    pickle.dump(y_pred_glove_lstm, fichier)

## 2. CNN

In [None]:
kernel_size = 3
dim = 300

In [None]:
model2 = tf.keras.Sequential()
model2.add(tf.keras.layers.Embedding(vocab_size, dim, weights=[embedding_matrix], input_length=max_length))

model2.add(tf.keras.layers.Conv1D(256, kernel_size, activation='relu'))
model2.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model2.add(tf.keras.layers.Dropout(0.1))
model2.add(tf.keras.layers.Conv1D(128, kernel_size, activation='relu'))
model2.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model2.add(tf.keras.layers.Dropout(0.1))
model2.add(tf.keras.layers.Flatten())
model2.add(tf.keras.layers.Dense(64, activation = 'relu'))
model2.add(tf.keras.layers.Dropout(0.1))
model2.add(tf.keras.layers.Dense(1, activation = 'sigmoid'))

In [None]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model2.fit(x_train, y_train, epochs = 5)

Epoch 1/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 6ms/step - accuracy: 0.8021 - loss: 0.4293
Epoch 2/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 6ms/step - accuracy: 0.8927 - loss: 0.2595
Epoch 3/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 6ms/step - accuracy: 0.9375 - loss: 0.1623
Epoch 4/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 6ms/step - accuracy: 0.9659 - loss: 0.0923
Epoch 5/5
[1m5041/5041[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 6ms/step - accuracy: 0.9787 - loss: 0.0579


<keras.src.callbacks.history.History at 0x789a20186ad0>

In [None]:
y_pred = model2.predict(x_test)

[1m1681/1681[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [None]:
target_names = ['negative', 'positive']
y_predictions = (y_pred > 0.5).astype(int)
predictions = pd.DataFrame(y_predictions, columns=['Prediction'])
predictions.reset_index(inplace=True)
predictions.columns = ['Index', 'Prediction']
auc = roc_auc_score(y_test, y_pred)
print(f"L'AUC est de : {auc}")
print(classification_report(y_test, predictions['Prediction'], target_names=target_names))

L'AUC est de : 0.9598573331521681
              precision    recall  f1-score   support

    negative       0.89      0.85      0.87     16207
    positive       0.93      0.95      0.94     37559

    accuracy                           0.92     53766
   macro avg       0.91      0.90      0.91     53766
weighted avg       0.92      0.92      0.92     53766



## Mean Embedding

In [None]:
def compute_mean_embedding(text_data, tokenizer, embeddings_index, embedding_dim=300):
    embeddings = []
    for text in text_data:
        tokens = tokenizer.texts_to_sequences([text])[0]
        word_embeddings = [embeddings_index.get(tokenizer.index_word[i], np.zeros(embedding_dim))
                           for i in tokens]
        if word_embeddings:
            embeddings.append(np.mean(word_embeddings, axis=0))

        else:
            embeddings.append(np.zeros(embedding_dim))
    return np.array(embeddings)

In [None]:
x_train_mean = compute_mean_embedding(train_data.clean_review, t, embeddings_index)
x_test_mean = compute_mean_embedding(test_data.clean_review, t, embeddings_index)

# 3. DNN

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout

In [None]:
input_dim = x_train_mean.shape[1]

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(input_dim,)))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(x_train_mean,y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b88505f5d80>

In [None]:
y_pred = model.predict(x_test_mean)



In [None]:
target_names = ['negative', 'positive']
y_predictions = (y_pred > 0.5).astype(int)
predictions = pd.DataFrame(y_predictions, columns=['Prediction'])
predictions.reset_index(inplace=True)
predictions.columns = ['Index', 'Prediction']
auc = roc_auc_score(y_test, y_pred)
print(f"L'AUC est de : {auc}")
print(classification_report(y_test, predictions['Prediction'], target_names=target_names))

L'AUC est de : 0.8418188279025356
              precision    recall  f1-score   support

    negative       0.68      0.61      0.64     16207
    positive       0.84      0.87      0.86     37559

    accuracy                           0.79     53766
   macro avg       0.76      0.74      0.75     53766
weighted avg       0.79      0.79      0.79     53766



## Machine learning Algorithms approachs

# 4. RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)

In [None]:
model.fit(x_train_mean, y_train)

In [None]:
model.score(x_train_mean, y_train)

0.99990080410671

In [None]:
y_pred = model.predict(x_test_mean)

In [None]:
target_names = ['negative', 'positive']
y_predictions = (y_pred > 0.5).astype(int)
predictions = pd.DataFrame(y_predictions, columns=['Prediction'])
predictions.reset_index(inplace=True)
predictions.columns = ['Index', 'Prediction']
auc = roc_auc_score(y_test, y_pred)
print(f"L'AUC est de : {auc}")
print(classification_report(y_test, predictions['Prediction'], target_names=target_names))

L'AUC est de : 0.8361389811586094
              precision    recall  f1-score   support

    negative       0.94      0.69      0.80     16207
    positive       0.88      0.98      0.93     37559

    accuracy                           0.89     53766
   macro avg       0.91      0.84      0.86     53766
weighted avg       0.90      0.89      0.89     53766



# 5. ExtraTreeClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
model5 = ExtraTreesClassifier(random_state=0)

In [None]:
model5.fit(x_train_mean, y_train)

In [None]:
model5.score(x_train_mean, y_train)

0.99990080410671

In [None]:
y_pred = model5.predict(x_test_mean)

In [None]:
target_names = ['negative', 'positive']
y_predictions = (y_pred > 0.5).astype(int)
predictions = pd.DataFrame(y_predictions, columns=['Prediction'])
predictions.reset_index(inplace=True)
predictions.columns = ['Index', 'Prediction']
auc = roc_auc_score(y_test, y_pred)
print(f"L'AUC est de : {auc}")
print(classification_report(y_test, predictions['Prediction'], target_names=target_names))

L'AUC est de : 0.8271125763797573
              precision    recall  f1-score   support

    negative       0.97      0.66      0.79     16207
    positive       0.87      0.99      0.93     37559

    accuracy                           0.89     53766
   macro avg       0.92      0.83      0.86     53766
weighted avg       0.90      0.89      0.89     53766

