# Case Study: Rate My Answer!

In this notebook, we pre-process answers posted in a health forum and create classifiers that predict whether an answer is posted by a medical expert (physician) or not.

## 1. Import packages and load the data:

In [None]:
%%capture --no-display
!pip install spacy # install spaCy
# !pip install tqdm # install tqdm package to display the progress
# !pip uninstall tensorflow -y
# !pip install tensorflow # install tensorflow for deep learning
!pip install keras # install keras for deep learning
!pip install googletrans==4.0.0-rc1
!pip install optuna

In [None]:
import os
import pickle
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

# scikit‑learn
import sklearn
from sklearn import tree
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import losses, layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

# Keras standalone
from keras.models import Model, Sequential
from keras.layers import (
    LSTM, Activation, Dense, Dropout, Input,
    Embedding, Normalization, BatchNormalization,
    Bidirectional
)
from keras.callbacks import EarlyStopping
from keras.preprocessing import sequence
from keras import layers as keras_layers
from keras.backend import clear_session
from keras.optimizers import *
from keras.utils import pad_sequences

# spaCy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy

# Hyper-parameter search
import optuna

# Hugging Face Transformers (MarianMT models used for back-translation)
from transformers import MarianMTModel, MarianTokenizer

# custom utilities
from custom_funcs import *


In [None]:
# Both files are pipe-delimited; read the training and test splits in one pass.
train, test = (
    pd.read_csv(csv_path, sep="|")
    for csv_path in ('train_RateMyAnswer.csv', 'test_RateMyAnswer.csv')
)

## 2. Text Pre-processing

We can use spaCy's powerful tokenizer to parse our text:

In [None]:
# Make sure a spaCy pipeline is available under the name `nlp`.
# NOTE(review): this notebook never creates `nlp` explicitly; if it is not
# already provided (e.g. by `custom_funcs`), fall back to the small English
# pipeline. Confirm that this is the intended model. The fallback requires
# `python -m spacy download en_core_web_sm`.
try:
    nlp
except NameError:
    nlp = spacy.load("en_core_web_sm")

# Corpus-specific words that should be treated as stop words
customize_stop_words = [
    'user', 'answer'
]

# Mark them as stop words in the pipeline's vocabulary
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True


def lemmatize_answer(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens of `text`, joined by spaces."""
    return " ".join(
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and token.is_alpha
    )


tqdm.pandas()  # registers `progress_apply` on pandas objects to display the progress
train['pr_answer'] = train.answer.progress_apply(lemmatize_answer)
test['pr_answer'] = test.answer.progress_apply(lemmatize_answer)

In [None]:
# Peek at the first two rows, including the new `pr_answer` column.
train.head(n=2)

In [None]:
# Number of training rows per label.
sns.catplot(data=train, x="label", kind="count", palette="ch:.25")

## Oversampling the minority class with back-translation

In [None]:
# MarianMT checkpoints for the two translation directions.
EN_TO_FR_CHECKPOINT = 'Helsinki-NLP/opus-mt-en-fr'
FR_TO_EN_CHECKPOINT = 'Helsinki-NLP/opus-mt-fr-en'

# English -> French
tokenizer_en_to_fr = MarianTokenizer.from_pretrained(EN_TO_FR_CHECKPOINT)
model_en_to_fr = MarianMTModel.from_pretrained(EN_TO_FR_CHECKPOINT)

# French -> English
tokenizer_fr_to_en = MarianTokenizer.from_pretrained(FR_TO_EN_CHECKPOINT)
model_fr_to_en = MarianMTModel.from_pretrained(FR_TO_EN_CHECKPOINT)

def back_translate(text):
    """Translate `text` from English to French and back to English.

    Uses the module-level MarianMT tokenizer/model pairs
    (`tokenizer_en_to_fr`/`model_en_to_fr` and `tokenizer_fr_to_en`/`model_fr_to_en`).

    Args:
        text: The English text to paraphrase.

    Returns:
        The first decoded sequence of the French -> English translation.
    """
    def _translate(source, tokenizer, model):
        # Encode (padded and truncated), generate, and decode the first result.
        batch = tokenizer(source, return_tensors="pt", padding=True, truncation=True)
        generated = model.generate(**batch)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    french_text = _translate(text, tokenizer_en_to_fr, model_en_to_fr)
    return _translate(french_text, tokenizer_fr_to_en, model_fr_to_en)


In [None]:
# Augment the label-1 rows with back-translated copies. The result is cached on
# disk because back-translating every row is slow.
if os.path.exists("augmented_train_set.pkl"):
    print("Augmented training set already saved. Loading from pickle...")
    train = pd.read_pickle("augmented_train_set.pkl")
else:
    # Remember the size before augmentation so the report below is meaningful
    # (previously both printed counts were taken after the concat).
    original_count = len(train)

    minority_df = train[train['label'] == 1]

    # One back-translated paraphrase per label-1 row
    augmented_samples = [back_translate(text) for text in minority_df['pr_answer']]

    # Only `pr_answer` and `label` are filled for the synthetic rows; any other
    # column of `train` is left missing for them after the concat.
    augmented_df = pd.DataFrame({
        'pr_answer': augmented_samples,
        'label': [1] * len(augmented_samples)
    })

    train = pd.concat([train, augmented_df], ignore_index=True)

    train.to_pickle("augmented_train_set.pkl")

    print("Augmented training set created and saved.")
    print("Original data count:", original_count)
    print("Balanced data count:", len(train))


In [None]:
# Number of training rows per label after augmentation.
sns.catplot(data=train, x="label", kind="count", palette="ch:.25")

## 3. Deep Learning Algorithm (LSTM):

An LSTM expects its input in a specific format: fixed-length sequences of integer word indices. Therefore, instead of using a document-term matrix (DTM), we tokenize the pre-processed answers and pad or truncate them into sequences that the LSTM accepts as input:

In [None]:
# Model inputs: the pre-processed answer text
X_train = train.pr_answer
X_test = test.pr_answer

le = LabelEncoder()

# Learn the label -> integer mapping from the training labels only ...
Y_train = le.fit_transform(train.label)
Y_train = Y_train.reshape(-1, 1)  # column vector, as fed to the LSTM model

# ... and reuse that same mapping for the test labels. Re-fitting the encoder on
# the test set (as before) could silently produce a different mapping.
Y_test = le.transform(test.label)
Y_test = Y_test.reshape(-1, 1)  # column vector, as fed to the LSTM model

In [None]:
# Count the distinct (lower-cased, whitespace-separated) words in pr_answer:
from collections import Counter

results = Counter()
for answer_tokens in train['pr_answer'].str.lower().str.split():
    results.update(answer_tokens)
print("Number of unique words in pr_answer:", len(results))

In [None]:
# Distribution of the number of words per answer:
words_per_answer = train['pr_answer'].str.lower().apply(lambda answer: len(answer.split()))
words_per_answer.hist(bins=100)

In [None]:
def make_padded_sequences(texts, max_words, max_len):
    """Fit a tokenizer on `texts` and return it with the padded sequence matrix.

    Args:
        texts: Iterable of documents to fit on and convert.
        max_words: `num_words` passed to the Keras `Tokenizer`.
        max_len: Length every sequence is post-padded / truncated to.

    Returns:
        `(tokenizer, seq_matrix)`.
    """
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    seq_matrix = pad_sequences(
        tokenizer.texts_to_sequences(texts), maxlen=max_len, padding='post'
    )
    return tokenizer, seq_matrix


def build_lstm_model(max_words, max_len, n_layers, rnn_units, dropout_rate, learning_rate):
    """Build and compile the stacked bidirectional-LSTM binary classifier.

    The parameter names match the keys of the Optuna search space, so the
    model can be rebuilt with `build_lstm_model(**study.best_params)`.

    Args:
        max_words: Vocabulary size of the embedding layer.
        max_len: Length of the padded input sequences.
        n_layers: Number of Bidirectional(LSTM) blocks.
        rnn_units: Units per LSTM direction.
        dropout_rate: Dropout rate applied after every block.
        learning_rate: Learning rate of the AdamW optimizer.

    Returns:
        A compiled `Sequential` model with one sigmoid output unit.
    """
    model = Sequential()
    model.add(Input(shape=(max_len,)))
    # The Input layer already fixes the sequence length, so the deprecated
    # `input_length` argument is not passed to Embedding.
    model.add(Embedding(input_dim=max_words, output_dim=64))
    for i in range(n_layers):
        # Every block except the last must emit the full sequence for the next LSTM.
        return_seq = (i < n_layers - 1)
        model.add(Bidirectional(LSTM(rnn_units, return_sequences=return_seq)))
        # NOTE(review): `Normalization` is a preprocessing layer whose statistics
        # are normally set via `adapt()`; it is never adapted here.
        # `LayerNormalization`/`BatchNormalization` may have been intended - confirm.
        model.add(Normalization())
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        optimizer=AdamW(learning_rate=learning_rate)
    )
    return model


def objective(trial):
    """Optuna objective: train one candidate model and return its best validation loss.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The minimum `val_loss` observed during training (lower is better).
    """
    # Release the previous trial's Keras state so memory does not grow per trial.
    clear_session()

    max_words = trial.suggest_int('max_words', 5000, 20000, step=500)
    max_len   = trial.suggest_int('max_len',   50,   200,   step=5)
    _, seq_matrix = make_padded_sequences(X_train, max_words, max_len)

    model = build_lstm_model(
        max_words=max_words,
        max_len=max_len,
        n_layers=trial.suggest_int('n_layers', 1, 5),
        rnn_units=trial.suggest_int('rnn_units', 16, 256, step=16),
        dropout_rate=trial.suggest_float('dropout_rate', 0.1, 0.5, step=0.1),
        learning_rate=trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
    )

    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    history = model.fit(
        seq_matrix, Y_train,
        epochs=20, batch_size=32,
        validation_split=0.2,
        callbacks=[es],
        verbose=0
    )

    return min(history.history['val_loss'])


model_path = 'best_modela.pkl'
if os.path.exists(model_path):
    # A trained model is cached: skip the (expensive) hyperparameter search.
    # NOTE: unpickling executes arbitrary code - only load a file that this
    # notebook wrote itself.
    with open(model_path, 'rb') as f:
        final_model = pickle.load(f)
    print("Loaded existing model.")

    # Later cells need `tok`. Rebuild it with the vocabulary size the cached
    # model was built with (read from its Embedding layer).
    # NOTE(review): assumes X_train is the same data the cached model was
    # trained on - confirm.
    embedding_layer = next(
        layer for layer in final_model.layers if isinstance(layer, Embedding)
    )
    tok = Tokenizer(num_words=embedding_layer.input_dim)
    tok.fit_on_texts(X_train)
else:
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50)

    best = study.best_params
    print("Best hyperparameters:")
    print(best)

    tok, seq_matrix = make_padded_sequences(X_train, best['max_words'], best['max_len'])
    final_model = build_lstm_model(**best)

    es_final = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.003, restore_best_weights=True)
    final_model.fit(
        seq_matrix, Y_train,
        epochs=50, batch_size=32,
        validation_split=0.2,
        callbacks=[es_final],
        class_weight={0: 1., 1: 2.},
        verbose=1
    )

    with open(model_path, 'wb') as f:
        pickle.dump(final_model, f)
    print(f"Trained and saved final model to {model_path}")


In [None]:
# Take the vocabulary size and sequence length from the model itself, so the
# padded matrices built below always match the input the network was built for
# (hard-coded values can disagree with the tuned `max_words` / `max_len`).
embedding_layer = next(layer for layer in final_model.layers if isinstance(layer, Embedding))
max_words = int(embedding_layer.input_dim)  # maximum number of words to be used in the analysis
max_len = int(final_model.input_shape[1])   # length of every padded sequence

In [None]:
# Turn each training answer into a list of word indices ...
sequences = tok.texts_to_sequences(X_train)
# ... then post-pad / truncate every list to `max_len` entries.
sequences_matrix = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='post',
)

In [None]:
# Early-stopping configuration:
#   monitor              - quantity that is watched
#   patience             - number of epochs with no improvement after which training stops
#   min_delta            - minimum change that qualifies as an improvement
#   restore_best_weights - roll back to the weights of the best epoch
early_stopping_options = dict(
    monitor="val_loss",
    patience=10,
    min_delta=0.003,
    restore_best_weights=True,
)
callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [None]:
# Train `final_model` on the padded training sequences and keep the history
# for the loss plot below.
# NOTE(review): `final_model` has already been trained (or loaded) by the
# previous cell, so this continues training from those weights - confirm
# that this is intended.
history = final_model.fit(
    sequences_matrix, Y_train,     # data to be used for fitting / training the model
    epochs=50,                     # number of passes over the training data
    batch_size=32,                 # number of samples per gradient update
    verbose=1,                     # print the progress
    shuffle=True,                  # shuffle the data for each epoch
    validation_split=0.2,          # portion of samples held out for validation (different from our test data)
    callbacks=[callback],          # `callbacks` expects a list of callback instances
    class_weight={0: 1., 1: 2.},   # errors on class 1 are weighted twice as much as on class 0
)

In [None]:
print(history.history.keys())

# Training vs. validation loss per epoch
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])

loss_axes = plt.gca()
loss_axes.set_title('model Loss')
loss_axes.set_ylabel('Loss')
loss_axes.set_xlabel('epoch')
loss_axes.legend(['train', 'validation'], loc='upper left')
plt.show()

We can now prepare the test data for the network:

In [None]:
# Encode the test answers with the tokenizer fitted on the training data,
# then trim or pad each sequence to `max_len`.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = pad_sequences(
    test_sequences,
    maxlen=max_len,
    padding='post',
)

And finally, apply the LSTM model to the test data and evaluate it:

In [None]:
# Run inference once and derive the hard labels from the same probabilities
# (previously `predict` was executed twice on identical input).
predictionProbabilities = final_model.predict(test_sequences_matrix).flatten()
predictions = (predictionProbabilities > 0.5).astype("int32")  # 1 when the probability exceeds 0.5

In [None]:
# ROC curve (label 1 as the positive class) and area under it on the test set
lr_auc = roc_auc_score(Y_test, predictionProbabilities)
fpr, tpr, thresholds = roc_curve(Y_test, predictionProbabilities, pos_label=1)

custom_plot_roc_curve(fpr, tpr, lr_auc)
# NOTE(review): unexplained figures left by the author (presumably earlier results):
#96.223
#98.271

In [None]:
# Confusion matrix of the thresholded test predictions, drawn on a 5x5 figure
plt.rcParams['figure.figsize'] = (5, 5)
confusionMatrix = confusion_matrix(Y_test, predictions)
displayConfusionMatrix(confusionMatrix)

In [None]:
# Attach the predicted probability to each test row and show the rows
# from highest to lowest score.
test['predicted_score'] = predictionProbabilities
test.sort_values("predicted_score", ascending=False)

## 4. Apply to Kaggle data:

In [None]:
# Read Kaggle data:
kaggle = pd.read_csv('kaggle_RateMyAnswer.csv', sep="|")


def _clean_kaggle_answer(text):
    """Join the lemmas of the alphabetic, non-stop-word tokens of `text` with spaces."""
    kept_lemmas = (
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and token.is_alpha
    )
    return " ".join(kept_lemmas)


# Pre-process the answers:
kaggle['pr_answer'] = kaggle.answer.progress_apply(_clean_kaggle_answer)

# Create the sequence matrix:
kaggle_sequences = tok.texts_to_sequences(kaggle['pr_answer'])
kaggle_sequences_matrix = pad_sequences(
    kaggle_sequences,
    maxlen=max_len,
    padding='post',
)

In [None]:
# Score every Kaggle answer and store the flattened model outputs.
kaggle_scores = final_model.predict(kaggle_sequences_matrix)
kaggle['Expected'] = kaggle_scores.reshape(-1)

In [None]:
# Write the submission file: the `index` and `Expected` columns only, without the DataFrame index.
submission_columns = ['index', 'Expected']
kaggle.loc[:, submission_columns].to_csv("to_Kaggle_RMA.csv", index=False)