In [46]:
import pandas as pd

In [47]:
# Load the dataset
dataset_path = 'sampled_dataset.csv'  # Change this to your actual file path
medquad_df = pd.read_csv(dataset_path)
medquad_df.head()

Unnamed: 0,question,answer,source,focus_area
0,What are the genetic changes related to leukoe...,"LBSL is caused by mutations in the DARS2 gene,...",GHR,leukoencephalopathy with brainstem and spinal ...
1,What to do for Primary Biliary Cirrhosis ?,A healthy diet is important in all stages of c...,NIDDK,Primary Biliary Cirrhosis
2,Who is at risk for Fecal Incontinence? ?,Nearly 18 million U.S. adultsabout one in 12ha...,NIDDK,Fecal Incontinence
3,What is (are) Pervasive Developmental Disorders ?,The diagnostic category of pervasive developme...,NINDS,Pervasive Developmental Disorders
4,What are the symptoms of Crome syndrome ?,What are the signs and symptoms of Crome syndr...,GARD,Crome syndrome


In [48]:
medquad_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3282 entries, 0 to 3281
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    3282 non-null   object
 1   answer      3282 non-null   object
 2   source      3282 non-null   object
 3   focus_area  3277 non-null   object
dtypes: object(4)
memory usage: 102.7+ KB


## Data Cleaning

In [49]:
# Remove Duplicates
# Remove duplicate question-answer pairs
medquad_df.drop_duplicates(subset=['question', 'answer'], inplace=True)

In [50]:
medquad_df.shape

(3282, 4)

In [51]:
# Handle Missing Values
# Drop rows with any missing values in 'question', 'answer', or 'focus_area' columns
medquad_df.dropna(subset=['question', 'answer', 'focus_area'], inplace=True)

In [52]:
medquad_df.shape

(3277, 4)

## Text Preprocessing

In [53]:
# Case Normalization
# Convert all text to lower case
medquad_df['question'] = medquad_df['question'].str.lower()
medquad_df['answer'] = medquad_df['answer'].str.lower()

In [54]:
# Advance Text Cleaning
import re

# Function to clean text
def clean_text(text):
    # Remove special characters and digits
    text_cleaned = re.sub(r'[^a-zA-Z\s]', '', text)
    return text_cleaned

# Applying text cleaning
medquad_df['question'] = medquad_df['question'].apply(clean_text)

In [55]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download the necessary NLTK data

# Tokenize questions and answers
medquad_df['question_tokens'] = medquad_df['question'].apply(word_tokenize)
medquad_df['answer_tokens'] = medquad_df['answer'].apply(word_tokenize)
## time takes 1min 7s

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
# Stop Words Removal
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Remove stop words from tokens
medquad_df['question_tokens'] = medquad_df['question_tokens'].apply(lambda tokens: [w for w in tokens if not w in stop_words])
medquad_df['answer_tokens'] = medquad_df['answer_tokens'].apply(lambda tokens: [w for w in tokens if not w in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Stemming/Lemmatization
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('omw-1.4')


nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Lemmatize tokens
medquad_df['question_tokens'] = medquad_df['question_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])
medquad_df['answer_tokens'] = medquad_df['answer_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Transformation

In [58]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming we're only vectorizing questions for now
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust `max_features` as needed
question_vectors = tfidf_vectorizer.fit_transform(medquad_df['question'].values)

# 'question_vectors' can now be used as input for machine learning models

In [59]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
medquad_df['focus_area_encoded'] = label_encoder.fit_transform(medquad_df['focus_area'])

In [60]:
from sklearn.model_selection import train_test_split

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(question_vectors, medquad_df['focus_area_encoded'], test_size=0.2, random_state=42)

# X_train and y_train can now be used for training a model, and X_test and y_test for evaluation

In [61]:
%%time
from sklearn.linear_model import LogisticRegression

# Instantiate the model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

## time taken 9s

Accuracy: 0.1448170731707317
CPU times: total: 9.56 s
Wall time: 9.21 s


In [62]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
question_vectors = tfidf_vectorizer.fit_transform(medquad_df['question'].values)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(
    question_vectors, 
    medquad_df['focus_area_encoded'], 
    test_size=0.2, 
    random_state=42
)

# Logistic Regression Model
model = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced')  # Adjust C as needed

# Training
model.fit(X_train, y_train)

# Cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"CV Accuracy: {np.mean(cv_scores):.2f} (+/- {np.std(cv_scores) * 2:.2f})")

# Evaluation on Test Set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)

print(f"Test Set Evaluation:")
print(f"Accuracy: {accuracy:.4f}")

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {fscore:.4f}")

## time takes 52s- accuracy- 32s




Cross-Validation Accuracy Scores: [0.31238095 0.34923664 0.32633588 0.28625954 0.29007634]
CV Accuracy: 0.31 (+/- 0.05)
Test Set Evaluation:
Accuracy: 0.3232
Precision: 0.3277, Recall: 0.3232, F1-Score: 0.3228
CPU times: total: 52 s
Wall time: 50.2 s


In [75]:
%%time
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Assuming medquad_df['question'] contains your text data and medquad_df['focus_area_encoded'] your labels
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(medquad_df['question'])
sequences = tokenizer.texts_to_sequences(medquad_df['question'])
padded = pad_sequences(sequences, maxlen=100)  # You might adjust 'maxlen' based on your data

# Encode labels
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(medquad_df['focus_area_encoded'])
labels = to_categorical(integer_encoded)

# Split data
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

CPU times: total: 547 ms
Wall time: 1.06 s


In [77]:
%%time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(X_train, y_train, batch_size=64, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 173ms/step - accuracy: 0.0000e+00 - loss: 7.7193 - val_accuracy: 0.0000e+00 - val_loss: 7.7147
Epoch 2/5
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 161ms/step - accuracy: 0.0013 - loss: 7.7088 - val_accuracy: 0.0030 - val_loss: 7.7514
Epoch 3/5
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 175ms/step - accuracy: 0.0027 - loss: 7.6392 - val_accuracy: 0.0061 - val_loss: 7.7880
Epoch 4/5
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 157ms/step - accuracy: 0.0047 - loss: 7.5540 - val_accuracy: 0.0030 - val_loss: 8.3953
Epoch 5/5
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 158ms/step - accuracy: 0.0020 - loss: 7.4435 - val_accuracy: 0.0046 - val_loss: 8.4297
CPU times: total: 1min 32s
Wall time: 43.2 s


<keras.src.callbacks.history.History at 0x1eff3fcd990>

In [78]:
from transformers import BertTokenizer, TFBertModel, BertConfig
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode sequences in the dataset
X = tokenizer(list(medquad_df['question'].values), padding=True, truncation=True, max_length=100, return_tensors='tf')

# Load pre-trained BERT model
bert = TFBertModel.from_pretrained('bert-base-uncased')

# Build the model
input_ids = Input(shape=(100,), dtype='int32', name='input_ids')
attention_masks = Input(shape=(100,), dtype='int32', name='attention_masks')

bert_outputs = bert(input_ids, attention_mask=attention_masks).last_hidden_state
clf_output = bert_outputs[:, 0, :]
clf_output = Dense(256, activation='relu')(clf_output)
clf_output = Dropout(0.2)(clf_output)
clf_output = Dense(len(label_encoder.classes_), activation='softmax')(clf_output)

model = Model(inputs=[input_ids, attention_masks], outputs=clf_output)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Prepare inputs
train_inputs = {'input_ids': X['input_ids'], 'attention_masks': X['attention_mask']}

# Train the model
model.fit(train_inputs, y_train, batch_size=32, epochs=3, validation_split=0.2)

RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
No module named 'keras.saving.hdf5_format'

In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Instantiate the model with balanced class weights
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')  # Adjust n_estimators as needed

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
rf_predictions = rf_model.predict(X_test)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")

# Calculate precision, recall, and F1-score
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, rf_predictions, average='weighted', zero_division=0)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {fscore:.4f}")

# time takes 3min - accuracy-42%

Random Forest Accuracy: 0.4222560975609756
Precision: 0.4055, Recall: 0.4223, F1-Score: 0.4094


In [64]:
# %%time
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression

# # Adjusting TfidfVectorizer to include more features and n-grams
# tfidf_vectorizer = TfidfVectorizer(max_features=5000,  # Increase max_features
#                                    ngram_range=(1, 3))  # Use unigrams, bigrams, and trigrams

# # Vectorize the questions with the updated settings
# question_vectors = tfidf_vectorizer.fit_transform(medquad_df['question'].values)

# # Splitting the dataset into training and testing sets again with the new vectorization
# X_train, X_test, y_train, y_test = train_test_split(question_vectors, 
#                                                     medquad_df['focus_area_encoded'], 
#                                                     test_size=0.2, 
#                                                     random_state=42)

# # Re-instantiate and train the logistic regression model with the new features
# model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed for convergence
# model.fit(X_train, y_train)

# # Evaluate the model's performance with the new features
# accuracy = model.score(X_test, y_test)
# print(f"Enhanced Model Accuracy: {accuracy}")

# ## time taken 44s minutes

In [65]:
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)  # Adjust parameters as needed

# # Train the model
# gbm_model.fit(X_train, y_train) 

# # Predict on the test set
# gbm_predictions = gbm_model.predict(X_test)

# # Calculate accuracy
# gbm_accuracy = accuracy_score(y_test, gbm_predictions)
# print(f"GBM Accuracy: {gbm_accuracy}")

In [66]:
# from xgboost import XGBClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.preprocessing import LabelEncoder

# # Assuming 'X_train', 'X_test', 'y_train', 'y_test' are already prepared from the previous TF-IDF vectorization and train-test split

# # le = LabelEncoder()
# # y_train = le.fit_transform(y_train)
# # y_test_encoded = le.transform(y_test)

# # Initialize the XGBClassifier
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100, learning_rate=0.1, max_depth=6)

# # Train the model
# xgb_model.fit(X_train, y_train)

# # Make predictions
# y_pred = xgb_model.predict(X_test)

# # Calculate the accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print(f"XGBoost Model Accuracy: {accuracy}")

In [67]:
# %%time
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# nb_model = MultinomialNB()

# # Train the model
# nb_model.fit(X_train, y_train)

# # Predict on the test set
# nb_predictions = nb_model.predict(X_test)

# # Calculate accuracy
# nb_accuracy = accuracy_score(y_test, nb_predictions)
# print(f"Naive Bayes Accuracy: {nb_accuracy}")

# ## time taken 1 min- accuracy- 12%

In [68]:
# %%time
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators as needed

# # Train the model
# rf_model.fit(X_train, y_train)

# # Predict on the test set
# rf_predictions = rf_model.predict(X_test)

# # Calculate accuracy
# rf_accuracy = accuracy_score(y_test, rf_predictions)
# print(f"Random Forest Accuracy: {rf_accuracy}")

# ## time takes 44 minutes- accuracy 49%

In [69]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# svm_model = SVC(kernel='linear', C=1)  # Experiment with different kernels and C value

# # Train the model
# svm_model.fit(X_train, y_train)

# # Predict on the test set
# svm_predictions = svm_model.predict(X_test)

# # Calculate accuracy
# svm_accuracy = accuracy_score(y_test, svm_predictions)
# print(f"SVM Accuracy: {svm_accuracy}")

# ## time taken 96 minutes-accuracy 47%
