In [46]:
import pandas as pd

In [47]:
# Load the dataset
dataset_path = 'medquad.csv'  # Change this to your actual file path
medquad_df = pd.read_csv(dataset_path)
medquad_df.head()

Unnamed: 0,question,answer,source,focus_area
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma
1,What causes Glaucoma ?,"Nearly 2.7 million people have glaucoma, a lea...",NIHSeniorHealth,Glaucoma
2,What are the symptoms of Glaucoma ?,Symptoms of Glaucoma Glaucoma can develop in ...,NIHSeniorHealth,Glaucoma
3,What are the treatments for Glaucoma ?,"Although open-angle glaucoma cannot be cured, ...",NIHSeniorHealth,Glaucoma
4,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...,NIHSeniorHealth,Glaucoma


In [48]:
medquad_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16412 entries, 0 to 16411
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   question    16412 non-null  object
 1   answer      16407 non-null  object
 2   source      16412 non-null  object
 3   focus_area  16398 non-null  object
dtypes: object(4)
memory usage: 513.0+ KB


## Data Cleaning

In [49]:
# Remove Duplicates
# Remove duplicate question-answer pairs
medquad_df.drop_duplicates(subset=['question', 'answer'], inplace=True)

In [50]:
medquad_df.shape

(16364, 4)

In [51]:
# Handle Missing Values
# Drop rows with any missing values in 'question', 'answer', or 'focus_area' columns
medquad_df.dropna(subset=['question', 'answer', 'focus_area'], inplace=True)

In [52]:
medquad_df.shape

(16345, 4)

## Text Preprocessing

In [53]:
# Case Normalization
# Convert all text to lower case
medquad_df['question'] = medquad_df['question'].str.lower()
medquad_df['answer'] = medquad_df['answer'].str.lower()

In [54]:
# Advance Text Cleaning
import re

# Function to clean text
def clean_text(text):
    # Remove special characters and digits
    text_cleaned = re.sub(r'[^a-zA-Z\s]', '', text)
    return text_cleaned

# Applying text cleaning
medquad_df['question'] = medquad_df['question'].apply(clean_text)

In [55]:
# Tokenization
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Download the necessary NLTK data

# Tokenize questions and answers
medquad_df['question_tokens'] = medquad_df['question'].apply(word_tokenize)
medquad_df['answer_tokens'] = medquad_df['answer'].apply(word_tokenize)
## time takes 1min 7s

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
# Stop Words Removal
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Remove stop words from tokens
medquad_df['question_tokens'] = medquad_df['question_tokens'].apply(lambda tokens: [w for w in tokens if not w in stop_words])
medquad_df['answer_tokens'] = medquad_df['answer_tokens'].apply(lambda tokens: [w for w in tokens if not w in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Stemming/Lemmatization
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('omw-1.4')


nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Lemmatize tokens
medquad_df['question_tokens'] = medquad_df['question_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])
medquad_df['answer_tokens'] = medquad_df['answer_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Transformation

In [58]:
# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming we're only vectorizing questions for now
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust `max_features` as needed
question_vectors = tfidf_vectorizer.fit_transform(medquad_df['question'].values)

# 'question_vectors' can now be used as input for machine learning models

In [59]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
medquad_df['focus_area_encoded'] = label_encoder.fit_transform(medquad_df['focus_area'])

In [60]:
from sklearn.model_selection import train_test_split

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(question_vectors, medquad_df['focus_area_encoded'], test_size=0.2, random_state=42)

# X_train and y_train can now be used for training a model, and X_test and y_test for evaluation

In [61]:
from sklearn.linear_model import LogisticRegression

# Instantiate the model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

## time taken 2 minutes

Accuracy: 0.27745487916794126


In [62]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
question_vectors = tfidf_vectorizer.fit_transform(medquad_df['question'].values)

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(
    question_vectors, 
    medquad_df['focus_area_encoded'], 
    test_size=0.2, 
    random_state=42
)

# Logistic Regression Model
model = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced')  # Adjust C as needed

# Training
model.fit(X_train, y_train)

# Cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"CV Accuracy: {np.mean(cv_scores):.2f} (+/- {np.std(cv_scores) * 2:.2f})")

# Evaluation on Test Set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)

print(f"Test Set Evaluation:")
print(f"Accuracy: {accuracy:.4f}")

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {fscore:.4f}")

# time taken- 10min- accuracy- 42%



Cross-Validation Accuracy Scores: [0.43272171 0.41835564 0.41300191 0.41606119 0.4248566 ]
CV Accuracy: 0.42 (+/- 0.01)
Test Set Evaluation:
Accuracy: 0.4304
Precision: 0.4586, Recall: 0.4304, F1-Score: 0.4297
CPU times: total: 6min 11s
Wall time: 6min 5s


In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Instantiate the model with balanced class weights
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')  # Adjust n_estimators as needed

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
rf_predictions = rf_model.predict(X_test)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")

# Calculate precision, recall, and F1-score
precision, recall, fscore, _ = precision_recall_fscore_support(y_test, rf_predictions, average='weighted', zero_division=0)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {fscore:.4f}")

# time takes 3min - accuracy-42%

MemoryError: could not allocate 303235072 bytes

In [63]:
# %%time
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# from sklearn.decomposition import PCA
# import numpy as np
# from scipy.sparse import csr_matrix

# # Vectorization
# tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
# question_vectors = tfidf_vectorizer.fit_transform(medquad_df['question'].values)

# # Convert sparse TF-IDF vectors to dense format for PCA
# question_vectors_dense = question_vectors.toarray()

# # Applying PCA
# n_components = 1000  # Adjust based on your dataset and experiment
# pca = PCA(n_components=n_components)
# question_vectors_pca = pca.fit_transform(question_vectors_dense)

# # Splitting the dataset
# X_train, X_test, y_train, y_test = train_test_split(
#     question_vectors_pca, 
#     medquad_df['focus_area_encoded'], 
#     test_size=0.2, 
#     random_state=42
# )

# # Logistic Regression Model
# model = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced')

# # Training
# model.fit(X_train, y_train)

# # Cross-validation
# cv_scores = cross_val_score(model, X_train, y_train, cv=5)
# print(f"Cross-Validation Accuracy Scores: {cv_scores}")
# print(f"CV Accuracy: {np.mean(cv_scores):.2f} (+/- {np.std(cv_scores) * 2:.2f})")

# # Evaluation on Test Set
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)

# print(f"Test Set Evaluation:")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {fscore:.4f}")

# time takes 8min-accuracy-32%

In [64]:
# %%time
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression

# # Adjusting TfidfVectorizer to include more features and n-grams
# tfidf_vectorizer = TfidfVectorizer(max_features=5000,  # Increase max_features
#                                    ngram_range=(1, 3))  # Use unigrams, bigrams, and trigrams

# # Vectorize the questions with the updated settings
# question_vectors = tfidf_vectorizer.fit_transform(medquad_df['question'].values)

# # Splitting the dataset into training and testing sets again with the new vectorization
# X_train, X_test, y_train, y_test = train_test_split(question_vectors, 
#                                                     medquad_df['focus_area_encoded'], 
#                                                     test_size=0.2, 
#                                                     random_state=42)

# # Re-instantiate and train the logistic regression model with the new features
# model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed for convergence
# model.fit(X_train, y_train)

# # Evaluate the model's performance with the new features
# accuracy = model.score(X_test, y_test)
# print(f"Enhanced Model Accuracy: {accuracy}")


# ## time taken 5 minutes- accuracy- 37%

In [65]:
# %%time
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# nb_model = MultinomialNB()

# # Train the model
# nb_model.fit(X_train, y_train)

# # Predict on the test set
# nb_predictions = nb_model.predict(X_test)

# # Calculate accuracy
# nb_accuracy = accuracy_score(y_test, nb_predictions)
# print(f"Naive Bayes Accuracy: {nb_accuracy}")

# ## time taken 4 min- accuracy- 12%

In [66]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators as needed

# # Train the model
# rf_model.fit(X_train, y_train)

# # Predict on the test set
# rf_predictions = rf_model.predict(X_test)

# # Calculate accuracy
# rf_accuracy = accuracy_score(y_test, rf_predictions)
# print(f"Random Forest Accuracy: {rf_accuracy}")

# ## time takes 44 minutes- accuracy 49%

In [67]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# svm_model = SVC(kernel='linear', C=1)  # Experiment with different kernels and C value

# # Train the model
# svm_model.fit(X_train, y_train)

# # Predict on the test set
# svm_predictions = svm_model.predict(X_test)

# # Calculate accuracy
# svm_accuracy = accuracy_score(y_test, svm_predictions)
# print(f"SVM Accuracy: {svm_accuracy}")

# ## time taken 96 minutes-accuracy 47%


In [68]:
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import accuracy_score

# # Instantiate the model
# gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)  # Adjust parameters as needed

# # Train the model
# gbm_model.fit(X_train, y_train)

# # Predict on the test set
# gbm_predictions = gbm_model.predict(X_test)

# # Calculate accuracy
# gbm_accuracy = accuracy_score(y_test, gbm_predictions)
# print(f"GBM Accuracy: {gbm_accuracy}")
