# NLP

In [1]:
# Import Libraries -
import html
import itertools
import os
import re
import string

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from wordcloud import WordCloud

In [None]:
# Load dataset -
# Read the train and test CSV files from the current working directory.
# NOTE(review): df_test is not referenced again in the visible notebook.
df_train = pd.read_csv('drugsComTrain_raw.csv', sep=",", encoding="utf-8")
df_test = pd.read_csv('drugsComTest_raw.csv', sep=",", encoding="utf-8")

# Do not truncate long cell text and do not limit the number of displayed rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# Print column dtypes and non-null counts of the training frame.
df_train.info()

In [3]:
# Change datatype of date from object to DateTime -
# The explicit format expects day, abbreviated month name and 2-digit year (e.g. 20-May-12).
df_train['date'] = pd.to_datetime(df_train['date'], format='%d-%b-%y')

In [None]:
# Display Top 5 Rows -
# Last expression of the cell, so the notebook renders the returned frame.
df_train.head()

# Exploratory Data Analysis 

In [None]:
# Check for Null Values -
# Number of missing entries per column.
df_train.isnull().sum()

In [None]:
# Visualize Null Values -
# Heatmap of the boolean null mask of the whole training frame.
sns.heatmap(df_train.isnull())

In [None]:
# Drop Null records -
# dropna() with default arguments removes every row that has at least one missing value.
df_train = df_train.dropna()
# Re-check: all per-column null counts should now be zero.
df_train.isnull().sum()

In [None]:
# Value_Counts -
# Number of reviews per condition, most frequent first.
df_train["condition"].value_counts()

In [None]:
# Value_Counts -
# Number of reviews per drug name, most frequent first.
df_train["drugName"].value_counts()

In [None]:
# Drop the Unwanted features -
# Remove the 'uniqueID' column; the remaining columns are kept unchanged.
df_train = df_train.drop("uniqueID", axis=1)
df_train.head()

In [None]:
# Overview plots (each chart below describes a single column of df_train)

# 1. Distribution of ratings: one bar per distinct rating value
plt.figure(figsize=(8, 5))
sns.countplot(x='rating', data=df_train, palette='coolwarm')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

# 2. Most common conditions
# head(10) draws ten bars, so the title says "Top 10" (it previously claimed "Top 5").
plt.figure(figsize=(10, 6))
df_train['condition'].value_counts().head(10).plot(kind='barh', color='skyblue')
plt.title('Top 10 Most Common Conditions')
plt.xlabel('Count')
plt.ylabel('Condition')
plt.show()

# 3. Most reviewed drugs (five most frequent drug names)
plt.figure(figsize=(10, 6))
df_train['drugName'].value_counts().head(5).plot(kind='barh', color='lightgreen')
plt.title('Top 5 Most Reviewed Drugs')
plt.xlabel('Count')
plt.ylabel('Drug Name')
plt.show()

# 4. Helpful reviews - usefulCount distribution in 10 equal-width bins
plt.figure(figsize=(8, 5))
sns.histplot(df_train['usefulCount'], bins=10, color='purple')
plt.title('Distribution of Useful Counts in Reviews')
plt.xlabel('Useful Count')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Uni-variate Analysis
# 1. Univariate analysis of 'rating': histogram with a KDE curve overlaid
plt.figure(figsize=(8, 5))
sns.histplot(df_train['rating'], bins=10, kde=True, color='orange')
plt.title('Univariate Analysis: Rating Distribution')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

# 2. Univariate analysis of 'condition' frequency: ten most frequent conditions
plt.figure(figsize=(10, 6))
df_train['condition'].value_counts().head(10).plot(kind='bar', color='lightblue')
plt.title('Univariate Analysis: Top 10 Conditions by Frequency')
plt.xlabel('Condition')
plt.ylabel('Count')
# Tilt the condition labels so long names stay readable.
plt.xticks(rotation=45)
plt.show()

# 3. Univariate analysis of 'usefulCount' distribution: horizontal box plot
plt.figure(figsize=(8, 5))
sns.boxplot(x=df_train['usefulCount'], color='green')
plt.title('Univariate Analysis: Distribution of Useful Counts')
plt.xlabel('Useful Count')
plt.show()

In [None]:
# Univariate Analysis - Descriptive Statistics

# 1. Descriptive statistics for numerical columns (describe() on rating and usefulCount)
print("Descriptive Statistics for Numerical Columns:")
print(df_train[['rating', 'usefulCount']].describe())

# 2. Frequency count for categorical variables
# display.max_rows was set to None when the data was loaded, so every category is printed.
print("\nFrequency Count for Conditions:")
print(df_train['condition'].value_counts())

print("\nFrequency Count for Drug Names:")
print(df_train['drugName'].value_counts())


In [None]:
# Bivariate Analysis - Correlation and Grouped Statistics

# 1. Correlation between numerical variables (2x2 correlation matrix from DataFrame.corr())
print("\nCorrelation between Rating and Useful Count:")
print(df_train[['rating', 'usefulCount']].corr())

# 2. Groupby statistics for categorical and numerical relationships

# Average rating for each condition
print("\nAverage Rating for Each Condition:")
print(df_train.groupby('condition')['rating'].mean())

# Average useful count for each drug
print("\nAverage Useful Count for Each Drug:")
print(df_train.groupby('drugName')['usefulCount'].mean())

# Rating variance for each condition
# NOTE(review): conditions with a single review presumably yield NaN here - confirm.
print("\nRating Variance for Each Condition:")
print(df_train.groupby('condition')['rating'].var())

# Useful count median for each drug
print("\nMedian Useful Count for Each Drug:")
print(df_train.groupby('drugName')['usefulCount'].median())

In [None]:
# Shape of dataset -
# (rows, columns) of the training frame after the null rows and 'uniqueID' were dropped.
df_train.shape

# Word Clouds

In [None]:
# Display dataset -
# First five rows of the training frame.
df_train.head()

In [None]:
# Keep only the condition and review columns by dropping the other remaining ones
# ('uniqueID' was already removed earlier).
# NOTE(review): X is reassigned to the TF-IDF matrix in the model-building section.
X = df_train.drop(['drugName','rating','date','usefulCount'],axis=1)
X.head()

# Text Preprocessing // Data Cleaning

In [None]:
# Text Cleaning -
# Function to clean and preprocess the text
def preprocess_text(text):
    """Normalize a raw text string for the word clouds.

    Steps, in order: strip HTML tags, strip URLs, decode HTML entities,
    remove punctuation, lower-case.

    Entities are decoded with ``html.unescape`` rather than deleted by a
    numeric-only pattern, so named entities such as ``&amp;`` or ``&quot;``
    no longer leave word fragments ("amp", "quot") behind once the
    punctuation is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string. Whitespace is left untouched, so
        removed tokens can leave consecutive spaces.
    """
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # Decode numeric and named entities (&#039; -> ', &amp; -> &); the decoded
    # punctuation is then dropped by the next step.
    text = html.unescape(text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()

# Clean all the reviews in the 'review' column
df_train['cleaned_review'] = df_train['review'].apply(preprocess_text)

# Combine all the reviews into a single space-separated text for the word cloud
all_reviews = ' '.join(df_train['cleaned_review'])
# Show only a preview: echoing the full string would write every review of the
# dataset into the cell output. The variable itself still holds the whole corpus.
all_reviews[:1000]

In [None]:
# Create the WordCloud
# `stopwords` expects a collection of words. The corpus string itself was passed
# before, which does not filter stop words as intended. None makes WordCloud fall
# back to its built-in English stop-word list.
# NOTE(review): contour_color/contour_width are documented to apply only when a
# mask is given, so they presumably have no visible effect here - confirm.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=None,
    colormap='coolwarm',  # Change colormap to your preference
    max_words=500,        # Limit the number of words in the cloud
    contour_color='black',  # Contour color around words
    contour_width=3
).generate(all_reviews)

# Display the Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes; the image has no meaningful scale
plt.title('Word Cloud for Medication Reviews', fontsize=20)
plt.show()

In [None]:
# Clean all the drug names in the 'drugName' column (same normalization as the reviews)
df_train['cleaned_drugName'] = df_train['drugName'].apply(preprocess_text)

# Combine all the drug names into a single space-separated text
all_drugName = ' '.join(df_train['cleaned_drugName'])
# Last expression: the notebook echoes the whole joined string.
all_drugName

In [None]:
# Create the WordCloud
# `stopwords` expects a collection of words. The review corpus string was passed
# before, which does not act as a stop-word list. None selects WordCloud's
# built-in English stop-word list.
# NOTE(review): contour_color/contour_width are documented to apply only when a
# mask is given, so they presumably have no visible effect here - confirm.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=None,
    colormap='coolwarm',  # Change colormap to your preference
    max_words=500,        # Limit the number of words in the cloud
    contour_color='black',  # Contour color around words
    contour_width=3
).generate(all_drugName)

# Display the Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes; the image has no meaningful scale
plt.title('Word Cloud for Drug Name', fontsize=20)
plt.show()

In [None]:
# Clean all the condition names in the 'condition' column (same normalization as the reviews)
df_train['cleaned_condition'] = df_train['condition'].apply(preprocess_text)

# Combine all the condition names into a single space-separated text
all_condition= ' '.join(df_train['cleaned_condition'])
# Last expression: the notebook echoes the whole joined string.
all_condition

In [None]:
# Create the WordCloud
# `stopwords` expects a collection of words. The review corpus string was passed
# before, which does not act as a stop-word list. None selects WordCloud's
# built-in English stop-word list.
# NOTE(review): contour_color/contour_width are documented to apply only when a
# mask is given, so they presumably have no visible effect here - confirm.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords=None,
    colormap='coolwarm',  # Change colormap to your preference
    max_words=500,        # Limit the number of words in the cloud
    contour_color='black',  # Contour color around words
    contour_width=3
).generate(all_condition)

# Display the Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide the axes; the image has no meaningful scale
plt.title('Word Cloud for Patient Condition', fontsize=20)
plt.show()

# Model Building 

## Model to Predict Patient Condition

In [None]:
# Value_Counts -
# Class sizes of the prediction target before rare conditions are filtered out.
print(df_train['condition'].value_counts())

In [22]:
# Get the value counts for 'condition'
value_counts = df_train['condition'].value_counts()

# Keep only the conditions that have at least 500 reviews
valid_conditions = value_counts[value_counts >= 500].index

# Filter the DataFrame based on valid conditions.
# .copy() makes the result an independent frame, so adding the cleaned-review
# column afterwards does not raise pandas' SettingWithCopyWarning.
df_train = df_train[df_train['condition'].isin(valid_conditions)].copy()

In [None]:
# Value_Counts -
# Class sizes after filtering: only conditions that passed the count threshold remain.
print(df_train['condition'].value_counts())

In [None]:
def clean_reviews(reviews):
    """Clean every review in a pandas Series.

    For each element the entities ``&#039;`` and ``&amp;`` are turned into
    their characters, then every character that is neither a word character
    nor whitespace is removed.

    Args:
        reviews: Series of review strings.

    Returns:
        A Series with the cleaned strings, as produced by ``Series.apply``.
    """
    # Replacement order matters and mirrors the original: apostrophe first, ampersand second.
    entity_table = (('&#039;', "'"), ('&amp;', '&'))

    def _scrub(raw):
        for entity, char in entity_table:
            raw = raw.replace(entity, char)
        return re.sub(r'[^\w\s]', '', raw)

    return reviews.apply(_scrub)

# Store the cleaned review text in a new column; it is the input to the TF-IDF vectorizer below.
df_train['clean_review'] = clean_reviews(df_train['review'])


In [None]:
# Vectorization
# TF-IDF features over the cleaned reviews, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# in a later cell, so held-out reviews contribute to the vocabulary/IDF weights.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df_train['clean_review'])
# Target labels: the condition of each review.
y = df_train['condition']

In [None]:
# Print the TF-IDF feature matrix returned by fit_transform.
print(X)

In [None]:
# Display the target Series (condition labels).
y

In [None]:
# Split the data into training and testing sets
# 80/20 split; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: row counts of features and labels must match within each split.
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
# Train multiple models
# Every classifier is created with its library default settings.
# NOTE(review): RandomForestClassifier gets no random_state, so its score can
# vary between runs; LogisticRegression may need a larger max_iter - confirm.
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
}

# Train and evaluate models
# best_model / best_accuracy track the highest test accuracy seen so far.
best_model = None
best_accuracy = 0

# Table header for the per-model results printed in the loop.
print(f"{'Model':<20} {'Accuracy':<10} {'Recall':<10}")
print("-" * 40)

# NOTE: the loop variable `model` is a module-level name and stays bound to the
# last classifier in `models` after the loop finishes.
for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate accuracy and recall
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='macro')  # Using 'macro' for multi-class recall
    
    # Print model name, accuracy, and recall
    print(f"{name:<20} {accuracy:<10.4f} {recall:<10.4f}")
    
    # Remember the best model based on accuracy (strict '>' keeps the first model on ties)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

In [None]:
# Save the best model and vectorizer using joblib
# The fitted vectorizer is stored too, because new reviews are transformed with it at prediction time.
joblib.dump(best_model, 'best_patient_condition_model.pkl')
joblib.dump(vectorizer, 'best_patient_condition_vectorizer.pkl')

print(f"Best model with {best_accuracy} accuracy has been saved using joblib.")

In [None]:
# Load the saved model and vectorizer
# Rebinds the module-level names `model` and `vectorizer` that predict_condition reads.
model = joblib.load('best_patient_condition_model.pkl')
vectorizer = joblib.load('best_patient_condition_vectorizer.pkl')

In [None]:
# Function to clean a single review
def clean_review(review):
    """Clean one review string the same way the training text was cleaned.

    Args:
        review: Raw review text.

    Returns:
        The text with ``&#039;`` and ``&amp;`` decoded and every character
        that is neither a word character nor whitespace removed.
    """
    # Decode the two entities first so no "039"/"amp" fragments survive the strip below.
    decoded = review.replace('&#039;', "'")
    decoded = decoded.replace('&amp;', '&')
    return re.sub(r'[^\w\s]', '', decoded)

# Function to load and predict using the best model
def predict_condition(new_review):
    """Predict the label for a single raw review.

    Uses the module-level ``vectorizer`` and ``model`` objects, so both must
    be loaded before this function is called.

    Args:
        new_review: Raw review text.

    Returns:
        Whatever ``model.predict`` returns for the one-element batch.
    """
    # The vectorizer expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([clean_review(new_review)])
    return model.predict(features)

In [None]:
# Test the prediction function
new_text = "I have severe depression and can't seem to find a medication that helps me."
# predict_condition returns the result of model.predict for a one-element batch,
# so the value printed below is that result rather than a bare label.
predicted_condition = predict_condition(new_text)
print(f"Predicted condition: {predicted_condition}")

## Model to Predict Drug Name

In [None]:
# Load dataset -
# Re-read the raw CSV files: this replaces the df_train that was filtered by
# condition in the previous section with the unfiltered data.
df_train = pd.read_csv('drugsComTrain_raw.csv', sep=",", encoding="utf-8")
df_test = pd.read_csv('drugsComTest_raw.csv', sep=",", encoding="utf-8")

# Do not truncate long cell text and do not limit the number of displayed rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
df_train.info()

In [None]:
# Drop Null records -
# dropna() with default arguments removes every row that has at least one missing value.
df_train = df_train.dropna()
# Re-check the per-column null counts.
df_train.isnull().sum()

In [None]:
# Value_Counts -
# Class sizes of the prediction target (drug name) before rare drugs are filtered out.
print(df_train['drugName'].value_counts())

In [None]:
# Get the value counts for 'drugName'
value_counts = df_train['drugName'].value_counts()

# Keep only the drugs that have at least 500 reviews
# (the name `valid_conditions` is kept from the previous section; it holds drug names here)
valid_conditions = value_counts[value_counts >= 500].index

# Filter the DataFrame based on the retained drug names.
# .copy() makes the result an independent frame, so adding the cleaned-review
# column afterwards does not raise pandas' SettingWithCopyWarning.
df_train = df_train[df_train['drugName'].isin(valid_conditions)].copy()

In [None]:
# Value_Counts -
# Class sizes after filtering: only drugs that passed the count threshold remain.
print(df_train['drugName'].value_counts())

In [None]:
def clean_reviews(reviews):
    """Return a cleaned copy of a Series of review strings.

    Each element has ``&#039;`` and ``&amp;`` replaced by the characters they
    stand for, after which everything except word characters and whitespace
    is stripped.

    Args:
        reviews: Series of review strings.

    Returns:
        The Series produced by applying the cleaning to every element.
    """
    def _normalise(raw):
        # Apostrophe entity first, ampersand entity second (same order as before).
        with_apostrophes = raw.replace('&#039;', "'")
        with_ampersands = with_apostrophes.replace('&amp;', '&')
        return re.sub(r'[^\w\s]', '', with_ampersands)

    cleaned = reviews.apply(_normalise)
    return cleaned

# Store the cleaned review text in a new column; it is the input to the TF-IDF vectorizer below.
df_train['clean_review'] = clean_reviews(df_train['review'])


In [None]:
# Vectorization
# TF-IDF features over the cleaned reviews, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# in a later cell, so held-out reviews contribute to the vocabulary/IDF weights.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df_train['clean_review'])
# Target labels: the drug name of each review.
y = df_train['drugName']

In [None]:
# Print the TF-IDF feature matrix returned by fit_transform.
print(X)

In [None]:
# Print the target Series (drug names).
print(y)

In [None]:
# Split the data into training and testing sets
# 80/20 split; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: row counts of features and labels must match within each split.
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

In [None]:
# Train multiple models
# Every classifier is created with its library default settings.
# NOTE(review): RandomForestClassifier gets no random_state, so its score can
# vary between runs; LogisticRegression may need a larger max_iter - confirm.
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
}

# Train and evaluate models
# best_model / best_accuracy track the highest test accuracy seen so far.
best_model = None
best_accuracy = 0

# Table header for the per-model results printed in the loop.
print(f"{'Model':<20} {'Accuracy':<10} {'Recall':<10}")
print("-" * 40)

# NOTE: the loop variable `model` is a module-level name and stays bound to the
# last classifier in `models` after the loop finishes.
for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate accuracy and recall
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='macro')  # Using 'macro' for multi-class recall
    
    # Print model name, accuracy, and recall
    print(f"{name:<20} {accuracy:<10.4f} {recall:<10.4f}")
    
    # Remember the best model based on accuracy (strict '>' keeps the first model on ties)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

In [None]:
# Save the best model and vectorizer using joblib
# The fitted vectorizer is stored too, because new reviews are transformed with it at prediction time.
joblib.dump(best_model, 'best_drug_name_model.pkl')
joblib.dump(vectorizer, 'best_drug_name_vectorizer.pkl')

print(f"Best model with {best_accuracy} accuracy has been saved using joblib.")

In [None]:
# Load the saved model and vectorizer
# Rebinds the module-level names `model` and `vectorizer`; from here on the
# prediction helper below uses the drug-name classifier.
model = joblib.load('best_drug_name_model.pkl')
vectorizer = joblib.load('best_drug_name_vectorizer.pkl')

In [None]:
# Function to clean a single review
def clean_review(review):
    """Clean a single review string exactly like the training reviews.

    Args:
        review: Raw review text.

    Returns:
        The text after decoding ``&#039;`` and ``&amp;`` and dropping every
        character that is neither a word character nor whitespace.
    """
    for entity, char in (('&#039;', "'"), ('&amp;', '&')):
        review = review.replace(entity, char)
    # Strip punctuation and other special characters.
    return re.sub(r'[^\w\s]', '', review)

# Function to load and predict using the best model
def predict_condition(new_review):
    """Predict the label for a single raw review with the loaded model.

    Despite its name, in this section the module-level ``model`` is the one
    loaded from 'best_drug_name_model.pkl', so the prediction is a drug name.
    The name is kept because the test cell below calls it.

    Args:
        new_review: Raw review text.

    Returns:
        Whatever ``model.predict`` returns for the one-element batch.
    """
    cleaned = clean_review(new_review)
    # The vectorizer expects an iterable of documents, hence the one-element list.
    vectorized = vectorizer.transform([cleaned])
    return model.predict(vectorized)

In [None]:
# Test the prediction function
new_text = "I have severe depression and can't seem to find a medication that helps me."
# The variable keeps the name `predicted_condition`, but it holds the drug-name
# model's prediction (see the message printed below).
predicted_condition = predict_condition(new_text)
print(f"Predicted Drug Name: {predicted_condition}")

# Thank You VRS Foundation