In [1]:
#Filtering out warnings from joblib
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='joblib')


Importing necessary libaries

In [2]:
#Importing necessary libraries

import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
import joblib

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('stopwords')

# Testing tokenization
from nltk.tokenize import word_tokenize
sentence = "This is a test sentence."
tokens = word_tokenize(sentence)
print(tokens)

# Removing stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Printing results
print("Filtered Tokens:", filtered_tokens)

['This', 'is', 'a', 'test', 'sentence', '.']
Filtered Tokens: ['This', 'test', 'sentence', '.']


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Loading the dataset with a different encoding
file_path = 'final_dataset.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')

print(df.head())

               disease  anxiety and nervousness  depression  \
0     abdominal hernia                      0.0         0.0   
1  abscess of the lung                      0.0         0.0   
2            achalasia                      0.0         0.0   
3                 acne                      NaN         NaN   
4    actinic keratosis                      0.0         0.0   

   shortness of breath  depressive or psychotic symptoms  sharp chest pain  \
0                  0.0                               0.0               0.0   
1                  1.0                               1.0               0.0   
2                  0.0                               0.0               1.0   
3                  NaN                               NaN               NaN   
4                  0.0                               0.0               0.0   

   dizziness  insomnia  abnormal involuntary movements  chest tightness  ...  \
0        0.0       0.0                             0.0              0.0 

Separating columns 

In [4]:
# Separating non-numeric columns like 'description' and 'precaution_1' to 'precaution_4'
non_numeric_cols = ['description', 'precaution_1', 'precaution_2', 'precaution_3', 'precaution_4']
non_numeric_data = df[non_numeric_cols]

# Separating numeric symptom columns (excluding disease and non-numeric columns)
numeric_cols = [col for col in df.columns if col not in non_numeric_cols + ['disease']]
df[numeric_cols] = df[numeric_cols].fillna(0)  # Fill missing numeric symptom columns with 0


Symptom Matching Using Jaccard Similarity

In [5]:
# Define function to calculate Jaccard similarity for symptom matching
def jaccard_similarity(user_symptoms, disease_symptoms):
    intersection = np.sum(np.minimum(user_symptoms, disease_symptoms))
    union = np.sum(np.maximum(user_symptoms, disease_symptoms))
    return intersection / union

Pre processing the description column

In [6]:
# ---------------------------------------------
# Text Preprocessing: Clean and Process Description
# ---------------------------------------------

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if pd.isnull(text):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return ' '.join(cleaned_tokens)

# Apply preprocessing to description column
df['processed_description'] = df['description'].apply(preprocess_text)

# Dropping rows where processed description is empty
df = df[df['processed_description'].str.strip().astype(bool)]

# ---------------------------------------------
# TF-IDF Vectorization on Processed Descriptions
# ---------------------------------------------
tfidf = TfidfVectorizer(max_features=50)  # Adjust max_features as needed
description_vectors = tfidf.fit_transform(df['processed_description'])

# Dimensionality reduction (optional but recommended)
svd = TruncatedSVD(n_components=20, random_state=42)
description_reduced = svd.fit_transform(description_vectors)


# Scale numeric symptom features
scaler = StandardScaler()
symptom_scaled = scaler.fit_transform(df[numeric_cols])  # exclude non-numeric & precautions




  df['processed_description'] = df['description'].apply(preprocess_text)


Calculating Cosine Similarity on Processed Descriptions

In [None]:
# Save the SVD-transformed description matrix
cosine_sim_matrix = description_reduced

# ---------------------------------------------
# Define the function to recommend diseases based on symptoms
# ---------------------------------------------

def recommend_disease(user_symptoms, df, cosine_sim_matrix):
    # Vectorize user symptoms
    user_symptom_vector = np.zeros(len(numeric_cols))
    for symptom in user_symptoms:
        if symptom in numeric_cols:
            user_symptom_vector[numeric_cols.index(symptom)] = 1

    # Compute Jaccard similarities
    similarities = []
    for idx, row in df.iterrows():
        disease_symptoms = row[numeric_cols].values
        jac_sim = jaccard_similarity(user_symptom_vector, disease_symptoms)
        similarities.append(jac_sim)
    similarities = np.array(similarities)
    if np.max(similarities) > 0:
        similarities /= np.max(similarities)

    # Transform user symptoms into the same vector space
    user_input_reduced = svd_transformer.transform(symptom_vectorizer.transform([user_symptom_vector]))

    # Compute cosine similarities
    cosine_sim_scores = cosine_similarity(user_input_reduced, cosine_sim_matrix).flatten()

    # Combine both similarities
    final_scores = similarities * cosine_sim_scores

    # Top 5 diseases
    top_diseases_idx = final_scores.argsort()[-5:][::-1]
    recommended_diseases = df.iloc[top_diseases_idx]['disease'].values
    
    return recommended_diseases

# ---------------------------------------------
# Example use
# ---------------------------------------------

user_symptoms = ['fever', 'headache', 'fatigue']
recommended_diseases = recommend_disease(user_symptoms, df, cosine_sim_matrix)
print("Recommended Diseases:", recommended_diseases)


Recommended Diseases: ['chronic sinusitis' 'mononucleosis' 'normal pressure hydrocephalus'
 'pituitary adenoma' 'vitamin b12 deficiency']


Dictionary for Descriptions and Precautions

In [8]:
# Include descriptions and precautions in the package
df['precautions'] = df[[f'precaution_{i}' for i in range(1, 5)]].apply(
    lambda row: [x for x in row if pd.notna(x)], axis=1
)
disease_info_dict = df.set_index('disease')[['description', 'precautions']].to_dict(orient='index')


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer with your data
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df['processed_description'])  # Replace `your_data` with your actual data (like descriptions)

Saving the full combined model (Jaccard + Cosine Similarity)

In [None]:
import joblib

# Save the necessary components with joblib
joblib.dump({
    'tfidf_vectorizer': tfidf,
    'svd_transformer': svd,
    'symptom_scaler': scaler,
    'jaccard_similarity_function': jaccard_similarity,
    'cosine_similarity_matrix': cosine_sim_matrix,
    'numeric_cols': numeric_cols,
    'description_column': 'processed_description',
    'disease_info_dict': disease_info_dict,
    'df': df
}, 'symptom_disease_recommendation_model.joblib')

print("Model saved successfully as 'symptom_disease_recommendation_model.joblib'.")


Model saved successfully as 'symptom_disease_recommendation_model.joblib'.


Testing

Loading the model

In [11]:

# Load the saved model
model = joblib.load('symptom_disease_recommendation_model.joblib')

# Extract necessary components from the model
tfidf = model['tfidf_vectorizer']
svd = model['svd_transformer']
scaler = model['symptom_scaler']
jaccard_similarity = model['jaccard_similarity_function']
cosine_sim = model['cosine_similarity_matrix']
numeric_cols = model['numeric_cols']
description_column = model['description_column']
disease_info_dict = ['disease_info_dict'] 


Preparing User Input

In [12]:
def recommend_diseases(user_symptoms, user_description, df):
    """
    This function recommends diseases based on user symptoms (binary) and user description (text).
    """
    
    # Initialize an array of zeros with the same length as the number of symptoms
    user_symptoms_array = np.zeros(len(numeric_cols))  # Length is the number of symptoms
    
    # Map the user symptom indices to the array (set corresponding symptoms to 1)
    for symptom in user_symptoms:
        if symptom < len(user_symptoms_array):  # Ensure the index is valid
            user_symptoms_array[symptom] = 1  # Set the corresponding symptom as present (1)

    user_symptoms_array = user_symptoms_array.reshape(1, -1)  # Reshape for the scaler

    # Scale the user symptoms (use the same scaler used during training)
    user_symptoms_scaled = scaler.transform(user_symptoms_array)

    # Convert the user description to the same format as the processed descriptions
    user_description_processed = preprocess_text(user_description)
    user_description_vector = tfidf.transform([user_description_processed])
    
    # Reduce dimensions of user description vector (same as training)
    user_description_reduced = svd.transform(user_description_vector)

    # Calculate similarity for symptoms (using Jaccard similarity)
    symptom_similarities = []
    for disease_symptoms in df[numeric_cols].values:
        similarity = jaccard_similarity(user_symptoms_scaled, disease_symptoms)
        symptom_similarities.append(similarity)
    
    # Ensure that the description similarities are calculated properly:
    description_similarities = cosine_similarity(user_description_reduced, df_tfidf_svd).flatten()

    # Combine symptom and description similarities (simple average)
    combined_similarities = (np.array(symptom_similarities) + description_similarities) / 2

    # Get top 5 recommended diseases based on combined similarity
    top_recommendations = np.argsort(combined_similarities)[::-1][:5]
    recommended_diseases = df['disease'].iloc[top_recommendations].values

    return recommended_diseases


Test With Sample Input

In [13]:
# Assuming you already have the SVD-transformed descriptions for the dataset
df_tfidf_svd = svd.transform(tfidf.transform(df['description']))

# Now you can run the recommendation function
user_symptoms = [1, 3, 7]  # Symptoms with indices
user_description = "Headache, fatigue, blurred vision"  # Free text description

recommended_diseases = recommend_diseases(user_symptoms, user_description, df)
print("Recommended Diseases:", recommended_diseases)


Recommended Diseases: ['concussion' 'acute sinusitis' 'headache after lumbar puncture'
 'atrial fibrillation' 'heart block']


