<h3>Training a Disease Prediction Model Using Logistic Regression</h3>

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Train a disease classifier from per-row symptom columns:
#   CSV -> cleaned frame -> one text document per row -> TF-IDF -> Logistic Regression.
# Leaves `model`, `vectorizer` and `label_encoder` in the namespace for the cells below.

# The CSV is expected to hold one 'Disease' column plus Symptom_1 ... Symptom_17.
SYMPTOM_COLUMNS = [f'Symptom_{i}' for i in range(1, 18)]
# Placeholder written into empty symptom slots so every cell is a joinable string.
MISSING_SYMPTOM = 'No symptom'

# 1. Load CSV data
print("Loading dataset...")
df = pd.read_csv('dataset.csv')
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns.")

# 2. Handle missing values
print("Handling missing values...")
# Rows without a disease label are useless for supervised training, so drop them
# BEFORE filling anything. Filling the whole frame would turn a missing label into
# a bogus 'No symptom' class and make any later dropna() a no-op.
df = df.dropna(subset=['Disease']).reset_index(drop=True)
# Only the symptom slots get the placeholder; the label column is left untouched.
df[SYMPTOM_COLUMNS] = df[SYMPTOM_COLUMNS].fillna(MISSING_SYMPTOM)

# 3. Combine symptom columns into a single text column
print("Combining symptom columns into a single text column...")
# One space-separated "document" per row; this is what the TF-IDF step tokenizes.
df['Symptoms'] = df[SYMPTOM_COLUMNS].apply(' '.join, axis=1)

# 4. Encode disease names into numeric labels
print("Encoding disease labels...")
label_encoder = LabelEncoder()
df['Disease_Label'] = label_encoder.fit_transform(df['Disease'])

# Row count after cleaning (unlabeled rows were already removed in step 2).
print(f"Dataset after cleaning: {df.shape[0]} rows.")

# Define features (X) and labels (y)
X = df['Symptoms']  # Text data (symptoms)
y = df['Disease_Label']  # Labels (numeric encoded diseases)

# 5. Split the data into training and testing sets
print("Splitting data into training and testing sets...")
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {len(X_train)} samples, Test set size: {len(X_test)} samples.")

# The split keeps the shuffled original index; renumber from 0 so positional and
# label-based access agree downstream.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# 6. TF-IDF Vectorization
print("Vectorizing symptoms using TF-IDF...")
# Fit the vocabulary on the training split only, then reuse it for the test split,
# so no test-set vocabulary leaks into training.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 7. Train Logistic Regression model
print("Training Logistic Regression model...")
model = LogisticRegression(max_iter=1000)  # Increase max_iter if convergence issues occur
model.fit(X_train_tfidf, y_train)

# 8. Evaluate model on test data
print("Evaluating model on test data...")
y_pred = model.predict(X_test_tfidf)

# Calculate accuracy
# NOTE(review): a perfect score here would be suspicious - possibly identical rows
# landing in both train and test. Check df.duplicated().sum() before trusting it.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")




Loading dataset...
Dataset loaded with 4920 rows and 18 columns.
Handling missing values...
Combining symptom columns into a single text column...
Encoding disease labels...
Dataset after cleaning: 4920 rows.
Splitting data into training and testing sets...
Training set size: 3936 samples, Test set size: 984 samples.
Vectorizing symptoms using TF-IDF...
Training Logistic Regression model...
Evaluating model on test data...
Test Accuracy: 1.0000


<h3>Testing the Model</h3>

In [21]:
# Quick sanity check: classify one hand-written, comma-separated symptom string
# using the in-memory vectorizer, classifier and label encoder from the training cell.
sample_input = "stomach_pain, burning_micturition, spotting_ urination"

# transform() expects an iterable of documents, hence the one-element list.
sample_tfidf = vectorizer.transform([sample_input])
predicted_label = model.predict(sample_tfidf)

# Map the numeric class id back to its disease name.
predicted_disease = label_encoder.inverse_transform(predicted_label)
print(f"Predicted Disease: {predicted_disease[0]}")

Predicted Disease: Drug Reaction


<h3>Saving the Model</h3>

In [22]:
import joblib

# Persist every fitted object needed at inference time. All three must be saved
# together: the classifier only understands features produced by this exact
# vectorizer, and its numeric outputs only decode through this label encoder.
# (If a single preprocessing pipeline is introduced later, dump that instead.)
artifacts = (
    ("Saving Logistic Regression model...", model, 'logistic_regression_model.pkl'),
    ("Saving TF-IDF vectorizer...", vectorizer, 'tfidf_vectorizer.pkl'),
    ("Saving LabelEncoder...", label_encoder, 'label_encoder.pkl'),
)

for message, fitted_object, filename in artifacts:
    print(message)
    joblib.dump(fitted_object, filename)

print("Model, vectorizer, and label encoder saved successfully.")


Saving Logistic Regression model...
Saving TF-IDF vectorizer...
Saving LabelEncoder...
Model, vectorizer, and label encoder saved successfully.


<h3>Testing the Saved Model</h3>

In [25]:
import joblib

# Reload the three persisted artifacts (classifier, TF-IDF vectorizer, label encoder)
# in that order. Note that `label_encoder` is rebound to the copy read from disk.
# joblib.load unpickles its input, so only point it at files you created yourself.
loaded_model, loaded_vectorizer, label_encoder = (
    joblib.load(artifact_path)
    for artifact_path in (
        'logistic_regression_model.pkl',
        'tfidf_vectorizer.pkl',
        'label_encoder.pkl',
    )
)

# Example user input, vectorized as a single-document batch.
sample_input = "sunken_eyes"
sample_tfidf = loaded_vectorizer.transform([sample_input])

# Predict the numeric class, then decode it back to a disease name.
predicted_label = loaded_model.predict(sample_tfidf)
predicted_disease = label_encoder.inverse_transform(predicted_label)

# Exactly one sample was passed in, so the first element is the answer.
predicted_disease_string = predicted_disease[0]

print(f"Predicted Disease: {predicted_disease_string}")



Predicted Disease: Gastroenteritis
