In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from wordcloud import WordCloud

In [None]:
# Fetch the NLTK data used by the preprocessing cells below.
# - 'punkt'     : tokenizer models used by word_tokenize on older NLTK releases
# - 'punkt_tab' : tokenizer data that word_tokenize looks up on recent NLTK
#                 releases (3.9+); without it tokenization raises LookupError
# - 'stopwords' : word lists behind nltk.corpus.stopwords
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Load the dataset
#
# The CSV location can be overridden with the SYMPTOM2DISEASE_CSV environment
# variable so the notebook is runnable on machines other than the one it was
# written on; when the variable is unset the original path is used unchanged.
DATA_PATH = os.environ.get('SYMPTOM2DISEASE_CSV', r'C:\saanvi_code\AcesoCare\Symptom2Disease.csv')

data = pd.read_csv(DATA_PATH)

In [None]:
# Display the dataset as loaded from the CSV; the bare expression on the last
# line is what the notebook shows as this cell's output.

data

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV).
# Re-assigning instead of mutating in place, together with errors="ignore",
# keeps this cell safe to re-run: a second execution, or a CSV that lacks the
# column, no longer raises KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Show the DataFrame again to confirm the "Unnamed: 0" column is gone.
data

In [None]:
# Print a concise summary of the DataFrame (DataFrame.info writes its report
# directly to the cell output).

data.info()

In [None]:
# Count the missing values in each column (isna is pandas' alias-equivalent of isnull)

data.isna().sum()


In [None]:
# Display the column names that remain in the DataFrame; the cells below
# read the 'label' and 'text' columns.

data.columns

In [None]:
# DataFrame.value_counts() counts how often each distinct *row* occurs, i.e.
# each unique combination of all columns.
# NOTE(review): if the per-disease class distribution was intended here,
# data['label'].value_counts() would show that instead — confirm the intent.
data.value_counts()

In [None]:
# Pull the two columns used for modelling out of the 'data' DataFrame:
#   labels   <- data['label'] : the category associated with each text entry
#   symptoms <- data['text']  : the free-text entries that will be analysed

labels, symptoms = data['label'], data['text']

In [None]:
# Text Preprocessing
# English stopwords from NLTK, held in a set so that the membership test in
# preprocess_text is a hash lookup.

stop_words = {*stopwords.words('english')}


In [None]:
# Text Preprocessing Function

def preprocess_text(text):
    """Normalise one piece of text for vectorisation.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``; tokens
    that are not purely alphabetic or that appear in the module-level
    ``stop_words`` set are discarded.

    Args:
        text: The string to clean. It must support ``.lower()``.

    Returns:
        The surviving tokens joined by single spaces (an empty string when no
        token survives).
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Drop punctuation/number tokens as well as stopwords.
        if not token.isalpha() or token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
# Clean every entry of `symptoms` element-wise with preprocess_text

preprocessed_symptoms = symptoms.map(preprocess_text)

In [None]:
# Feature Extraction using TF-IDF
# max_features caps the vocabulary size; tune it to the size of the dataset.
# NOTE(review): the vectorizer is fitted on the full corpus here, i.e. before
# any train/test split, so its vocabulary and IDF weights see every row.

tfidf_vectorizer = TfidfVectorizer(max_features=1500)
tfidf_vectorizer.fit(preprocessed_symptoms)
tfidf_features = tfidf_vectorizer.transform(preprocessed_symptoms).toarray()

In [None]:
# Split data into training and testing sets
#
# The split is made on the preprocessed text rather than on the TF-IDF matrix
# that was fitted on the whole corpus. The vectorizer is then re-fitted on the
# training texts only and merely applied to the test texts, so the vocabulary
# and IDF weights no longer leak information from the test rows into the
# features the model is evaluated on. test_size and random_state are unchanged.
train_texts, test_texts, y_train, y_test = train_test_split(
    preprocessed_symptoms, labels, test_size=0.2, random_state=42
)

# fit_transform resets any previous fit, so re-running this cell is safe.
X_train = tfidf_vectorizer.fit_transform(train_texts).toarray()
X_test = tfidf_vectorizer.transform(test_texts).toarray()


KNN Model Training
A K-Nearest Neighbors (KNN) classifier with k = 5 is fitted on the TF-IDF feature vectors of the training set, using the corresponding disease labels as targets.


In [None]:

# KNN Model Training
# k (n_neighbors) is a tunable hyperparameter; adjust it to suit the dataset.

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every row of the held-out test features; `predictions`
# is compared against y_test in the evaluation cells that follow.

predictions = knn_classifier.predict(X_test)

In [None]:
# Model Evaluation
# Overall accuracy first, then the per-class precision/recall/F1 report.

accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix for Model Evaluation


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Fix one explicit class order and use it for BOTH the matrix and the tick
# labels. Without `labels=`, confusion_matrix orders classes by sorted label
# value, whereas labels.unique() returns them in order of first appearance, so
# the heatmap axes could be annotated with the wrong disease names.
class_names = sorted(labels.unique())

# Generate confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, predictions, labels=class_names)

# Plotting confusion matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
# Render the figure explicitly instead of leaving the Text repr of plt.title
# as the cell output.
plt.show()

In [None]:
import joblib

# Persist both fitted objects; inference needs the classifier together with
# the vectorizer that produced its input features.

# Trained KNN model
joblib.dump(value=knn_classifier, filename='knn_model.pkl')

# Fitted TF-IDF vectorizer
joblib.dump(value=tfidf_vectorizer, filename='tfidf_vectorizer.pkl')

In [None]:
# Example Usage: classify one free-text symptom description end to end.
symptom = "i see blue color veins on my thighs"

# Clean the input with the same preprocess_text function applied to the
# dataset, so the tokens match what the vectorizer was fitted on.
preprocessed_symptom = preprocess_text(symptom)

# Vectorise with the already-fitted vectorizer (transform only, no re-fitting);
# transform expects an iterable of documents, hence the one-element list.
symptom_tfidf = tfidf_vectorizer.transform([preprocessed_symptom])

# Predict the disease; predict returns one label per input row, so the single
# result is read from index 0.
predicted_disease = knn_classifier.predict(symptom_tfidf)
print(f'Predicted Disease: {predicted_disease[0]}')