In [44]:
#column renaming

import pandas as pd
df = pd.read_csv('dataset.csv')

df.rename(columns={df.columns[0]: 'Disease'}, inplace=True)
new_df = pd.concat([df['Disease'], df.iloc[:, 1:].apply(lambda x: ', '.join(x.dropna().astype(str)), axis=1)], axis=1)


new_df.rename(columns={new_df.columns[1]: 'Symptoms'}, inplace=True)

new_df.to_csv('updated_dataset.csv', index=False)
print(new_df.columns)
print(new_df)



Index(['Disease', 'Symptoms'], dtype='object')
                                      Disease  \
0                            Fungal infection   
1                            Fungal infection   
2                            Fungal infection   
3                            Fungal infection   
4                            Fungal infection   
...                                       ...   
4915  (vertigo) Paroymsal  Positional Vertigo   
4916                                     Acne   
4917                  Urinary tract infection   
4918                                Psoriasis   
4919                                 Impetigo   

                                               Symptoms  
0     itching,  skin_rash,  nodal_skin_eruptions,  d...  
1      skin_rash,  nodal_skin_eruptions,  dischromic...  
2     itching,  nodal_skin_eruptions,  dischromic _p...  
3             itching,  skin_rash,  dischromic _patches  
4            itching,  skin_rash,  nodal_skin_eruptions  
...             

In [2]:
#Remove duplicate rows and lower case conversion
import csv

def convert_to_lowercase(s):
    return s.lower()

# Read the CSV file and convert 'Disease' and 'Symptoms' columns to lowercase
with open('updated_dataset.csv', mode='r') as csv_file:
    csv_reader = csv.reader(csv_file)
    rows = list(csv_reader)

for row in rows:
    row[0] = convert_to_lowercase(row[0])
    row[1] = convert_to_lowercase(row[1])

# Remove duplicate rows based on the entire row
unique_rows = []
seen_rows = set()

for row in rows:
    # Convert the row to a tuple to make it hashable
    row_tuple = tuple(row)
    
    if row_tuple not in seen_rows:
        unique_rows.append(row)
        seen_rows.add(row_tuple)

# Write the updated dataset with lowercase conversion and duplicate rows removed
with open('processed_dataset.csv', mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(unique_rows)

print("CSV file 'processed_dataset.csv' with lowercase 'Disease' and 'Symptoms' columns and duplicate rows removed has been updated.")


CSV file 'processed_dataset.csv' with lowercase 'Disease' and 'Symptoms' columns and duplicate rows removed has been updated.


In [29]:
#Tokenisation based on delimiters like [,   _]
rows = []
with open('processed_dataset.csv', mode='r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        rows.append(row)



def tokenize_symptoms(symptoms):
    tokens = []
    symptom = ""
    for char in symptoms:
        if char == ',' or char == ' ':
            if symptom:
                tokens.append(symptom)
                symptom = ""
        else:
            symptom += char
    if symptom:
        tokens.append(symptom)
    return tokens


for row in rows:
    row[1] = tokenize_symptoms(row[1])

with open('tokenized_dataset.csv', mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(rows)

print("CSV file 'tokenized_dataset.csv' with 'Symptoms' column tokenized has been created.")


CSV file 'tokenized_dataset.csv' with 'Symptoms' column tokenized has been created.


In [30]:
#Stopword removal to preprocessed csv file
all_stopwords = set([
    "i", "me", "we", "our", "ours", "you", "your", "yours", "yourself", "yourselves",
 "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
 "theirs", "themselves", "this", "that", "these", "those", "am", "is", "are", "was",
 "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and",
 "but", "if", "or", "because", "as", "until", "while", "'of'", "at", "by", "for", "about", "against", "between",
 "into", "through", "during", "before", "after", "above", "below", "from", "up", "down", "in", "out", "on", "off",
 "over", "under", "again", "to","then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
 "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "own", "same", "so",
 "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now",
 "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn",
 "mightn", "mustn", "needn", "shouldn", "wasn", "weren", "won", "wouldn",
 "also", "always", "amazing", "anyone", "anything", "appreciate", "around", "back", 
 "best", "better", "big", "come", "could", "day", "dear", "definitely", "done" , "easy", "enjoy", "even",
 "ever","every", "everyone", "everything", "excellent", "feel", "felt", "first", 
 "free", "fun", "get", "getting", "good", "got", "great", "guy", "happy", "hard",
 "hear", "hello", "help", "hi", "hope", "idea", "keep", "kind", "know", "l",
 "large", "largely", "last", "later", "latest", "less", "lets", "long", "longer",
 "longest", "member", "members", "from",
 "mostly", "mr", "mrs", "n", "necessary", "need", "needed", "needing", "needs",
 "new", "newer", "newest" ,"nobody", "non", "none", "nowhere",
 "number", "numbers", "old", "older", "oldest", "open", "opened",
 "opening", "opens", "order", "ordered", "ordering", "orders", "others", "p",
 "parted", "parting", "parts", "per", "perhaps", "places", "point", "pointed", "pointing", "points", "possible",
 "present", "presented", "presenting", "presents", "problem", "problems", "puts", "q", "r", 
 "room", "rooms", "saw" , "second", "seconds", "seemed",
 "seeming", "sees", "several", "shall", "showed", "showing", "shows", "side", "sides", 
 "small", "smaller", "smallest", "somewhere", "state", "states", "taken","therefore",
 "thinks", "this", "though", "thoughts", "three", 
 "thus", "took", "toward", "turn", "turned", "turning", "turns", "u", 
 "upon", "used", "uses", "v", "w", "wanted", "wanting", "way",
 "ways", "wells", "went","whether", "whole",
 "whose", "with", "within", "work", "worked", "working", "works", "x", "years",
 "young", "younger", "youngest","z","zero","on","and","of",
])

import ast  

def remove_stopwords(words):
    cleaned_words = []
    for word in words:
        if word.lower() not in all_stopwords:
            cleaned_words.append(word)
    return cleaned_words


input_file = "tokenized_dataset.csv"

try:
    with open(input_file, "r", newline="", encoding="utf-8") as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader)

        output_file = "preprocessed.csv"
        with open(output_file, "w", newline="") as output_csv_file:
            writer = csv.writer(output_csv_file)
            writer.writerow(header)

            for row in reader:
                if len(row) >= 2:
                    disease = row[0]

                    # Parse the string representation of the list into a Python list
                    symptoms_str = row[1]
                    symptoms_list = ast.literal_eval(symptoms_str)

                    # Remove stopwords from the list of symptoms
                    cleaned_symptoms = remove_stopwords(symptoms_list)

                    
                    cleaned_symptoms_str = ", ".join(cleaned_symptoms)

                    writer.writerow([disease, cleaned_symptoms_str])
                else:
                    print("Skipping row with insufficient data:", row)

except FileNotFoundError:
    print("File not found:", input_file)
except Exception as e:
    print("An error occurred:", str(e))

print(f"CSV file '{output_file}' with cleaned Symptoms has been created.")




CSV file 'preprocessed.csv' with cleaned Symptoms has been created.


In [31]:
#Stemming suffix removal 
suffixes = ["ing",
             "ed",
            "er", 
            "ness", 
            "s", 
            "es", 
            "ious", 
            "ment", 
            "able", 
            "ible", 
            "ize", 
            "ise", 
            "tion", 
            "al", 
            "ish"]
def stem(word):
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word
input_file = "preprocessed.csv"
output_file = "stemmed_dataset.csv"
stemmed_data = []
with open(input_file, "r", newline="") as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        disease_name = row[0]
        symptoms = row[1].strip("[]").replace("'", "").split(", ")
        stemmed_disease_name = stem(disease_name)
        stemmed_symptoms = [stem(symptom) for symptom in symptoms]
        stemmed_data.append([stemmed_disease_name, stemmed_symptoms])


with open(output_file, "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["Stemmed Disease", "Stemmed Symptoms"])
    writer.writerows(stemmed_data)

print(f"Stemmed data saved to '{output_file}'")


Stemmed data saved to 'stemmed_dataset.csv'


In [32]:
#lemmatization 
lemmatization_dict = {
    "dischromic": "dischromia",
    "sneez": "sneeze",
    "urination": "urinate",
    "indigestion": "indigest",
    "concentration": "concentrate",
    "irritability": "irritate",
    "constipation": "constipate",
    "eruption": "erupt", 
    "fungal infec": "fungus",
    "continuou" :"continue",
    "irritation":"irritate",
    
}

input_file = "stemmed_dataset.csv"
output_file = "lemmatized_dataset.csv"


def lemmatize_word(word):
    return lemmatization_dict.get(word, word)  


with open(input_file, "r") as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader) 

    
    with open(output_file, "w", newline="") as output_csv_file:
        writer = csv.writer(output_csv_file)
        writer.writerow(header) 

        for row in reader:
           
            disease = lemmatize_word(row[0])

            
            symptoms_str = row[1].strip().replace("'", "") 
            symptoms = symptoms_str.split(", ")
            lemmatized_symptoms = [lemmatize_word(symptom) for symptom in symptoms]

            writer.writerow([disease, ", ".join(lemmatized_symptoms)])

print(f"Lemmatized data saved to '{output_file}'")

Lemmatized data saved to 'lemmatized_dataset.csv'


In [3]:
# import csv
# import re

# # Define a regular expression pattern for sentence segmentation.
# sentence_splitter = re.compile(r'(?<=[.!?])\s+')

# # Open the CSV file and create an output file for segmented data.
# with open('lemmatized_dataset.csv', 'r') as input_file, open('segmented_dataset.csv', 'w', newline='') as output_file:
#     reader = csv.reader(input_file)
#     writer = csv.writer(output_file)

#     for row in reader:
#         # Assuming the lemmatized text is in the first column of the CSV.
#         lemmatized_text = row[0]

#         # Split the lemmatized text into sentences using the regular expression pattern.
#         sentences = sentence_splitter.split(lemmatized_text)

#         # Write the segmented sentences to the output CSV file.
#         writer.writerow(sentences)



In [1]:
# import numpy as np
# import re

# class Word2Vec:
#     def __init__(self, text_data, vector_size=100, window=5, min_count=1, sg=0):
#         self.vector_size = vector_size
#         self.window = window
#         self.min_count = min_count
#         self.sg = sg

#         # Build a vocabulary of words.
#         self.vocabulary = {}
#         for text in text_data:
#             for word in text:
#                 if word not in self.vocabulary:
#                     self.vocabulary[word] = 0
#                 self.vocabulary[word] += 1

#         # Remove words that occur less than the minimum count.
#         for word in list(self.vocabulary.keys()):
#             if self.vocabulary[word] < self.min_count:
#                 del self.vocabulary[word]

#         # Initialize the word embedding matrix.
#         self.embeddings = np.random.randn(len(self.vocabulary), vector_size)

#         # Train the Word2Vec model.
#         self.train(text_data)

#     def train(self, text_data):
#         # Create a context window for each word in the text data.
#         context_windows = []
#         for text in text_data:
#             for i in range(len(text)):
#                 context_window = []
#                 for j in range(i - self.window, i + self.window + 1):
#                     if j < 0 or j >= len(text):
#                         continue
#                     if j != i:
#                         context_window.append(text[j])
#                 context_windows.append((text[i], context_window))

#         # Update the word embedding matrix for each context window.
#         for word, context_window in context_windows:
#             word_embedding = self.embeddings[self.vocabulary[word]]
#             context_embeddings = [self.embeddings[self.vocabulary[context_word]] for context_word in context_window]

#             # Skip-gram model.
#             if self.sg == 0:
#                 for context_embedding in context_embeddings:
#                     self.update_word_embedding(word_embedding, context_embedding)

#             # Continuous Bag-of-Words (CBOW) model.
#             else:
#                 context_embedding = np.mean(context_embeddings, axis=0)
#                 self.update_word_embedding(word_embedding, context_embedding)

#     def update_word_embedding(self, word_embedding, context_embedding):
#         # Calculate the error between the word embedding and the context embedding.
#         error = word_embedding - context_embedding

#         # Update the word embedding using the error.
#         word_embedding += error * 0.01

#     def get_word_embedding(self, word):
#         return self.embeddings[self.vocabulary[word]]

#     def most_similar(self, word, topn=10):
#         word_embedding = self.get_word_embedding(word)

#         # Calculate the cosine similarity between the word embedding and all other word embeddings.
#         similarities = np.dot(self.embeddings, word_embedding)

#         # Sort the similarities in descending order.
#         sorted_similarities = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

#         # Return the top n most similar words.
#         return [self.vocabulary[i] for i, _ in sorted_similarities[:topn]]



In [7]:

'''import numpy as np
import re

class Word2Vec:
    def __init__(self, text_data, vector_size=100, window=5, min_count=1, sg=0):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.sg = sg

        # Build a vocabulary of words.
        self.vocabulary = {}
        for text in text_data:
            for word in text:
                if word not in self.vocabulary:
                    self.vocabulary[word] = 0
                self.vocabulary[word] += 1

        # Remove words that occur less than the minimum count.
        for word in list(self.vocabulary.keys()):
            if self.vocabulary[word] < self.min_count:
                del self.vocabulary[word]

        # Initialize the word embedding matrix.
        self.embeddings = np.random.randn(len(self.vocabulary), vector_size)

        # Train the Word2Vec model.
        self.train(text_data)

    def train(self, text_data):
        # Create a context window for each word in the text data.
        context_windows = []
        for text in text_data:
            for i in range(len(text)):
                context_window = []
                for j in range(i - self.window, i + self.window + 1):
                    if j < 0 or j >= len(text):
                        continue
                    if j != i:
                        context_window.append(text[j])
                context_windows.append((text[i], context_window))

        # Update the word embedding matrix for each context window.
        for word, context_window in context_windows:
            word_embedding = self.embeddings[self.vocabulary[word]]
            context_embeddings = [self.embeddings[self.vocabulary[context_word]] for context_word in context_window]

            # Skip-gram model.
            if self.sg == 0:
                for context_embedding in context_embeddings:
                    self.update_word_embedding(word_embedding, context_embedding)

            # Continuous Bag-of-Words (CBOW) model.
            else:
                context_embedding = np.mean(context_embeddings, axis=0)
                self.update_word_embedding(word_embedding, context_embedding)

    def update_word_embedding(self, word_embedding, context_embedding):
        # Calculate the error between the word embedding and the context embedding.
        error = word_embedding - context_embedding

        # Update the word embedding using the error.
        word_embedding += error * 0.01

    def get_word_embedding(self, word):
        return self.embeddings[self.vocabulary[word]]

    def most_similar(self, word, topn=10):
        word_embedding = self.get_word_embedding(word)

        # Calculate the cosine similarity between the word embedding and all other word embeddings.
        similarities = np.dot(self.embeddings, word_embedding)

        # Sort the similarities in descending order.
        sorted_similarities = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

        # Return the top n most similar words.
        return [self.vocabulary[i] for i, _ in sorted_similarities[:topn]]


# Get user input for text_data
user_input = input("Enter your query: ")
text_data = [user_input.split()]  # Convert the input into a list of words

# Create an instance of the Word2Vec class
word2vec_model = Word2Vec(text_data)

# Save the word embeddings to a file
np.savetxt('word_embeddings.txt', word2vec_model.embeddings, delimiter=',')

# Test the get_word_embedding function
word_embedding = word2vec_model.get_word_embedding("word2")
print("Word Embedding for 'word2':", word_embedding)

# Test the most_similar function
similar_words = word2vec_model.most_similar("word1")
print("Most similar words to 'word1':", similar_words)'''


In [43]:
import re
from collections import Counter

def preprocess_text(text):
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    return text

def create_vocabulary(text_data):
    words = [word for text in text_data for word in preprocess_text(text).split()]
    return set(words)

def vectorize_text(text, vocabulary):
    vector = Counter(preprocess_text(text).split())
    return [vector[word] for word in vocabulary]

def read_csv(file_path):
    dataset = []
    with open(file_path, 'r') as csvfile:
        for line in csvfile:
            entry = line.strip().split(',')
            dataset.append(entry)
    return dataset



def write_csv(file_path, data, headers):
    with open(file_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(headers)
        for entry in data:
            csv_writer.writerow(entry)

# Read data from CSV file
csv_file_path = 'tokenized_dataset.csv'  # Replace with the path to your CSV file
dataset = read_csv(csv_file_path)

# Extract diseases and symptoms separately
diseases = [entry[0] for entry in dataset]
symptoms = [" ".join(entry[1:]) for entry in dataset]

# Create vocabulary for diseases and symptoms
disease_vocabulary = create_vocabulary(diseases)
symptom_vocabulary = create_vocabulary(symptoms)

# Vectorize diseases and symptoms
vectorized_diseases = [vectorize_text(disease, disease_vocabulary) for disease in diseases]
vectorized_symptoms = [vectorize_text(symptom, symptom_vocabulary) for symptom in symptoms]

# Combine vectors and write to output CSV file
output_headers = ["Disease"] + list(symptom_vocabulary)
output_dataset = [[disease] + symptom_vector for disease, symptom_vector in zip(diseases, vectorized_symptoms)]
output_file_path = 'vectorized.csv'  # Replace with the desired output file path
write_csv(output_file_path, output_dataset, output_headers)

print(f"Vectorized dataset has been written to: {output_file_path}")


Vectorized dataset has been written to: vectorized.csv


In [22]:
# vectorizing user input

import re
from collections import Counter

def preprocess_text(text):
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    return text

def vectorize_text(text, vocabulary):
    vector = Counter(preprocess_text(text).split())
    return [vector[word] for word in vocabulary]

# Assuming you already have disease_vocabulary and symptom_vocabulary from previous code

# Take user input
user_input = "i am suffering from fever,cold,running nose and headache"

# Vectorize user input
vectorized_user_input = vectorize_text(user_input, symptom_vocabulary)

# Print the vectorized user input
print("Vectorized User Input:", vectorized_user_input)


Vectorized User Input: [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [2]:
pip install gensim

^C
Note: you may need to restart the kernel to use updated packages.


In [34]:
'''import csv

# Load your preprocessed and lemmatized text data
with open('preprocessed.csv', 'r') as input_file:
    reader = csv.reader(input_file)
    text_data = [row[0] for row in reader]

# Define a function to generate n-grams
def generate_ngrams(text, n):
    words = text.split()
    ngrams = []
    for i in range(len(words) - n + 1):
        ngram = ' '.join(words[i:i + n])
        ngrams.append(ngram)
    return ngrams

# Generate and store n-grams for your text data
n = 2  # You can choose the value of 'n' for your desired n-grams (e.g., 2 for bigrams)
ngram_data = []
for text in text_data:
    ngrams = generate_ngrams(text, n)
    ngram_data.append(ngrams)

# At this point, 'ngram_data' contains the n-grams for each text in your dataset.

# Now, you can use these n-grams for further analysis, such as creating phrase embeddings.'''

import csv
import re

def preprocess_text(text):
    # Remove non-alphanumeric characters and convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())
    return text

def generate_ngrams(tokens, n):
    ngrams_list = []
    for i in range(len(tokens) - n + 1):
        ngrams_list.append(tokens[i:i+n])
    return ngrams_list

def process_csv(input_file, output_file, n):
    with open(input_file, 'r') as csvfile:
        csv_reader = csv.reader(csvfile)
        header = next(csv_reader)

        # Find the column containing text data dynamically
        text_column_index = None
        for index, col in enumerate(header):
            if 'description' in col.lower() or 'symptoms' in col.lower():
                text_column_index = index
                break

        if text_column_index is None:
            raise ValueError("Text column not found in CSV file.")

        # Read and process the CSV file
        data = []
        for row in csv_reader:
            text = row[text_column_index]
            tokens = preprocess_text(text).split()
            ngrams_list = generate_ngrams(tokens, n)
            data.append({'text': text, 'ngrams': ngrams_list})

    # Write the results to an output CSV file
    with open(output_file, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['text', f'ngrams_{n}'])

        for entry in data:
            csv_writer.writerow([entry['text'], entry['ngrams']])


# Example usage
input_csv_file = 'tokenized_dataset.csv'  # Replace with the path to your CSV file
output_csv_file = 'output_ngrams.csv'  # Replace with the desired output file path
n_value = 2  # Change this to the desired n-gram value

process_csv(input_csv_file, output_csv_file, n_value)


In [61]:
import csv
import collections

class NaiveBayesClassifier:

    def __init__(self, training_data):
        self.class_probabilities = collections.defaultdict(int)
        self.feature_probabilities = collections.defaultdict(lambda: collections.defaultdict(int))

        for row in training_data:
            intent = row[0]  # Assuming the disease is in the first column
            self.class_probabilities[intent] += 1

            for i in range(1, len(row)):
                word = row[i]
                self.feature_probabilities[intent][word] += 1

    @classmethod
    def from_csv(cls, csv_file):
        training_data = []
        with open(csv_file, 'r') as csvfile:
            csv_reader = csv.reader(csvfile)
            for row in csv_reader:
                training_data.append(row)
        return cls(training_data)

    def classify_symptoms(self, symptoms):
        probabilities = {}

        for intent in self.class_probabilities:
            # Calculate the prior probability of the intent
            prior_probability = self.class_probabilities[intent] / len(self.class_probabilities)

            # Calculate the likelihood of the symptoms given the intent
            likelihood = 1.0
            for i in range(1, len(symptoms) + 1):
                word = symptoms[i - 1]
                word_count = self.feature_probabilities[intent].get(word, 0) + 1
                total_count = sum(self.feature_probabilities[intent].values()) + len(self.feature_probabilities)
                word_probability = word_count / total_count
                likelihood *= word_probability

            # Multiply the prior and likelihood to get the overall probability of the intent
            probability = prior_probability * likelihood

            probabilities[intent] = probability

        # Return the intent with the highest probability
        predicted_intent = max(probabilities, key=probabilities.get)
        return predicted_intent

# Example usage
csv_file_path = 'tokenized_dataset.csv'  # Replace with the path to your CSV file
classifier = NaiveBayesClassifier.from_csv(csv_file_path)
symptoms = ["itching", "skin_rash"]  # Replace with the symptoms you want to classify
predicted_disease = classifier.classify_symptoms(symptoms)

print(f"Symptoms: {symptoms}")
print(f"Predicted Disease: {predicted_disease}")


Symptoms: ['itching', 'skin_rash']
Predicted Disease: migraine


In [1]:
import re
import csv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the medical dataset
# Replace 'processed_dataset.csv' with the path to your dataset
df = pd.read_csv('./data/tokenized_dataset.csv')

# Text preprocessing
def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", str(text).lower())
    return text

# Apply preprocessing to the symptoms column
df['[\'symptoms\']']= df['[\'symptoms\']'].apply(preprocess_text)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['[\'symptoms\']'])  # Change to 'symptoms'

# Function to load medical entity rules from a CSV file
def load_rules_from_csv(csv_file):
    rules = {}
    with open(csv_file, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader)  # Skip the header row
        for row in reader:
            entity_type = row[0]
            pattern = ','.join(row[1:])
            rules[entity_type] = re.compile(pattern)
    return rules

# Function to extract medical entities from text data using a rule-based approach
def extract_medical_entities(text_data, rules):
    # Convert the text data to a string
    text_data = str(text_data)

    # Initialize medical_entities list
    medical_entities = []

    # Extract medical entities from the text data
    for rule_name, rule in rules.items():
        for match in rule.finditer(text_data):
            medical_entities.append(f"{rule_name}: {match.group()}")

    # Return the medical entities
    return medical_entities

# Load entity rules from a CSV file
entity_rules = load_rules_from_csv('data/vectorized_dataset.csv')  # Replace with your CSV file path

# User input
user_input = "i am suffering from fever, cold, running nose, and headache"

# Vectorize user input using the same vectorizer used for symptoms
vectorized_user_input = tfidf_vectorizer.transform([preprocess_text(user_input)])

# Calculate cosine similarity between the user input and each row in the DataFrame
cosine_similarities = cosine_similarity(vectorized_user_input, tfidf_matrix)
most_similar_index = cosine_similarities[0].argsort()[-1]
most_similar_row = df.iloc[most_similar_index]['[\'symptoms\']']

# Extract medical entities from the most similar row
most_similar_row_entities = extract_medical_entities(most_similar_row, entity_rules)

# Print the medical entities
print("Medical Entities in Most Similar Row:")
print(most_similar_row_entities)

Medical Entities in Most Similar Row:
[]


In [75]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

class DialogueState:
    def __init__(self):
        self.current_goal = None
        self.identified_entities = {}
        self.previous_utterances = []

    def set_current_goal(self, goal):
        self.current_goal = goal

    def add_identified_entity(self, entity_type, entity_value):
        self.identified_entities[entity_type] = entity_value

    def add_previous_utterance(self, utterance):
        self.previous_utterances.append(utterance)

    def get_current_goal(self):
        return self.current_goal

    def get_identified_entities(self):
        return self.identified_entities

    def get_previous_utterances(self):
        return self.previous_utterances

# Load the vectorized dataset
df = pd.read_csv('vectorized.csv')

# Text preprocessing
def preprocess_text(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", str(text).lower())
    return text

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['[\'symptoms\']'])

def identify_disease(symptoms):
    # Vectorize user input using the same vectorizer used for symptoms
    vectorized_user_input = tfidf_vectorizer.transform([preprocess_text(symptoms)])

    # Calculate cosine similarity between the user input and each row in the DataFrame
    cosine_similarities = cosine_similarity(vectorized_user_input, tfidf_matrix)
    most_similar_index = cosine_similarities[0].argsort()[-1]
    identified_disease = df.iloc[most_similar_index]['disease']

    return identified_disease

# Example usage
dialogue_state = DialogueState()

# User input: Symptoms
user_symptoms = "fever, cough, headache"

# Identify disease
identified_disease = identify_disease(user_symptoms)

# Set the user's current goal
dialogue_state.set_current_goal("identify_disease")

# Add the identified disease to the dialogue state
dialogue_state.add_identified_entity("disease", identified_disease)

# Generate a response based on the user's goal and the identified disease
response = f"The identified disease based on symptoms is: {identified_disease}"

# Add the response to the dialogue state
dialogue_state.add_previous_utterance(response)

# Print the response to the user
print(response)

# Now, you can use the identified disease to look up medication information in your list
# For simplicity, let's assume you have a dictionary mapping diseases to medications
medications_for_diseases = {
    "fungal infection": "Antifungal medication",
    "Acne": "Topical creams",
    # Add more mappings as needed
}

# Get the identified disease
identified_disease = dialogue_state.get_identified_entities()["disease"]

# Lookup medication for the disease
medication_for_disease = medications_for_diseases.get(identified_disease, "Medication information not available")

# Generate a response with medication information
medication_response = f"The medication for {identified_disease} is: {medication_for_disease}"

# Add the medication response to the dialogue state
dialogue_state.add_previous_utterance(medication_response)

# Print the medication response to the user
print(medication_response)


AttributeError: 'int' object has no attribute 'lower'

In [1]:
import re
import spacy  # Make sure to install the 'spacy' library

class DialogueState:
    def __init__(self):
        self.current_goal = None
        self.identified_entities = {}
        self.previous_utterances = []

    def set_current_goal(self, goal):
        self.current_goal = goal

    def add_identified_entity(self, entity_type, entity_value):
        self.identified_entities[entity_type] = entity_value

    def add_previous_utterance(self, utterance):
        self.previous_utterances.append(utterance)

    def get_current_goal(self):
        return self.current_goal

    def get_identified_entities(self):
        return self.identified_entities

    def get_previous_utterances(self):
        return self.previous_utterances

class DialogueFSM:
    def __init__(self):
        # Define the states of the FSM.
        self.states = {
            "start": StartState(),
            "transition_to_next_state": TransitionToNextState(),
            "identify_disease": IdentifyDiseaseState(),
            "provide_medication": ProvideMedicationState(),
            "end": EndState()
        }

        # Set the current state.
        self.current_state = "start"
        self.dialogue_state = DialogueState()

    def transition(self, input_utterance):
        # Get the next state based on the current state and the input utterance.
        next_state = self.states[self.current_state].get_next_state(input_utterance)

        # Set the current state to the next state.
        self.current_state = next_state

        # Execute actions for the current state.
        self.states[self.current_state].execute_actions(input_utterance, self.dialogue_state)


class StartState:
    def get_next_state(self, input_utterance):
        # Always transition to an intermediate state before moving to the actual state.
        return "transition_to_next_state"

    def execute_actions(self, input_utterance, dialogue_state):
        # Greet the user and provide instructions on how to use the system.
        print("Welcome to the medical chatbot. I can help you find information about symptoms, diseases, and medications.")
        print("To get started, please tell me about your symptoms.")

class TransitionToNextState:
    def get_next_state(self, input_utterance):
        # Add logic here to determine the next state based on the input.
        # For example, if "symptom" is in the input, transition to "identify_disease".
        if "symptom" in input_utterance:
            return "identify_disease"
        # If the input doesn't contain symptoms and it's not a specific command, transition to "start".
        elif not any(cmd in input_utterance.lower() for cmd in ["medication", "disease"]):
            return "start"
        # Add more conditions as needed.
        else:
            return "transition_to_next_state"

    def execute_actions(self, input_utterance, dialogue_state):
        # No actions needed for this state.
        pass


class IdentifyDiseaseState:
    def get_next_state(self, input_utterance):
        # After identifying the disease, transition to the provide_medication state.
        return "provide_medication"

    def execute_actions(self, input_utterance, dialogue_state):
        # Extract symptoms from the user's input
        symptoms = extract_symptoms(input_utterance)

        # Identify the disease based on symptoms (you might use a more advanced model for this).
        identified_disease = identify_disease(symptoms)

        # Add the identified disease and symptoms to the dialogue state.
        dialogue_state.add_identified_entity("disease", identified_disease)
        dialogue_state.add_identified_entity("symptoms", symptoms)

        # Generate a response based on the identified disease.
        response = f"I believe you may have {identified_disease} based on the symptoms: {', '.join(symptoms)}. Let me provide information about the medication."

        # Add the response to the dialogue state.
        dialogue_state.add_previous_utterance(response)

        # Print the response to the user.
        print(response)

class ProvideMedicationState:
    def get_next_state(self, input_utterance):
        # Stay in the provide_medication state.
        return "provide_medication"

    def execute_actions(self, input_utterance, dialogue_state):
        # Get the identified disease from the dialogue state.
        identified_disease = dialogue_state.get_identified_entities().get("disease", "unknown disease")

        # Lookup medication for the disease (you might have a more sophisticated lookup mechanism).
        medication_for_disease = lookup_medication(identified_disease)

        # Generate a response with medication information.
        medication_response = f"The medication for {identified_disease} is: {medication_for_disease}"

        # Add the medication response to the dialogue state.
        dialogue_state.add_previous_utterance(medication_response)

        # Print the medication response to the user.
        print(medication_response)

class EndState:
    def get_next_state(self, input_utterance):
        # Stay in the end state.
        return "end"

    def execute_actions(self, input_utterance, dialogue_state):
        # Provide a closing message.
        print("Thank you for using the medical chatbot. If you have more questions, feel free to ask.")

def identify_disease(symptoms):
    # Placeholder function for identifying disease based on symptoms.
    # In a real application, you might use a machine learning model or a database for this.
    return "ExampleDisease"

def lookup_medication(disease):
    # Placeholder function for looking up medication based on the disease.
    # In a real application, you might have a more sophisticated medication database.
    return "ExampleMedication"

# Add a new function to extract symptoms from user input using spaCy
def extract_symptoms(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    symptoms = [ent.text for ent in doc.ents if ent.label_ == "SYMPTOM"]
    return symptoms

# Create a DialogueFSM object.
dialog_fsm = DialogueFSM()

# Process the user's input utterance.
user_utterance_1 = "I have a headache and fever."
dialog_fsm.transition(user_utterance_1)

# Process the user's next input utterance.
user_utterance_2 = "What could be the disease?"
dialog_fsm.transition(user_utterance_2)

# Process the user's next input utterance.
user_utterance_3 = "Tell me about the medication for this disease."
dialog_fsm.transition(user_utterance_3)


In [3]:
pip install Jinja2==2.11.3


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [81]:
class IdentifyDiseaseState:
    def get_next_state(self, input_utterance):
        # After identifying the disease, transition to the provide_medication state.
        return "provide_medication"

    def execute_actions(self, input_utterance, dialogue_state):
        # Identify the disease based on symptoms (you might use a more advanced model for this).
        identified_disease = identify_disease(input_utterance)

        # Add the identified disease to the dialogue state.
        dialogue_state.add_identified_entity("disease", identified_disease)

        # Generate a response based on the identified disease.
        response = f"I believe you may have {identified_disease}. Let me provide information about the medication."

        # Add the response to the dialogue state.
        dialogue_state.add_previous_utterance(response)

        # Print the response to the user.
        print(response)


class ProvideMedicationState:
    def get_next_state(self, input_utterance):
        # Stay in the provide_medication state.
        return "provide_medication"

    def execute_actions(self, input_utterance, dialogue_state):
        # Get the identified disease from the dialogue state.
        identified_disease = dialogue_state.get_identified_entities().get("disease", "unknown disease")

        # Lookup medication for the disease (you might have a more sophisticated lookup mechanism).
        medication_for_disease = lookup_medication(identified_disease)

        # Generate a response with medication information.
        medication_response = f"The medication for {identified_disease} is: {medication_for_disease}"

        # Add the medication response to the dialogue state.
        dialogue_state.add_previous_utterance(medication_response)

        # Print the medication response to the user.
        print(medication_response)


class ProvideSymptomInformationState:
    def get_next_state(self, input_utterance):
        # Stay in the provide_symptom_information state.
        return "provide_symptom_information"

    def execute_actions(self, input_utterance, dialogue_state):
        # Get information about the symptoms (you might have a more sophisticated lookup mechanism).
        symptom_information = lookup_symptom_information(input_utterance)

        # Generate a response with symptom information.
        symptom_response = f"Here is information about the symptoms: {symptom_information}"

        # Add the symptom response to the dialogue state.
        dialogue_state.add_previous_utterance(symptom_response)

        # Print the symptom response to the user.
        print(symptom_response)


# Placeholder function for looking up symptom information.
def lookup_symptom_information(symptoms):
    # In a real application, you might have a more sophisticated symptom information database.
    return "Example symptom information"

# Continue the code with the existing classes and functions

# Example usage
dialog_fsm = DialogueFSM()

# Process the user's input utterance.
user_utterance_1 = "I have a headache and fever."
dialog_fsm.transition(user_utterance_1)

# Process the user's next input utterance.
user_utterance_2 = "What could be the disease?"
dialog_fsm.transition(user_utterance_2)

# Process the user's next input utterance.
user_utterance_3 = "Tell me about the medication for this disease."
dialog_fsm.transition(user_utterance_3)

# Process the user's next input utterance.
user_utterance_4 = "What are the symptoms of the disease?"
dialog_fsm.transition(user_utterance_4)

# Note: You can continue to extend the chatbot by adding more states and functionalities as needed.


Welcome to the medical chatbot. I can help you find information about symptoms, diseases, and medications.
To get started, please tell me about your symptoms.
I believe you may have ExampleDisease. Let me provide information about the medication.


In [80]:
def generate_nlm_response(basic_response):

  """Generates a natural language response using a neural language model.

  Args:
    basic_response: The basic response generated by the template.

  Returns:
    A refined response generated by the neural language model.
  """

  # TODO: Implement this function to generate a response using a neural language model.

  return basic_response

# Get the basic response from the template.
basic_response = "The symptoms of a heart attack can vary from person to person, but they often include chest pain, shortness of breath, nausea, and vomiting."

# Use the NLM to refine the response.
nlm_response = generate_nlm_response(basic_response)

# Combine the basic and NLM responses.
final_response = "{} If you are experiencing any of these symptoms, it is important to seek medical attention immediately.".format(nlm_response)

# Print the response to the user.
print(final_response)


The symptoms of a heart attack can vary from person to person, but they often include chest pain, shortness of breath, nausea, and vomiting. If you are experiencing any of these symptoms, it is important to seek medical attention immediately.


In [72]:
def calculate_intent_classification_accuracy(true_intents, predicted_intents):

  """Calculates the intent classification accuracy.

  Args:
    true_intents: A list of the true intents.
    predicted_intents: A list of the predicted intents.

  Returns:
    The intent classification accuracy.
  """

  correct_predictions = 0
  for i in range(len(true_intents)):
    if true_intents[i] == predicted_intents[i]:
      correct_predictions += 1

  return correct_predictions / len(true_intents)

def calculate_entity_recognition_f1_score(true_entities, predicted_entities):

  """Calculates the entity recognition F1-score.

  Args:
    true_entities: A list of the true entities.
    predicted_entities: A list of the predicted entities.

  Returns:
    The entity recognition F1-score.
  """

  from sklearn.metrics import f1_score

  return f1_score(true_entities, predicted_entities, average="micro")

def calculate_question_answering_accuracy(true_answers, predicted_answers):

  """Calculates the question answering accuracy.

  Args:
    true_answers: A list of the true answers.
    predicted_answers: A list of the predicted answers.

  Returns:
    The question answering accuracy.
  """

  correct_predictions = 0
  for i in range(len(true_answers)):
    if true_answers[i] == predicted_answers[i]:
      correct_predictions += 1

  return correct_predictions / len(true_answers)
