# Installations

In [None]:
!sudo apt-get install portaudio19-dev #Often required for PyAudio.
!pip install SpeechRecognition pydub PyAudio

# Speech Recognition

### Listens to the user, transcribes the speech, and stores the recognized text in a file.

In [None]:
import speech_recognition as sr
import os
import sys

def beep_begin():
    """Play the cue that marks the start of listening.

    On Windows (``sys.platform == "win32"``) a 1000 Hz tone is played for
    500 ms through ``winsound.Beep``. On any other platform the BEL
    character is echoed through ``os.system``.
    """
    if sys.platform != "win32":
        # Linux/Mac: ask the shell to echo the BEL character (terminal beep).
        os.system('echo -e "\a"')
        return

    import winsound  # Windows-only module, hence the lazy import

    tone_hz = 1000  # pitch of the start cue
    length_ms = 500  # 0.5 second
    winsound.Beep(tone_hz, length_ms)

def beep_end():
    """Play the cue that marks the end of listening.

    On Windows (``sys.platform == "win32"``) a 4000 Hz tone is played for
    500 ms through ``winsound.Beep``. On any other platform the BEL
    character is echoed through ``os.system``.
    """
    if sys.platform != "win32":
        # Linux/Mac: ask the shell to echo the BEL character (terminal beep).
        os.system('echo -e "\a"')
        return

    import winsound  # Windows-only module, hence the lazy import

    tone_hz = 4000  # higher pitch than the start cue
    length_ms = 500  # 0.5 second
    winsound.Beep(tone_hz, length_ms)

def save_text_to_file(text, filename="transcription.txt"):
    """Append ``text`` plus a trailing newline to ``filename``.

    The file is opened in append mode, so it is created when missing and
    any existing content is kept.
    """
    with open(filename, mode="a") as out:
        line = text + "\n"
        out.write(line)

def get_speech_input():
    """Record one utterance from the microphone and transcribe it.

    The energy threshold is calibrated against ambient noise first; only
    then are the start beep and the "Listening..." prompt emitted, so the
    user is not invited to speak while calibration is still running. After
    recording, the end beep is played and the audio is passed to
    ``recognizer.recognize_google``. A successful transcription is printed
    and appended to the default file of ``save_text_to_file``.

    Returns:
        The recognized text, or ``None`` when the audio could not be
        understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()
    mic = sr.Microphone()

    with mic as source:
        # Calibrate before prompting: this step samples the microphone and is
        # meant to run on audio that contains no speech.
        recognizer.adjust_for_ambient_noise(source)

        beep_begin()  # Beep before listening
        print("Listening...")
        audio = recognizer.listen(source)

    beep_end()  # Beep after listening

    try:
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Could not understand the audio.")
        return None
    except sr.RequestError as e:
        print(f"Error: {e}")
        return None

    print(f"You said: {text}")
    save_text_to_file(text)  # The text is stored in file named transcription.txt
    return text

# Record and transcribe one utterance; a successful transcription is appended to transcription.txt.
get_speech_input()

# NLP Tasks

Reading the file

In [110]:
# Read everything that has been transcribed so far.
try:
    with open("transcription.txt", "r") as file:
        text = file.read()
except FileNotFoundError:
    # No transcription has been saved yet (the file is only created when one
    # is written), so carry on with empty text instead of crashing.
    text = ""
    print("transcription.txt not found; continuing with empty text.")
else:
    # Empty the file after reading; opening it in "w" mode truncates it.
    with open("transcription.txt", "w"):
        pass

Build the list of known symptoms from the dataset so that multi-word symptoms can be matched as single units.

In [None]:
import pandas as pd

df = pd.read_csv('dataset.csv')

# Select all columns except the first one (assuming 'Disease' is the first column),
# i.e. the columns that hold symptoms.
symptom_columns = df.columns[1:]

raw_values = pd.concat([df[col] for col in symptom_columns]).unique()

# Keep only string cells (missing values are not strings), turn underscores
# into spaces and trim surrounding whitespace.
normalized = (
    value.replace('_', ' ').strip()
    for value in raw_values
    if isinstance(value, str)
)

# De-duplicate AFTER normalizing, keeping first-seen order: values that differ
# only by surrounding whitespace collapse into one entry. Empty strings are
# dropped because an empty alternative in a regex matches everywhere.
symptoms = list(dict.fromkeys(name for name in normalized if name))

print(symptoms)

Now we tokenize the transcribed text.

In [None]:
import re
import string

from transformers import BertTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer; it is passed to custom_tokenize below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def custom_tokenize(text, symptoms, tokenizer):
    """Split ``text`` into known symptom phrases plus ordinary tokens.

    Args:
        text: The text to tokenize.
        symptoms: Iterable of symptom phrases (possibly multi-word). Matching
            is case-insensitive and only succeeds on whole words. Empty or
            whitespace-only entries are ignored.
        tokenizer: Any object with a ``tokenize(str)`` method returning a
            sequence of tokens.

    Returns:
        A list with the matched symptom phrases first (spelled as they appear
        in ``text``, in order of appearance), followed by the tokenizer's
        tokens for the text that is left once those phrases are removed.
    """
    # Longest phrases first: regex alternation takes the first alternative
    # that matches, so a short phrase must not shadow a longer one that
    # starts with it. The sort is stable, so ties keep their input order.
    phrases = sorted(
        dict.fromkeys(s for s in symptoms if s.strip()),
        key=len,
        reverse=True,
    )
    if not phrases:
        # An empty pattern would match at every position; nothing to extract.
        return list(tokenizer.tokenize(text))

    alternation = '|'.join(re.escape(phrase) for phrase in phrases)
    # Lookarounds instead of \b so that phrases starting or ending with a
    # non-word character (e.g. a parenthesis) still match on whole words only.
    pattern = re.compile(rf'(?<!\w)(?:{alternation})(?!\w)', re.IGNORECASE)

    # Collect the symptom phrases exactly as they appear in the text.
    tokens = [match.group(0) for match in pattern.finditer(text)]

    # Tokenize the rest of the text. Each phrase is replaced by a space so the
    # words on either side of it are not glued together.
    remaining_text = pattern.sub(' ', text)
    remaining_tokens = list(tokenizer.tokenize(remaining_text))

    return tokens + remaining_tokens

tokens = custom_tokenize(text, symptoms, tokenizer)

# Drop empty tokens and tokens made up only of punctuation characters.
# A set gives a per-character test; `token in string.punctuation` would be a
# substring test against the whole punctuation string.
punctuation_chars = set(string.punctuation)
tokens = [
    token for token in tokens
    if token and not all(char in punctuation_chars for char in token)
]
print(tokens)

def replace_spaces_with_underscores(tokens):
    """Return a new list in which every space inside each token is replaced by '_'."""
    underscored = []
    for item in tokens:
        underscored.append(item.replace(' ', '_'))
    return underscored

# Convert spaces inside tokens back to underscores, reversing the
# underscore-to-space replacement applied when the symptom list was built.
tokens_with_underscores = replace_spaces_with_underscores(tokens)

print(tokens_with_underscores)


# Disease Identification

In [None]:
import pandas as pd

# Load the dataset; the code below reads its 'Disease' column and treats every other column as a symptom.
df = pd.read_csv('dataset.csv')

def _normalize_symptom(value):
    """Return ``value`` as a lowercase symptom key with words joined by single underscores."""
    return '_'.join(str(value).replace('_', ' ').split()).lower()


def find_highest_probability_disease(df, symptoms):
    """Rank diseases by how many of the given symptoms they share.

    Args:
        df: DataFrame with a 'Disease' column; every other column is read as
            a symptom name, with missing values skipped.
        symptoms: Iterable of symptom names (set, list, ...). Names are
            compared case-insensitively, and spaces and underscores are
            treated as the same separator.

    Returns:
        A list of ``(disease, (match_count, matching_symptoms))`` tuples
        sorted by ``match_count`` in descending order. Only diseases with at
        least one match are included. ``matching_symptoms`` is a set of
        normalized (lowercase, underscore-joined) names.
    """
    wanted = {_normalize_symptom(symptom) for symptom in symptoms}
    wanted.discard('')
    symptom_columns = [col for col in df.columns if col != 'Disease']

    disease_match_count = {}

    for _, row in df.iterrows():
        disease = row['Disease']
        disease_symptoms = {
            _normalize_symptom(row[col])
            for col in symptom_columns
            if pd.notna(row[col])
        }

        # Calculate the number of matching symptoms
        matching_symptoms = wanted & disease_symptoms
        match_count = len(matching_symptoms)
        if match_count == 0:
            continue

        # A disease may occupy several rows; keep its best row rather than
        # letting a later, weaker row overwrite it.
        best = disease_match_count.get(disease)
        if best is None or match_count > best[0]:
            disease_match_count[disease] = (match_count, matching_symptoms)

    return sorted(
        disease_match_count.items(),
        key=lambda item: item[1][0],
        reverse=True,
    )

# Convert tokens to a set of symptoms for comparison. Dataset values are
# compared in lowercase, and the symptom list was built by turning underscores
# into spaces, so lowercase each token and turn its spaces back into
# underscores; otherwise multi-word symptoms can never match.
symptoms_set = {token.strip().lower().replace(' ', '_') for token in tokens}

matching_diseases = find_highest_probability_disease(df, symptoms_set)

if matching_diseases:
    for disease, (count, relevant_symptoms) in matching_diseases:
        print(f"Disease: {disease}, Matching Symptoms: {count}")

    # The list is sorted by match count in descending order, so the first entry is the best match.
    top_disease, (top_count, top_relevant_symptoms) = matching_diseases[0]
    # Sort the names so the printed order is the same on every run (sets are unordered).
    top_relevant_symptoms_str = ', '.join(sorted(top_relevant_symptoms))

    print(f"\nDisease with highest matching symptoms: {top_disease}")
    print(f"Matching Symptoms: {top_relevant_symptoms_str}")

    if len(matching_diseases) > 3:
        print("\n\nA lot of diseases are identified for further checkups.\nTests are preferred or doctor consultation is preferred.")
else:
    print("No matching disease found. Proceed to doing tests instead of symptomatic treatment.")