In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split

# Fetch the NLTK "punkt" tokenizer models only when they are missing locally,
# so re-running the notebook does not need network access every time.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\msk23\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Read the question/answer training set from the working directory
data = pd.read_csv(filepath_or_buffer='train.csv')  # point this at your copy of the CSV if it lives elsewhere
# Preview the first five rows to confirm the file parsed as expected
print(data.head(n=5))


                                            Question  \
0  Who is at risk for Lymphocytic Choriomeningiti...   
1  What are the symptoms of Lymphocytic Choriomen...   
2  Who is at risk for Lymphocytic Choriomeningiti...   
3  How to diagnose Lymphocytic Choriomeningitis (...   
4  What are the treatments for Lymphocytic Chorio...   

                                              Answer  
0  LCMV infections can occur after exposure to fr...  
1  LCMV is most commonly recognized as causing ne...  
2  Individuals of all ages who come into contact ...  
3  During the first phase of the disease, the mos...  
4  Aseptic meningitis, encephalitis, or meningoen...  


In [4]:
# Show which columns the loaded frame provides
column_index = data.columns
print(column_index)

Index(['Question', 'Answer'], dtype='object')


In [5]:
def clean_text(text):
    """Normalise a piece of text to ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace is removed
    (digits and punctuation included), runs of whitespace are collapsed to
    one space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            how pandas represents an empty CSV cell) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # re.sub() raises TypeError on non-strings, so a single empty cell would
    # otherwise abort the whole ``Series.apply(clean_text)`` call.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [6]:
# Derive a cleaned copy of each text column (Question first, then Answer)
for raw_col, cleaned_col in (('Question', 'cleaned_question'),
                             ('Answer', 'cleaned_answer')):
    data[cleaned_col] = data[raw_col].apply(clean_text)

# Preview the cleaned questions next to the cleaned answers
cleaned_preview = data[['cleaned_question', 'cleaned_answer']]
print(cleaned_preview.head())


                                    cleaned_question  \
0  Who is at risk for Lymphocytic Choriomeningiti...   
1  What are the symptoms of Lymphocytic Choriomen...   
2  Who is at risk for Lymphocytic Choriomeningiti...   
3   How to diagnose Lymphocytic Choriomeningitis LCM   
4  What are the treatments for Lymphocytic Chorio...   

                                      cleaned_answer  
0  LCMV infections can occur after exposure to fr...  
1  LCMV is most commonly recognized as causing ne...  
2  Individuals of all ages who come into contact ...  
3  During the first phase of the disease the most...  
4  Aseptic meningitis encephalitis or meningoence...  


In [7]:
from nltk.tokenize import word_tokenize

# Split each cleaned text into word tokens with NLTK
for cleaned_col, token_col in (('cleaned_question', 'tokens_question'),
                               ('cleaned_answer', 'tokens_answer')):
    data[token_col] = data[cleaned_col].apply(word_tokenize)

# Preview the token lists for questions and answers
token_preview = data[['tokens_question', 'tokens_answer']]
print(token_preview.head())


                                     tokens_question  \
0  [Who, is, at, risk, for, Lymphocytic, Choriome...   
1  [What, are, the, symptoms, of, Lymphocytic, Ch...   
2  [Who, is, at, risk, for, Lymphocytic, Choriome...   
3  [How, to, diagnose, Lymphocytic, Choriomeningi...   
4  [What, are, the, treatments, for, Lymphocytic,...   

                                       tokens_answer  
0  [LCMV, infections, can, occur, after, exposure...  
1  [LCMV, is, most, commonly, recognized, as, cau...  
2  [Individuals, of, all, ages, who, come, into, ...  
3  [During, the, first, phase, of, the, disease, ...  
4  [Aseptic, meningitis, encephalitis, or, mening...  


In [8]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\users\msk23\flaskenv\flaskenv\lib\site-packages\pytesseract-0.3.13-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB 8.9 MB/s eta 0:00:02
     ---- ----------------------------------- 1.4/12.8 MB 14.3 MB/s eta 0:00:01
     ----- ---------------------------------- 1.8/12.8 MB 14.0 MB/s eta 0:00:01
     ------ --------------------------------- 2.0/12.8 MB 11.6 MB/s eta 0:00:01
     ------- -------------------------------- 2.3/12.8 MB 10.4 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.8 MB 9.4 MB/s eta 0:00:02
     -------- ------------------------------- 2.9/12.8 MB 9.2 MB/s eta 0:00:02
     ---------- ----------------------------- 3.3/12.8 MB 9.2 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 8.3 MB/s eta 0:00:02
     ----------- --------------------

DEPRECATION: Loading egg at c:\users\msk23\flaskenv\flaskenv\lib\site-packages\pytesseract-0.3.13-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import spacy
from nltk.tokenize import word_tokenize

In [11]:
# Load the small English spaCy pipeline used for entity extraction below
nlp = spacy.load(name="en_core_web_sm")

In [12]:
def extract_entities(text):
    """Run the spaCy pipeline `nlp` on `text` and collect its named entities.

    Returns a list of ``(entity_text, entity_label)`` tuples, in the order
    spaCy reports them.
    """
    found = []
    for entity in nlp(text).ents:
        found.append((entity.text, entity.label_))
    return found

# Run NER over the cleaned question and answer columns
for cleaned_col, entity_col in (('cleaned_question', 'entities_question'),
                                ('cleaned_answer', 'entities_answer')):
    data[entity_col] = data[cleaned_col].apply(extract_entities)

# Preview each cleaned text alongside the entities found in it
entity_preview_cols = ['cleaned_question', 'entities_question', 'cleaned_answer', 'entities_answer']
print(data[entity_preview_cols].head())


                                    cleaned_question  \
0  Who is at risk for Lymphocytic Choriomeningiti...   
1  What are the symptoms of Lymphocytic Choriomen...   
2  Who is at risk for Lymphocytic Choriomeningiti...   
3   How to diagnose Lymphocytic Choriomeningitis LCM   
4  What are the treatments for Lymphocytic Chorio...   

                           entities_question  \
0                      [(Lymphocytic, NORP)]   
1  [(Lymphocytic Choriomeningitis LCM, LOC)]   
2                      [(Lymphocytic, NORP)]   
3                      [(Lymphocytic, NORP)]   
4                      [(Lymphocytic, NORP)]   

                                      cleaned_answer  \
0  LCMV infections can occur after exposure to fr...   
1  LCMV is most commonly recognized as causing ne...   
2  Individuals of all ages who come into contact ...   
3  During the first phase of the disease the most...   
4  Aseptic meningitis encephalitis or meningoence...   

                                     

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)


In [14]:
from transformers import pipeline

# Load the question-answering pipeline.
# The checkpoint and revision are pinned explicitly (they are the ones the
# library falls back to when no model is given) so results stay reproducible
# if the library's default ever changes.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="626af31",
)

def answer_question(question, context):
    """Extract the answer to `question` from `context` with the QA pipeline.

    Args:
        question: The question text.
        context: The passage the answer is extracted from.

    Returns:
        The value stored under the 'answer' key of the pipeline result.
    """
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Example usage: a sample question that is answered below from the `context` passage
question = "What are the symptoms of diabetes?"
context = "Diabetes is often called a silent disease because it can cause serious complications even before you have symptoms. Symptoms can also be so mild that you dont notice them. An estimated 8 million people in the United States have type 2 diabetes and dont know it, according to 2012 estimates by the Centers for Disease Control and Prevention (CDC). Common Signs Some common symptoms of diabetes are: - being very thirsty  - frequent urination  - feeling very hungry or tired  - losing weight without trying  - having sores that heal slowly  - having dry, itchy skin  - loss of feeling or tingling in the feet  - having blurry eyesight. being very thirsty frequent urination feeling very hungry or tired losing weight without trying having sores that heal slowly having dry, itchy skin loss of feeling or tingling in the feet having blurry eyesight. Signs of type 1 diabetes usually develop over a short period of time. The signs for type 2 diabetes develop more gradually. Tests for Diabetes The following tests are used to diagnose diabetes or prediabetes. - An A1C test measures your average blood glucose levels over the past 3 months. It can be used to diagnose type 2 diabetes and prediabetes. It does not require fasting and blood can be drawn for the test any time of the day.  An A1C test measures your average blood glucose levels over the past 3 months. It can be used to diagnose type 2 diabetes and prediabetes. It does not require fasting and blood can be drawn for the test any time of the day. - A fasting plasma glucose, or FPG test, measures your blood glucose after you have gone at least 8 hours without eating. Doctors use this test to detect diabetes or prediabetes.  A fasting plasma glucose, or FPG test, measures your blood glucose after you have gone at least 8 hours without eating. Doctors use this test to detect diabetes or prediabetes. - In a random plasma glucose test, your doctor checks your blood glucose without regard to when you ate your last meal. 
This test, along with an assessment of symptoms, is used to diagnose diabetes but not prediabetes.  In a random plasma glucose test, your doctor checks your blood glucose without regard to when you ate your last meal. This test, along with an assessment of symptoms, is used to diagnose diabetes but not prediabetes. - An oral glucose tolerance test, or OGTT, measures your blood glucose after you have gone at least 8 hours without eating and 2 hours after you drink a sweet beverage. Doctors also use the oral glucose tolerance test to diagnose gestational diabetes in pregnant women.  An oral glucose tolerance test, or OGTT, measures your blood glucose after you have gone at least 8 hours without eating and 2 hours after you drink a sweet beverage. Doctors also use the oral glucose tolerance test to diagnose gestational diabetes in pregnant women. If any of these tests show that you might have diabetes, your doctor will need to repeat the test with a second measurement unless there are clear symptoms of diabetes. Get more details about tests for diabetes. Who Should Get Tested? Because type 2 diabetes is more common in older people, anyone who is 45 or older should consider getting tested. If you are 45 or older and overweight, getting tested is strongly recommended. If you are younger than 45, overweight, and have one or more risk factors, you also should talk with your doctor about being tested. See risk factors for type 2 diabetes. Why Early Detection is Important Diabetes is a serious disease that can lead to a number of health problems such as heart disease, stroke, vision problems, kidney disease and even death. Sometimes people have symptoms but do not suspect diabetes. They delay scheduling a checkup because they do not feel sick. Many people do not find out they have the disease until they have diabetes complications, such as a heart attack or stroke. 
Finding out early if you have diabetes is important because treatment can prevent or delay the complications of the disease."

# Run extractive QA on the example and show the predicted answer
answer = answer_question(question=question, context=context)
print("Answer:", answer)


  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.



Answer: being very thirsty  - frequent urination


In [15]:
import spacy

# Function to perform NER
def perform_ner(text):
    """Return the named entities spaCy finds in `text`.

    Thin alias of `extract_entities`, kept so this name remains available.
    It reuses the `nlp` pipeline already loaded earlier in the notebook
    instead of reading the model from disk a second time.

    Args:
        text: The text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples.
    """
    return extract_entities(text)

In [16]:
import spacy

# Function to perform NER on the question
def perform_ner_on_question(question):
    """Return the named entities spaCy finds in `question`.

    Delegates to `extract_entities`, which uses the `nlp` pipeline already
    loaded earlier in the notebook, so the model is not reloaded here.

    Args:
        question: The question text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples.
    """
    return extract_entities(question)

# Build a reply from the question alone (no context passage is consulted)
def generate_answer_based_on_question(question):
    """Describe the named entities found in `question`.

    Returns a sentence listing each entity as ``text (LABEL)`` when
    `perform_ner_on_question` finds any, and a fixed apology message
    when it finds none.
    """
    found = perform_ner_on_question(question)

    # Nothing recognised: fall back to the default response
    if not found:
        return "Sorry, I couldn't find any specific information to answer the question."

    # Otherwise list every entity together with its label
    described = [f"{text} ({label})" for text, label in found]
    entity_info = ", ".join(described)
    return f"The question contains the following entities: {entity_info}. Based on this, further information can be provided."

# Try the entity-based responder on a short sample question
question = "What is diabetes?"
answer = generate_answer_based_on_question(question=question)
print("Answer based on question:", answer)


Answer based on question: Sorry, I couldn't find any specific information to answer the question.


In [17]:
import pickle
from pathlib import Path

# Destination for the processed DataFrame `data` (cleaned text, tokens and entities).
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook (as is done for 'train.csv') so other machines can run it.
output_path = Path(r'C:\Users\msk23\OneDrive\Desktop\NIT\NLP\New folder\NLP.pkl')

# Opening a file for writing fails with FileNotFoundError when its folder is
# missing, so make sure the target directory exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the processed dataset to a pickle file
with output_path.open('wb') as f:
    pickle.dump(data, f)

print("Processed data has been saved to a pickle file.")


Processed data has been saved to a pickle file.
