<a href="https://colab.research.google.com/github/VanshikaM03/NLP_Cleaning_the_data/blob/main/NLP_Cleaningthedata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Libraries**

In [32]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

**Download NLTK resources**

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**Getting the data**

In [12]:
df = pd.read_csv('/content/MedicalQ.csv')

In [14]:
df.head()

Unnamed: 0,qtype,Question,Answer
0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4607 entries, 0 to 4606
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   qtype     2000 non-null   object
 1   Question  2000 non-null   object
 2   Answer    2000 non-null   object
dtypes: object(3)
memory usage: 108.1+ KB


**Cleaning the Data**

In [16]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [17]:
df.qtype.unique()

array(['susceptibility', 'symptoms', 'exams and tests', 'treatment',
       'prevention', 'information', 'frequency', 'complications',
       'causes', 'research', 'outlook', 'considerations'], dtype=object)

In [18]:
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stop words
    text = ' '.join([word for word in text.split() if word.isalpha()])  # Remove non-alphabetic words
    return text

df['cleaned_question'] = df['Question'].apply(clean_text)
df['cleaned_answer'] = df['Answer'].apply(clean_text)

In [19]:
df.head()

Unnamed: 0,qtype,Question,Answer,cleaned_question,cleaned_answer
0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...,risk lymphocytic choriomeningitis lcm,lcmv infections occur exposure fresh urine dro...
1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...,symptoms lymphocytic choriomeningitis lcm,lcmv commonly recognized causing neurological ...
2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...,risk lymphocytic choriomeningitis lcm,individuals ages come contact urine feces sali...
3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos...",diagnose lymphocytic choriomeningitis lcm,first phase disease common laboratory abnormal...
4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen...",treatments lymphocytic choriomeningitis lcm,aseptic meningitis encephalitis meningoencepha...


**Converting cleaned data to csv file**

In [20]:
cleaned_df = df[['qtype', 'cleaned_question', 'cleaned_answer']].copy()

cleaned_df.to_csv('cleaned_dataset.csv', index=False)


In [25]:
cleaned_df.head()

Unnamed: 0,qtype,cleaned_question,cleaned_answer
0,susceptibility,risk lymphocytic choriomeningitis lcm,lcmv infections occur exposure fresh urine dro...
1,symptoms,symptoms lymphocytic choriomeningitis lcm,lcmv commonly recognized causing neurological ...
2,susceptibility,risk lymphocytic choriomeningitis lcm,individuals ages come contact urine feces sali...
3,exams and tests,diagnose lymphocytic choriomeningitis lcm,first phase disease common laboratory abnormal...
4,treatment,treatments lymphocytic choriomeningitis lcm,aseptic meningitis encephalitis meningoencepha...


**Tokenize and Lemmatize**

In [21]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [22]:
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)


In [24]:
df['tokenized_question'] = df['cleaned_question'].apply(tokenize_and_lemmatize)
df['tokenized_answer'] = df['cleaned_answer'].apply(tokenize_and_lemmatize)

In [26]:
tokenized_df = df[['qtype', 'tokenized_question', 'tokenized_answer']].copy()
tokenized_df

Unnamed: 0,qtype,tokenized_question,tokenized_answer
0,susceptibility,risk lymphocytic choriomeningitis lcm,lcmv infection occur exposure fresh urine drop...
1,symptoms,symptom lymphocytic choriomeningitis lcm,lcmv commonly recognized causing neurological ...
2,susceptibility,risk lymphocytic choriomeningitis lcm,individual age come contact urine feces saliva...
3,exams and tests,diagnose lymphocytic choriomeningitis lcm,first phase disease common laboratory abnormal...
4,treatment,treatment lymphocytic choriomeningitis lcm,aseptic meningitis encephalitis meningoencepha...
...,...,...,...
1995,prevention,prevent urinary retention,people prevent urinary retention occurs treati...
1996,considerations,urinary retention,researcher found eating diet nutrition play ro...
1997,considerations,urinary retention,urinary retention inability empty bladder comp...
1998,information,proteinuria,proteinuriaalso called albuminuria urine album...


**Creating a Corpus**

In [28]:
import string
import pickle
corpus = ' '.join(df['tokenized_question'] + ' ' + df['tokenized_answer'])

# Save the corpus as a pickle file
with open('corpus.pkl', 'wb') as f:
    pickle.dump(corpus, f)

In [29]:
corpus



**Creating a Document Term Matrix**

In [30]:
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(df['tokenized_question'] + ' ' + df['tokenized_answer'])

terms = vectorizer.get_feature_names_out()

dtm_df = pd.DataFrame(dtm.toarray(), columns=terms)

In [31]:
dtm_df

Unnamed: 0,aap,ab,abandonment,abbreviation,abdomen,abdomenthe,abdomenusually,abdominal,ability,abilityand,...,zoloft,zoo,zoonosis,zoonotic,zostavax,zoster,zucchini,zyloprim,zyrtec,µgml
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1968,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1969,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Additional Assignment**

In [33]:
# Create a CountVectorizer with different parameters
vectorizer_ngrams = CountVectorizer(ngram_range=(1, 2))  # Consider both unigrams and bigrams
vectorizer_min_df = CountVectorizer(min_df=2)  # Ignore terms that appear in only one document
vectorizer_max_df = CountVectorizer(max_df=0.5)  # Ignore terms that appear in more than half of the documents

# Create document-term matrices with different vectorizers
dtm_ngrams = vectorizer_ngrams.fit_transform(df['tokenized_question'] + ' ' + df['tokenized_answer'])
dtm_min_df = vectorizer_min_df.fit_transform(df['tokenized_question'] + ' ' + df['tokenized_answer'])
dtm_max_df = vectorizer_max_df.fit_transform(df['tokenized_question'] + ' ' + df['tokenized_answer'])

# Get the terms for each vectorizer
terms_ngrams = vectorizer_ngrams.get_feature_names_out()
terms_min_df = vectorizer_min_df.get_feature_names_out()
terms_max_df = vectorizer_max_df.get_feature_names_out()

# Get the document-term matrices as DataFrames
dtm_ngrams_df = pd.DataFrame(dtm_ngrams.toarray(), columns=terms_ngrams)
dtm_min_df_df = pd.DataFrame(dtm_min_df.toarray(), columns=terms_min_df)
dtm_max_df_df = pd.DataFrame(dtm_max_df.toarray(), columns=terms_max_df)

# dtm_ngrams_df, dtm_min_df_df, and dtm_max_df_df now contain the document-term matrices with different parameters


In [35]:
dtm_ngrams_df

Unnamed: 0,aap,aap committee,aap longer,aap potential,ab,ab blood,abandonment,abandonment common,abbreviation,abbreviation record,...,zoster smallpox,zoster virus,zucchini,zucchini cucumber,zyloprim,zyloprim decrease,zyrtec,zyrtec chlorpheniramine,µgml,µgml isolated
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1969,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
dtm_min_df_df

Unnamed: 0,aap,abdomen,abdomenthe,abdominal,ability,able,abnormal,abnormality,abnormally,abroad,...,zantac,zealand,zellweger,zestril,zika,zinc,zoonosis,zoonotic,zoster,µgml
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1968,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1969,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
dtm_max_df_df

Unnamed: 0,aap,ab,abandonment,abbreviation,abdomen,abdomenthe,abdomenusually,abdominal,ability,abilityand,...,zoloft,zoo,zoonosis,zoonotic,zostavax,zoster,zucchini,zyloprim,zyrtec,µgml
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1967,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1968,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1969,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
