In [5]:
# Import necessary libraries
import numpy as np
import pandas as pd
import random
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the question/answer dataset, discard rows with missing values and
# exact duplicate rows, then lower-case the question text.
df = (
    pd.read_csv("/content/Pancreatic chatbot.csv")
    .dropna()
    .drop_duplicates()
    .assign(Question=lambda frame: frame['Question'].str.lower())
)

# Function to remove punctuation
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed outright, not replaced by a space, so ``"what's"``
    becomes ``"whats"``.
    """
    # A translation table that maps a character to None deletes that character.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)

# Strip punctuation from every question, one element at a time
df['Question'] = df['Question'].map(remove_punct)

# Remove stopwords
stop = set(stopwords.words('english'))

def remove_stopword(text):
    """Tokenize ``text`` and join it back together without English stopwords.

    A token is dropped when its lower-cased form is in ``stop``. Surviving
    tokens keep their original form and are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stop:
            continue
        kept.append(token)
    return ' '.join(kept)

# Drop stopwords from every question, one element at a time
df['Question'] = df['Question'].map(remove_stopword)

# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemm(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by single spaces.

    Each token is passed to ``lemmatizer.lemmatize`` with that method's
    default arguments.
    """
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)

# Lemmatize every question
df['Question'] = df['Question'].apply(lemm)

# Corpus creation: one preprocessed question per entry, in dataframe row order.
# chatbot_response relies on this order when it looks answers up by position.
corpus = df['Question'].values

# Bag of Words Vectorization
# fit_transform learns the vocabulary and builds the count matrix in a single
# pass instead of tokenizing the corpus twice with fit() followed by transform().
bw_vect = CountVectorizer()
bw_corpus = bw_vect.fit_transform(corpus)
bw_fit = bw_vect  # fit() returns the vectorizer itself; the name is kept for other cells
# NOTE(review): cv_data is a dense copy of the count matrix and is not used by
# any cell visible here — confirm it is still needed.
cv_data = pd.DataFrame(bw_corpus.toarray(), columns=bw_fit.get_feature_names_out())

# TF-IDF Vectorization (vocabulary limited to at most 5000 terms)
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_corpus = tfidf_vect.fit_transform(corpus)
tfidf_fit = tfidf_vect  # same object fit() would have returned


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [8]:

# Additional data cleaning function for chatbot input
def clean_data(text):
    """Normalize a user question the way the corpus questions were normalized.

    The corpus questions were lower-cased and had punctuation removed before
    they were lemmatized. The same steps are applied here, so the lemmatizer
    receives tokens in the same form. Previously a token such as
    ``"symptoms?"`` was handed to the lemmatizer with the ``?`` still attached.

    Args:
        text: Raw user input. ``None`` is treated as an empty string.

    Returns:
        The lower-cased, punctuation-free, lemmatized tokens joined by single
        spaces. Returns an empty string when nothing is left.
    """
    if text is None:
        return ""
    # Brackets become spaces so "tumor(marker)" still splits into two tokens.
    text = re.sub(r"[\([{})\]]", " ", text)
    # Lower-case, then delete every remaining non-word, non-space character.
    # "_" counts as a word character for the regex, so it is removed explicitly.
    text = re.sub(r"[^\w\s]|_", "", text.lower())
    # Tokenize on whitespace and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(tokens)

# Greeting words: used both to recognise a greeting and as the pool of replies
welcome_responses = [
    'hi',
    'hey',
    'hello',
    'greetings',
]

# Chatbot response function
def chatbot_response(user_input, min_similarity=0.0):
    """Reply to ``user_input`` with the answer of the most similar stored question.

    The answer lookup runs first, so a question that merely contains a greeting
    word ("hello how is pancreatic cancer diagnosed") is answered rather than
    greeted. Only when no stored question is similar enough is the input
    checked for a greeting. That check ignores case and punctuation, so "Hi!"
    and "Hello," are recognised.

    Args:
        user_input: The raw message typed by the user. ``None`` is treated as
            an empty message.
        min_similarity: Cosine similarity the best match must exceed for its
            answer to be returned. The default of 0.0 accepts any match that
            shares at least one vocabulary term with the input.

    Returns:
        The ``Answer`` of the best-matching row of ``df``, a random entry of
        ``welcome_responses`` for an unmatched greeting, or an apology string
        when neither applies.
    """
    text = user_input or ""

    # Clean the input and project it into the fitted TF-IDF space
    cleaned = clean_data(text)
    query_vector = tfidf_fit.transform([cleaned])
    # One similarity score per corpus question, in corpus (row) order
    similarities = cosine_similarity(query_vector, tfidf_corpus).flatten()

    best_index = similarities.argmax()
    if similarities[best_index] > min_similarity:
        # Corpus rows and dataframe rows share positions, hence iloc
        return df.iloc[best_index]['Answer']

    # No usable match: reply in kind if the message contains a greeting word.
    # Punctuation is replaced by spaces so "Hi!" yields the token "hi".
    words = re.sub(r"[^\w\s]|_", " ", text.lower()).split()
    if any(word in welcome_responses for word in words):
        return random.choice(welcome_responses)

    return "I'm sorry, I don't have an answer for that."




In [9]:
# Example usage: send one sample question through the chatbot and print its reply
sample_question = "How is pancreatic cancer diagnosed?"
response = chatbot_response(sample_question)
print(response)

Diagnosis includes imaging tests like CT, MRI, and PET scans, along with biopsy and tumor marker tests.


In [10]:
import joblib

# Save the fitted TF-IDF vectorizer so new questions can later be transformed
# with the same fitted object
joblib.dump(tfidf_fit, 'tfidf_vectorizer.pkl')

# Save the TF-IDF matrix of the corpus questions
joblib.dump(tfidf_corpus, 'tfidf_corpus.pkl')

# Save the dataframe with questions and answers; index=False keeps the row
# index out of the file. The Question column holds the preprocessed text at
# this point, not the raw questions.
df.to_csv('questions_answers.csv', index=False)

In [13]:
import joblib
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import random
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
# Lemmatizer used by clean_data below
lemmatizer = WordNetLemmatizer()
# Load the saved TF-IDF vectorizer, TF-IDF corpus matrix, and question/answer dataframe.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code — only load .pkl files that this notebook produced itself.
tfidf_fit = joblib.load('tfidf_vectorizer.pkl')
tfidf_corpus = joblib.load('tfidf_corpus.pkl')
df = pd.read_csv('questions_answers.csv')

# Redefine the greeting words: used both to recognise a greeting and as the
# pool of replies
welcome_responses = [
    'hi',
    'hey',
    'hello',
    'greetings',
]
def clean_data(text):
    """Normalize a user question the way the stored questions were normalized.

    The questions behind the saved TF-IDF matrix were lower-cased and had
    punctuation removed before they were lemmatized. The same steps are applied
    here, so the lemmatizer receives tokens in the same form. Previously a
    token such as ``"symptoms?"`` was handed to the lemmatizer with the ``?``
    still attached.

    Args:
        text: Raw user input. ``None`` is treated as an empty string.

    Returns:
        The lower-cased, punctuation-free, lemmatized tokens joined by single
        spaces. Returns an empty string when nothing is left.
    """
    if text is None:
        return ""
    # Brackets become spaces so "tumor(marker)" still splits into two tokens.
    text = re.sub(r"[\([{})\]]", " ", text)
    # Lower-case, then delete every remaining non-word, non-space character.
    # "_" counts as a word character for the regex, so it is removed explicitly.
    text = re.sub(r"[^\w\s]|_", "", text.lower())
    # Tokenize on whitespace and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(tokens)
def chatbot_response(user_input, min_similarity=0.0):
    """Reply to ``user_input`` with the answer of the most similar stored question.

    The answer lookup runs first, so a question that merely contains a greeting
    word ("hello how is pancreatic cancer diagnosed") is answered rather than
    greeted. Only when no stored question is similar enough is the input
    checked for a greeting. That check ignores case and punctuation, so "Hi!"
    and "Hello," are recognised.

    Args:
        user_input: The raw message typed by the user. ``None`` is treated as
            an empty message.
        min_similarity: Cosine similarity the best match must exceed for its
            answer to be returned. The default of 0.0 accepts any match that
            shares at least one vocabulary term with the input.

    Returns:
        The ``Answer`` of the best-matching row of ``df``, a random entry of
        ``welcome_responses`` for an unmatched greeting, or an apology string
        when neither applies.
    """
    text = user_input or ""

    # Clean the input and project it into the fitted TF-IDF space
    cleaned = clean_data(text)
    query_vector = tfidf_fit.transform([cleaned])
    # One similarity score per corpus question, in corpus (row) order
    similarities = cosine_similarity(query_vector, tfidf_corpus).flatten()

    best_index = similarities.argmax()
    if similarities[best_index] > min_similarity:
        # Corpus rows and dataframe rows share positions, hence iloc
        return df.iloc[best_index]['Answer']

    # No usable match: reply in kind if the message contains a greeting word.
    # Punctuation is replaced by spaces so "Hi!" yields the token "hi".
    words = re.sub(r"[^\w\s]|_", " ", text.lower()).split()
    if any(word in welcome_responses for word in words):
        return random.choice(welcome_responses)

    return "I'm sorry, I don't have an answer for that."

# Example usage: send one sample question through the reloaded chatbot and print its reply
sample_question = "How is pancreatic cancer diagnosed?"
response = chatbot_response(sample_question)
print(response)


Diagnosis includes imaging tests like CT, MRI, and PET scans, along with biopsy and tumor marker tests.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
