In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\panna\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\panna\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\panna\AppData\Roaming\nltk_data...


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
faq_data = {
    "What is your return policy?": "Our return policy allows returns within 30 days of purchase.",
    "How long does shipping take?": "Shipping typically takes 3-5 business days.",
    "What payment methods do you accept?": "We accept all major credit cards and PayPal."
}

# Fit and transform FAQs
faq_questions = list(faq_data.keys())
faq_tfidf = tfidf_vectorizer.fit_transform(faq_questions)


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def get_best_faq_match(user_input, faq_tfidf, faq_questions):
    user_tfidf = tfidf_vectorizer.transform([user_input])
    similarities = cosine_similarity(user_tfidf, faq_tfidf)
    best_match_idx = similarities.argmax()
    return faq_questions[best_match_idx]

user_input = "How long does shipping take?"
best_faq = get_best_faq_match(user_input, faq_tfidf, faq_questions)
print(faq_data[best_faq])


Shipping typically takes 3-5 business days.


In [5]:
pip install transformers

Collecting transformersNote: you may need to restart the kernel to use updated packages.

  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ----- ---------------------------------- 1.3/9.9 MB 6.7 MB/s eta 0:00:02
   ---------- ----------------------------- 2.6/9.9 MB 7.2 MB/s eta 0:00:02
   ------------------ --------------------- 4.5/9.9 MB 7.7 MB/s eta 0:00:01
   ------------------------ --------------- 6.0/9.9 MB 7.5 MB/s eta 0:00:01
   -----------------------

In [7]:
pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.17.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.17.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------------------ --------- 1.3/1.7 MB 7.4 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 7.2 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.17.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
from transformers import pipeline

# Load a pre-trained BERT model for question answering
question_answering = pipeline("question-answering")

def bert_faq_response(user_input, faq_data):
    context = " ".join(faq_data.values())  # Concatenate all FAQ answers as context
    response = question_answering(question=user_input, context=context)
    return response['answer']

response = bert_faq_response("How long does shipping take?", faq_data)
print(response)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.





model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

3-5 business days


In [9]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score

In [10]:
faq_data = {
    "What is your return policy?": "Our return policy allows returns within 30 days of purchase.",
    "How long does shipping take?": "Shipping typically takes 3-5 business days.",
    "What payment methods do you accept?": "We accept all major credit cards and PayPal.",
    "Do you ship internationally?": "Yes, we ship to over 200 countries worldwide.",
    "Can I track my order?": "Yes, once your order is shipped, we will provide you with a tracking number.",
    "What is your returns policy?": "Our return policy allows returns within 30 days of purchase.",  # Duplicate/variant
    "How fast is shipping?": "Shipping typically takes 3-5 business days.",  # Duplicate/variant
    "Which cards do you accept?": "We accept all major credit cards and PayPal.",  # Duplicate/variant
}

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Prepare the data
faq_questions = list(faq_data.keys())  # Questions are the input
faq_answers = list(faq_data.values())  # Answers are the labels

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(faq_questions, faq_answers, test_size=0.2, random_state=42)

# Convert text data to TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Logistic Regression classifier
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0

Classification Report:
                                                               precision    recall  f1-score   support

Our return policy allows returns within 30 days of purchase.       1.00      1.00      1.00         1
                 Shipping typically takes 3-5 business days.       1.00      1.00      1.00         1

                                                    accuracy                           1.00         2
                                                   macro avg       1.00      1.00      1.00         2
                                                weighted avg       1.00      1.00      1.00         2



In [15]:
import nbformat

In [19]:
notebook_path = 'Chatbot.ipynb'

In [21]:
with open(notebook_path, 'r', encoding='utf-8') as file:
    notebook = nbformat.read(file, as_version=4)

In [23]:
notebook.cells[:3]  

[{'cell_type': 'code',
  'execution_count': 1,
  'metadata': {'colab': {'base_uri': 'https://localhost:8080/'},
   'id': 'eEQT0C5Q7YtR',
   'outputId': '6f442bf1-e8e6-432d-85ca-dc320be52c8f'},
  'outputs': [{'name': 'stderr',
    'output_type': 'stream',
    'text': '[nltk_data] Downloading package punkt to\n[nltk_data]     C:\\Users\\panna\\AppData\\Roaming\\nltk_data...\n[nltk_data]   Unzipping tokenizers\\punkt.zip.\n[nltk_data] Downloading package stopwords to\n[nltk_data]     C:\\Users\\panna\\AppData\\Roaming\\nltk_data...\n[nltk_data]   Unzipping corpora\\stopwords.zip.\n[nltk_data] Downloading package wordnet to\n[nltk_data]     C:\\Users\\panna\\AppData\\Roaming\\nltk_data...\n'}],
  'source': "import nltk\nfrom nltk.corpus import stopwords\nfrom nltk.tokenize import word_tokenize\nfrom nltk.stem import WordNetLemmatizer\n\nnltk.download('punkt')\nnltk.download('stopwords')\nnltk.download('wordnet')\n\nstop_words = set(stopwords.words('english'))\nlemmatizer = WordNetLemmatize

In [25]:
conversation_context = {}

In [43]:
def get_context(user_id):
    """Retrieve user context if available"""
    return conversation_context.get(user_id, None)

def update_context(user_id, question):
    """Update the context with the latest user question"""
    conversation_context[user_id] = question

def get_best_faq_match(user_input, faq_tfidf, faq_questions, user_id=None):
    """Find the best FAQ match with context memory"""
    if user_id:
        context = get_context(user_id)
        if context:
            user_input = context + " " + user_input
    user_tfidf = tfidf_vectorizer.transform([user_input])
    similarities = cosine_similarity(user_tfidf, faq_tfidf)
    best_match_idx = similarities.argmax()

    if user_id:
        update_context(user_id, user_input)  
    
    return faq_questions[best_match_idx]

In [45]:
from nltk.corpus import wordnet

def get_synonyms(word):
    """Return a list of synonyms for a given word using WordNet"""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

def preprocess_text_with_synonyms(text):
    """Preprocess text and replace words with their synonyms"""
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    
    
    tokens_with_synonyms = []
    for token in tokens:
        synonyms = get_synonyms(token)
        if synonyms:
            
            tokens_with_synonyms.append(list(synonyms)[0])
        else:
            tokens_with_synonyms.append(token)
    
    return tokens_with_synonyms

In [47]:
pip install textblob

Collecting textblobNote: you may need to restart the kernel to use updated packages.

  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------------------------------- 626.3/626.3 kB 3.9 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [60]:
from textblob import TextBlob

def correct_spelling(text):
    """Correct spelling in the input text using TextBlob"""
    return str(TextBlob(text).correct())

def preprocess_text_with_correction(text):
    """Preprocess text with spell correction"""
    corrected_text = correct_spelling(text)
    tokens = word_tokenize(corrected_text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

In [62]:
def handle_multi_intents(user_input):
    """Split and handle multiple intents in the user input"""
    # Split the input into multiple questions (for simplicity, use periods as delimiters)
    intents = user_input.split('.')
    responses = []
    for intent in intents:
        if intent.strip():
            best_faq = get_best_faq_match(intent.strip(), faq_tfidf, faq_questions)
            responses.append(faq_data[best_faq])
    
    return responses

In [74]:
# Ensure you're using the same TfidfVectorizer for both FAQ and user input
tfidf_vectorizer = TfidfVectorizer()

# Train on FAQ questions
faq_questions = list(faq_data.keys())
faq_tfidf = tfidf_vectorizer.fit_transform(faq_questions)

# When a user asks a question, use the same vectorizer to transform their input
def get_best_faq_match(user_input, faq_tfidf, faq_questions):
    # Transform the user input using the same vectorizer
    user_tfidf = tfidf_vectorizer.transform([user_input])
    
    # Now compute cosine similarity
    similarities = cosine_similarity(user_tfidf, faq_tfidf)
    best_match_idx = similarities.argmax()
    
    return faq_questions[best_match_idx]

# Example user input
user_input = "How long does shipping take?"
best_faq = get_best_faq_match(user_input, faq_tfidf, faq_questions)
print(faq_data[best_faq])


Shipping typically takes 3-5 business days.


In [75]:
user_input = "Can I return this item? How long does shipping take?"
responses = handle_multi_intents(user_input)
for response in responses:
    print(response)

Shipping typically takes 3-5 business days.


In [76]:
feedback_data = []

def get_user_feedback(user_input, faq_answer):
    """Ask for user feedback after providing an answer"""
    print(f"User Input: {user_input}")
    print(f"Chatbot Response: {faq_answer}")
    feedback = input("Was this answer helpful? (yes/no): ").lower()
    feedback_data.append({"user_input": user_input, "response": faq_answer, "feedback": feedback})

# Example usage
user_input = "How long does shipping take?"
best_faq = get_best_faq_match(user_input, faq_tfidf, faq_questions)
print(faq_data[best_faq])
get_user_feedback(user_input, faq_data[best_faq])

Shipping typically takes 3-5 business days.
User Input: How long does shipping take?
Chatbot Response: Shipping typically takes 3-5 business days.


Was this answer helpful? (yes/no):  yes
