## Level 1: Rule Based ChatBot

In [1]:
# Setup & Imports

import os
import pandas as pd
import unicodedata
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Load & Inspect Cleaned Data

In [2]:
# Load the cleaned CSV
df = pd.read_csv("D:/PROJECTS/Version Control/CHATBOT/building-smart-bots-from-scratch/data/clean_conversation_dataset.csv")

# Quick sanity checks
print("Shape (rows × cols):", df.shape)
print("Column names:", df.columns.tolist())
print("\n--- First 5 rows ---")
display(df.head())

Shape (rows × cols): (3510, 3)
Column names: ['serial_no', 'question', 'answer']

--- First 5 rows ---


Unnamed: 0,serial_no,question,answer
0,0,"hi, how are you doing?",i'm fine. how about yourself?
1,1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,3,no problem. so how have you been?,i've been great. what about you?
4,4,i've been great. what about you?,i've been good. i'm in school right now.


In [3]:
# Double-check that everything is lowercase and ASCII-only
def has_non_ascii(s):
    try:
        s.encode("ascii")
        return False
    except UnicodeEncodeError:
        return True

num_non_ascii_q = df["question"].map(has_non_ascii).sum()
num_non_ascii_a = df["answer"].map(has_non_ascii).sum()
print(f"Non-ASCII in questions: {num_non_ascii_q}")
print(f"Non-ASCII in answers:   {num_non_ascii_a}")

Non-ASCII in questions: 0
Non-ASCII in answers:   0


In [4]:
#Peek at a couple of “longest” and “shortest” questions
df["q_len"] = df["question"].map(len)
df["a_len"] = df["answer"].map(len)

print("\n--- Top 3 longest questions ---")
display(df.sort_values("q_len", ascending=False)[["question", "q_len"]].head(3))

print("\n--- Top 3 shortest questions ---")
display(df.sort_values("q_len", ascending=True)[["question", "q_len"]].head(3))


--- Top 3 longest questions ---


Unnamed: 0,question,q_len
2492,what if you fall while you're holding the ligh...,97
3029,"i mean, someone used their dirty hands to pick...",88
3101,"when you're inside, you will always hear cars ...",88



--- Top 3 shortest questions ---


Unnamed: 0,question,q_len
914,so?,3
1246,no.,3
2914,oh.,3


### Re-define / Inspect normalize_text

In [5]:
import re
import unicodedata

def normalize_text(text: str) -> str:
    """
    1) Strip whitespace
    2) Lowercase
    3) Unicode NFKD normalize
    4) Replace non-ASCII chars with space
    5) Collapse multiple whitespace
    """
    # 1) Strip + lowercase
    s = text.strip().lower()
    # 2) Unicode normalize (NFKD)
    s = unicodedata.normalize("NFKD", s)
    # 3) Replace non-ASCII characters with space
    s = ''.join(c if ord(c) < 128 else ' ' for c in s)
    # 4) Collapse multiple spaces/newlines/tabs
    s = re.sub(r"\s+", " ", s).strip()
    return s


In [6]:
# Quick test of normalize_text
examples = [
    "  Hello, HOW are you?  ",
    "different thingsÂ—not the same thing all the time!",
    "I’m fine—thanks!  "
]
for ex in examples:
    print(f"Orig: {ex!r}\nNorm: {normalize_text(ex)!r}\n")

Orig: '  Hello, HOW are you?  '
Norm: 'hello, how are you?'

Orig: 'different thingsÂ—not the same thing all the time!'
Norm: 'different thingsa not the same thing all the time!'

Orig: 'I’m fine—thanks!  '
Norm: 'i m fine thanks!'



### Vectorize Questions with TF-IDF

In [7]:
# Extract questions (they’re already lowercase & ASCII-clean)

questions = df["question"].tolist()
answers   = df["answer"].tolist()

# Instantiate TfidfVectorizer
#    stop_words="english" removes common words like "the", "is", "and", etc.

vectorizer = TfidfVectorizer(stop_words="english")

In [8]:
# Fit on all questions and transform into a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(questions)

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(" (n_questions, n_unique_terms)")

TF-IDF matrix shape: (3510, 2098)
 (n_questions, n_unique_terms)


In [9]:
# 4) Inspect the first few feature names (i.e., the vocabulary)
feature_names = vectorizer.get_feature_names_out()
print("\n--- Sample of learned vocabulary ---")
print(feature_names[:20], " ... [total terms:", len(feature_names), "]")


--- Sample of learned vocabulary ---
['00' '000' '01' '10' '100' '101' '11' '12' '120' '13' '140' '15' '18'
 '1987' '20' '200' '2003' '22' '24' '25']  ... [total terms: 2098 ]


### Build get_answer() Function

In [10]:
def get_answer(user_input: str) -> str:
    """
    Returns the best-matching answer for a raw user_input string.
    1) Normalize the input
    2) Vectorize with TF-IDF (using our fitted 'vectorizer')
    3) Compute cosine similarity against 'tfidf_matrix'
    4) Return answer at index of highest similarity
    """
    # 1) Normalize
    clean_input = normalize_text(user_input)
    
    # 2) Vectorize (shape: 1 × n_terms)
    user_vec = vectorizer.transform([clean_input])
    
    # 3) Cosine similarity (1 × n_questions array)
    sims = cosine_similarity(user_vec, tfidf_matrix)
    
    # 4) Index of best match
    best_idx = sims.argmax()
    
    # 5) Return the stored answer
    return answers[best_idx]

In [11]:
# Quick tests
tests = [
    "hi, how are you doing?",
    "i am fine. how about you?",
    "are you serious?",
    "tell me something else"
]

for t in tests:
    print(f"You: {t!r}\nBot: {get_answer(t)}\n")

You: 'hi, how are you doing?'
Bot: i'm fine. how about yourself?

You: 'i am fine. how about you?'
Bot: i'm pretty good. thanks for asking.

You: 'are you serious?'
Bot: i'm fine. how about yourself?

You: 'tell me something else'
Bot: they always make "what's next" sound exciting, but it never is.



### Interactive “Chat” Simulation

In [12]:
# Simple Input Loop (type & run cell each time)

print("🤖 Chatbot (Rule Based) Ready. Type 'exit' to quit.\n")

while True:
    user_text = input("You: ")
    if user_text.strip().lower() == "exit":
        print("Bot: Goodbye! 👋")
        break
    elif not user_text.strip():
        print("Bot: Please type something or 'exit' to quit.")
        continue
    else:
        bot_resp = get_answer(user_text)
        print("Bot:", bot_resp)

🤖 Chatbot (Rule Based) Ready. Type 'exit' to quit.

Bot: i'm fine. how about yourself?
Bot: i'm pretty good. thanks for asking.
Bot: i'm doing well. how about you?
Bot: Goodbye! 👋


In [13]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# 2) Create text input box & output area
input_box = widgets.Text(placeholder="Type your message here...")
send_button = widgets.Button(description="Send")
chat_area = widgets.Output()

# 3) Define what happens when “Send” is clicked
def on_send_clicked(b):
    user_msg = input_box.value
    if not user_msg.strip():
        return
    with chat_area:
        clear_output(wait=True)
        print("You:", user_msg)
        if user_msg.strip().lower() == "exit":
            print("Bot: Goodbye! 👋")
        else:
            response = get_answer(user_msg)
            print("Bot:", response)
    input_box.value = ""  # clear input

send_button.on_click(on_send_clicked)

# 4) Layout
display(chat_area, widgets.HBox([input_box, send_button]))

Output()

HBox(children=(Text(value='', placeholder='Type your message here...'), Button(description='Send', style=Butto…