# __Creating a Chatbot__

### __Name:__
Tonia Ethuakhor

### __Date:__
February 2025

### __1. Library Installation & Setup__

In [10]:
# Import essential libraries

import nltk  # For text processing (tokenising sentences and words)
import numpy as np  # For numerical operations and handling arrays
from nltk.tokenize import sent_tokenize  # Splits long text into individual sentences
from sentence_transformers import SentenceTransformer  # AI model to convert text into embeddings
from sklearn.metrics.pairwise import cosine_similarity  # To compare question and text embeddings
import os  # For file handling (checking files, reading, writing)

# Download the NLTK sentence tokenizer
nltk.download('punkt')  # Required to split text into sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### __2. Data Loading & Chunking__

In [15]:
# This section opens and reads the text files containing the biographies of Helen Keller and Nikola Tesla 
# and breaks them into individual sentences. Each sentence becomes a small, searchable piece of information.
# Using sentences (instead of whole paragraphs) allows the chatbot to return # clear and precise answers 
#rather than the entire biography.

def load_and_chunk(file_path):
    """
    Opens a text file and splits its content into individual sentences.
    Each sentence is treated as a separate chunk that the chatbot can compare
    against the user's question.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    
    # Split the text into sentences and ignore very short ones
    chunks = [
        sentence.strip()
        for sentence in sent_tokenize(raw_text)
        if len(sentence.strip()) > 20
    ]
    
    return chunks

### __3. Question Matching Function__

In [16]:
# This section compares the user's question with all stored text chunks using semantic similarity. 
# It identifies the most relevant paragraph by converting both the question and the text into embeddings 
# and selecting the closest match based on cosine similarity.

def ask_chatbot(query, chunks, model):
    """
    Converts text and user question into embeddings and finds the closest match.
    Returns the most relevant paragraph and the match confidence.
    """
    # Convert text into embeddings (numbers that AI can compare)
    chunk_embeddings = model.encode(chunks)
    query_embedding = model.encode([query])
    
    # Compare the question to all chunks
    scores = cosine_similarity(query_embedding, chunk_embeddings)
    best_match_index = np.argmax(scores)
    
    return chunks[best_match_index], scores[0][best_match_index]

### __4. Interactive Chat Session__

In [17]:
# This section handles the interactive chatbot session. It continuously accepts user questions, 
# retrieves the best-matching answer from the knowledge base, and displays it until the user
# chooses to exit the program.

def start_chatting(chunks, model):
    """
    Starts an interactive chat session.
    The user can type questions and end the session by typing 'exit' or 'quit'.
    """
    print("\n" + "=" * 45)
    print("AI CHATBOT IS ONLINE")
    print("Type your question and press Enter.")
    print("Type 'exit' or 'quit' to end the session.")
    print("=" * 45 + "\n")

    while True:
        user_input = input("You: ").strip()

        # Exit condition to prevent the program from running forever
        if user_input.lower() in ["exit", "quit", "bye"]:
            print("Chatbot: Session ended. Goodbye!")
            break

        # Ignore empty input
        if not user_input:
            print("Chatbot: Please type a question or 'exit' to quit.")
            continue

        answer, confidence = ask_chatbot(user_input, chunks, model)

        print(f"\nChatbot: {answer}")
        print(f"(Match Confidence: {confidence:.2f})")
        print("-" * 45)

### __5. Launching the Chatbot__

In [18]:
# Step A: Load the AI Model (the "brain" of the bot)
print("Loading AI model... please wait...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step B: Load and prepare both data files
chunks_hk = load_and_chunk("helen_keller.txt")      
chunks_nt = load_and_chunk("nikola_tesla.txt")      

# Combine the chunks from both files into a single list
data_chunks = chunks_hk + chunks_nt

# Step C: Start the interactive chat
start_chatting(data_chunks, model)

Loading AI model... please wait...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



AI CHATBOT IS ONLINE
Type your question and press Enter.
Type 'exit' or 'quit' to end the session.



You:  exit


Chatbot: Session ended. Goodbye!
