# Environment Setup
## Installations

In [None]:
#Installations
!pip install -U sentence-transformers
!pip install nltk
!pip install torch
!pip install scikit-learn

## Imports

In [None]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime
from sentence_transformers import SentenceTransformer, util
import torch
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
import matplotlib.pyplot as plt

### Import Error Handling

In [None]:
# Detect whether the notebook runs inside Google Colab.
# `files` is only bound when the import succeeds, so any later use of it
# must be guarded by IN_COLAB.
try:
    from google.colab import files
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

## Downloads

In [None]:
#Downloads
# Fetch the 'punkt_tab' NLTK resource - presumably the tokenizer data needed by
# nltk.word_tokenize, which tokenize_context calls later (TODO confirm).
nltk.download('punkt_tab')

## Initializing

In [None]:
# Notebook-level state; the placeholders below are overwritten by later cells.
corpus = []  # later rebound to the cleaned 'Greeting' column of df
corpus_embeddings = None  # later rebound to model.encode(corpus, ...)
debug_log = []  # get_response appends diagnostic messages here

## Main RAG Context File Creation

In [None]:
#Creating RAG File
# Knowledge base for retrieval: one "Topic: description" entry per line.
# This is a regular (non-raw) string, so the backslash of the regex example is
# written as `\\d`; a bare `\d` is an invalid escape sequence.
rag_context = """
Programming: Programming is the process of writing instructions for computers to perform tasks, using languages like Python, JavaScript, or Java. It involves creating algorithms, managing data, and solving problems through code.
Variables: Variables store data in a program, like numbers or text. They have names and values, e.g., `x = 5` in Python. Variables allow dynamic data manipulation and are fundamental to coding.
Data Types: Data types define the kind of data a variable holds, e.g., integers (whole numbers), floats (decimals), strings (text), or booleans (True/False). Understanding types ensures proper data handling.
Control Structures: Control structures like loops and conditionals manage program flow. If-statements execute code based on conditions, e.g., `if x > 0: print('Positive')`. Loops repeat tasks efficiently.
Loops: Loops repeat code blocks. For-loops iterate over sequences, e.g., `for i in range(5): print(i)`. While-loops run until a condition is false, e.g., `while x < 5: x += 1`.
Functions: Functions are reusable code blocks that perform specific tasks, e.g., `def add(a, b): return a + b`. They improve code organization, readability, and modularity.
Lists: Lists store multiple items in a single variable, e.g., `numbers = [1, 2, 3]` in Python. They support operations like appending, indexing, and slicing for data manipulation.
Dictionaries: Dictionaries store key-value pairs, e.g., `person = {'name': 'Alice', 'age': 25}`. They allow fast data retrieval using keys, useful for structured data.
Object-Oriented Programming (OOP): OOP organizes code into objects with attributes and methods, e.g., classes in Python. It promotes code reuse and modularity, like `class Dog: def bark(self): print('Woof')`.
Error Handling: Error handling manages runtime errors using try-except blocks, e.g., `try: x = 1/0 except ZeroDivisionError: print('Cannot divide by zero')`. It ensures robust programs.
Input/Output: Input/Output manages user interaction, e.g., `input('Enter name: ')` for input and `print('Hello')` for output. File I/O reads/writes data to files, like `open('file.txt', 'r')`.
Debugging: Debugging finds and fixes code errors. Techniques include using print statements, debuggers, or tools like VS Code’s debugger to trace issues and ensure correct program behavior.
Algorithms: Algorithms are step-by-step procedures for solving problems, like sorting or searching. Examples include bubble sort for ordering lists or binary search for finding items efficiently.
Data Structures: Data structures organize data, e.g., arrays, linked lists, stacks, or queues. They optimize tasks like searching or sorting, critical for efficient programming.
Functions vs Methods: Functions are standalone, while methods belong to objects in OOP. For example, `len(list)` is a function, but `list.append(item)` is a method of the list object.
Conditional Statements: Conditionals control program flow based on conditions, e.g., `if-elif-else` in Python. They allow programs to make decisions, like `if score >= 90: print('A')`.
Modules: Modules are reusable code files, e.g., `import math` in Python. They provide functions like `math.sqrt(16)` and organize large projects by separating code into files.
Python Basics: Python is a versatile, beginner-friendly language. It uses simple syntax, e.g., `print('Hello, World!')`, and supports web development, data analysis, and automation.
JavaScript Basics: JavaScript is used for web development, adding interactivity to websites. Example: `console.log('Hello')`. It runs in browsers and supports event-driven programming.
Version Control: Version control systems like Git track code changes. Commands like `git commit` save snapshots, enabling collaboration and code recovery.
Arrays: Arrays are fixed-size data structures storing elements of the same type, e.g., `[1, 2, 3]` in JavaScript. They allow indexed access and are used for efficient data storage.
Strings: Strings are sequences of characters, e.g., `name = "Alice"` in Python. Operations like concatenation (`"Hello " + "World"`) and slicing (`name[0:3]`) manipulate text.
Tuples: Tuples are immutable lists in Python, e.g., `point = (3, 4)`. They are used for fixed data collections, offering faster access than lists due to immutability.
Sets: Sets store unique elements, e.g., `unique_nums = {1, 2, 3}` in Python. They support operations like union and intersection, useful for removing duplicates or comparing collections.
Recursion: Recursion is when a function calls itself to solve smaller problems, e.g., factorial: `def fact(n): return 1 if n == 0 else n * fact(n-1)`. It simplifies complex tasks like tree traversal.
File Handling: File handling involves reading/writing files, e.g., `with open('data.txt', 'w') as f: f.write('Hello')`. It’s critical for data persistence in programming.
Regular Expressions: Regular expressions (regex) match patterns in text, e.g., `re.search(r'\\d+', 'abc123')` finds numbers. They’re used for validation and data extraction.
APIs: APIs allow programs to communicate, e.g., fetching data with `requests.get('https://api.example.com')` in Python. They enable integration with external services like weather or stock APIs.
Lists Comprehension: List comprehensions create lists concisely, e.g., `[x**2 for x in range(5)]` in Python yields `[0, 1, 4, 9, 16]`. They simplify data transformations.
Lambda Functions: Lambda functions are anonymous, single-line functions, e.g., `lambda x: x*2` in Python. They’re used for short, throwaway operations in functional programming.
"""

In [None]:
#Saving RAG Context File as txt
# Write the stripped context to 'rag_context.txt', a relative path resolved
# against the current working directory (printed on success).
file_path = 'rag_context.txt'
try:
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(rag_context.strip())
    print(f"Created {file_path} in {os.getcwd()}")
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print(f"Error creating {file_path}: {e}")

In [None]:
# Checking RAG File Path
# Check the same relative path the file was written to. The previous hard-coded
# '/content/rag_context.txt' is only correct when the working directory is
# /content (as in Colab), so the check reported False everywhere else.
print(f"RAG Context File Exists T/F: {os.path.exists('rag_context.txt')}")

## Main Retrieval Database Creation

### Customized Greeting and Basic Questions/Responses Pairs




In [None]:
#Creating Custom Greetings and Responses
# Each row is a (prompt, response, category) tuple; category is "Greeting" for
# small talk and "Response" for basic concept answers.
# These are regular (non-raw) strings, so the regex example's backslash is
# written as `\\d`; a bare `\d` is an invalid escape sequence.
greetings = [
    ("Hi there!", "Hello, coder! Ready to dive into programming concepts?", "Greeting"),
    ("Good morning!", "Good morning! Let’s start coding today!", "Greeting"),
    ("Hey bot!", "Hey! What coding question can I answer for you?", "Greeting"),
    ("Hello!", "Hi! I’m here to help with your programming journey.", "Greeting"),
    ("Yo!", "Yo, coder! Let’s debug some code!", "Greeting"),
    ("What's up?", "What’s up? Ready to learn Python or JavaScript?", "Greeting"),
    ("Howdy!", "Howdy! Let’s write some awesome code today.", "Greeting"),
    ("Greetings!", "Greetings! Curious about coding concepts?", "Greeting"),
    ("Hi!", "Hi! Want to master programming basics?", "Greeting"),
    ("Good evening!", "Evening! Let’s tackle a coding problem.", "Greeting"),
    ("Hey!", "Hey there! What’s your coding query today?", "Greeting"),
    ("Sup!", "Sup! Let’s talk loops and functions.", "Greeting"),
    ("Good day!", "Good day! Ready to code like a pro?", "Greeting"),
    ("Nice to meet you!", "Nice to meet you! Let’s learn programming.", "Greeting"),
    ("Yo bot!", "Yo! Ask me anything about coding.", "Greeting"),
    ("Salutations!", "Salutations! Ready to explore Python?", "Greeting"),
    ("Hey assistant!", "Hey! What coding topic can I explain?", "Greeting"),
    ("Hi friend!", "Hello, friend! Let’s code something cool.", "Greeting"),
    ("Hello robot!", "Hi! I’m your programming tutor bot.", "Greeting"),
    ("Bot, hello!", "Greetings! What coding concept do you want?", "Greeting"),
    ("Good to see you!", "Great to see you! Got a coding question?", "Greeting"),
    ("Hola!", "Hola! Ready to program in Python or JS?", "Greeting"),
    ("Hey AI!", "Hey coder! What’s your programming puzzle?", "Greeting"),
    ("Hi chatbot!", "Hi! Ask me about coding basics.", "Greeting"),
    ("Hi guru!", "Hello! I’m your coding guru today.", "Greeting"),
    ("Good night!", "Good night! Coding questions never sleep!", "Greeting"),
    ("Hi code bot!", "Hey! Let’s boost your coding skills.", "Greeting"),
    ("Yo assistant!", "Yo! Curious about programming?", "Greeting"),
    ("Hello genius!", "Haha, thanks! Let’s code like geniuses.", "Greeting"),
    ("Hi helper!", "Hi! Here to help with coding queries.", "Greeting"),
    ("Hello dear!", "Hello! Ready to master coding concepts?", "Greeting"),
    ("Hi pal!", "Hey pal! Want to learn about loops?", "Greeting"),
    ("Greetings bot!", "Greetings! Let’s decode programming.", "Greeting"),
    ("Hi team!", "Hello! Ready to team up on coding?", "Greeting"),
    ("Wassup!", "Wassup! Time to write some code?", "Greeting"),
    ("Hi sir!", "Hey! Let’s tackle your coding doubts.", "Greeting"),
    ("Yo code master!", "Yo! Let’s master programming.", "Greeting"),
    ("Hi legend!", "Thanks! How can I help with coding?", "Greeting"),
    ("Hi expert!", "Hi! Ask me anything about programming.", "Greeting"),
    ("Yo techie!", "Yo! This techie knows code!", "Greeting"),
    ("Hi mentor!", "Hi! Let’s plan your coding journey.", "Greeting"),
    ("Hi assistant bot!", "Hi! Your programming assistant here.", "Greeting"),
    ("Hi coder!", "Hey! Let’s write some clean code!", "Greeting"),
    ("Hey chat!", "Hey! What coding topic today?", "Greeting"),
    ("Hey programmer bot!", "Hey! Ask me about coding tricks.", "Greeting"),
    ("Hi wise one!", "Thanks! Let’s talk code wisdom.", "Greeting"),
    ("Good noon!", "Good noon! Time to debug!", "Greeting"),
    ("Hello smart bot!", "Hi! What’s your coding question?", "Greeting"),
    ("Hi coder bot!", "Hello! Let’s solve coding problems.", "Greeting"),
    ("Hi debugger!", "Hi! Ready to squash some bugs?", "Greeting"),
    ("Yes", "Let’s code!", "Greeting"),
    ("Who are you?", "I’m ProgBot, your friendly programming tutor bot!", "Greeting"),
    ("What is a variable?", "A variable stores data like numbers or text, e.g., `x = 5` in Python. It’s like a labeled box for values.", "Response"),
    ("What are data types?", "Data types define what a variable holds, like integers (e.g., 5), floats (e.g., 3.14), strings (e.g., 'hello'), or booleans (True/False).", "Response"),
    ("What is a loop?", "A loop repeats code. For-loops iterate over sequences, e.g., `for i in range(5): print(i)`. While-loops run until a condition is false.", "Response"),
    ("What is a function?", "A function is a reusable code block for a task, e.g., `def add(a, b): return a + b`. It improves code organization.", "Response"),
    ("What is OOP?", "Object-Oriented Programming (OOP) organizes code into objects with attributes and methods, e.g., `class Dog: def bark(self): print('Woof')`.", "Response"),
    ("How do I debug?", "Debugging finds and fixes errors using print statements, breakpoints, or tools like VS Code’s debugger to trace issues.", "Response"),
    ("What is a list?", "A list stores multiple items, e.g., `numbers = [1, 2, 3]` in Python. You can append, index, or slice it.", "Response"),
    ("What is a dictionary?", "A dictionary stores key-value pairs, e.g., `person = {'name': 'Alice', 'age': 25}`. Keys allow fast data retrieval.", "Response"),
    ("What is error handling?", "Error handling uses try-except blocks to manage errors, e.g., `try: x = 1/0 except ZeroDivisionError: print('Cannot divide')`.", "Response"),
    ("What is Python?", "Python is a beginner-friendly language with simple syntax, e.g., `print('Hello, World!')`. It’s used for web, data, and automation.", "Response"),
    ("What is JavaScript?", "JavaScript adds interactivity to websites, e.g., `console.log('Hello')`. It runs in browsers and supports events.", "Response"),
    ("What is a conditional?", "Conditionals control flow with if-elif-else, e.g., `if score >= 90: print('A')`. They make decisions in code.", "Response"),
    ("What is a module?", "A module is a reusable code file, e.g., `import math`. It provides functions like `math.sqrt(16)` for organization.", "Response"),
    ("What is recursion?", "Recursion is a function calling itself to solve smaller problems, e.g., `def fact(n): return 1 if n == 0 else n * fact(n-1)`.", "Response"),
    ("What is a string?", "A string is a sequence of characters, e.g., `name = 'Alice'`. You can concatenate or slice it, like `name[0:3]`.", "Response"),
    ("What is a tuple?", "A tuple is an immutable list, e.g., `point = (3, 4)` in Python. It’s faster and used for fixed data.", "Response"),
    ("What is a set?", "A set stores unique elements, e.g., `unique_nums = {1, 2, 3}`. It supports union and intersection operations.", "Response"),
    ("What is an array?", "An array stores elements of the same type, e.g., `[1, 2, 3]` in JavaScript. It allows indexed access.", "Response"),
    ("What is file handling?", "File handling reads/writes files, e.g., `with open('data.txt', 'w') as f: f.write('Hello')` for data persistence.", "Response"),
    ("What is regex?", "Regular expressions (regex) match text patterns, e.g., `re.search(r'\\d+', 'abc123')` finds numbers for validation.", "Response"),
    ("What is an API?", "An API lets programs communicate, e.g., `requests.get('https://api.example.com')` fetches data from external services.", "Response"),
    ("What is list comprehension?", "List comprehension creates lists concisely, e.g., `[x**2 for x in range(5)]` yields `[0, 1, 4, 9, 16]` in Python.", "Response"),
    ("What is a lambda function?", "A lambda function is an anonymous, single-line function, e.g., `lambda x: x*2`. It’s used for quick operations.", "Response"),
    ("What is version control?", "Version control like Git tracks code changes, e.g., `git commit` saves snapshots for collaboration and recovery.", "Response"),
    ("How do I write a loop?", "Use a for-loop, e.g., `for i in range(5): print(i)`, or while-loop, e.g., `while x < 5: x += 1`, to repeat tasks.", "Response"),
    ("How do I write a function?", "Define a function with `def`, e.g., `def greet(name): return f'Hello, {name}'`. Call it like `greet('Alice')`.", "Response"),
    ("How do I handle errors?", "Use try-except, e.g., `try: result = 10/0 except ZeroDivisionError: print('Error: Division by zero')`.", "Response"),
    ("How do I use lists?", "Create a list, e.g., `mylist = [1, 2, 3]`. Append with `mylist.append(4)`, access with `mylist[0]`, or slice with `mylist[1:3]`.", "Response"),
    ("How do I use dictionaries?", "Create a dictionary, e.g., `d = {'key': 'value'}`. Access with `d['key']`, add with `d['new'] = 'value'`.", "Response"),
]

### Advanced Programming Question/Answer Pairs

In [None]:
#Creating Programming Q/A Pairs
# Each row is a (question, answer, category) tuple with category "Programming".
# These are regular (non-raw) strings, so the regex example's backslash is
# written as `\\d`; a bare `\d` is an invalid escape sequence.
qa_pairs = [
    ("What is a dictionary?", "A dictionary stores key-value pairs, e.g., `person = {'name': 'Alice', 'age': 25}` in Python. Access values with keys, like `person['name']`.", "Programming"),
    ("How do I use a dictionary?", "Create a dictionary with `d = {'key': 'value'}`. Access with `d['key']`, add with `d['new'] = 'value'`, or remove with `del d['key']`.", "Programming"),
    ("What is a tuple?", "A tuple is an immutable list, e.g., `point = (3, 4)` in Python. It’s faster and used for fixed data, accessed like `point[0]`.", "Programming"),
    ("What is a set?", "A set stores unique elements, e.g., `unique_nums = {1, 2, 3}`. It supports operations like union (`|`) or intersection (`&`).", "Programming"),
    ("What is recursion?", "Recursion is a function calling itself to solve smaller problems, e.g., `def fact(n): return 1 if n == 0 else n * fact(n-1)`.", "Programming"),
    ("How do I write a recursive function?", "Define a base case and recursive case, e.g., `def fib(n): return n if n <= 1 else fib(n-1) + fib(n-2)` for Fibonacci numbers.", "Programming"),
    ("What is debugging?", "Debugging finds and fixes code errors using print statements, breakpoints, or tools like VS Code’s debugger to trace issues.", "Programming"),
    ("How do I debug code?", "Use print statements, e.g., `print(variable)`, set breakpoints in an IDE, or step through code with a debugger to find errors.", "Programming"),
    ("What is a string?", "A string is a sequence of characters, e.g., `name = 'Alice'`. Use operations like concatenation (`'Hello ' + 'World'`) or slicing (`name[0:3]`).", "Programming"),
    ("How do I manipulate strings?", "Concatenate with `+`, e.g., `'Hello ' + 'World'`, slice with `text[1:4]`, or use methods like `text.upper()` or `text.replace('a', 'b')`.", "Programming"),
    ("What is an array?", "An array stores elements of the same type, e.g., `[1, 2, 3]` in JavaScript. Access with indices, like `array[0]`. In Python, lists are similar.", "Programming"),
    ("What is file handling?", "File handling reads/writes files, e.g., `with open('data.txt', 'w') as f: f.write('Hello')` for writing, or `'r'` for reading.", "Programming"),
    ("How do I read a file?", "Use `with open('file.txt', 'r') as f: content = f.read()` to read all text, or `f.readlines()` for a list of lines.", "Programming"),
    ("What is a regular expression?", "Regular expressions (regex) match text patterns, e.g., `re.search(r'\\d+', 'abc123')` finds numbers for validation or extraction.", "Programming"),
    ("How do I use regex?", "Use `re` module, e.g., `import re; match = re.search(r'[a-z]+', 'Hello123')` to find lowercase letters. Test patterns with `re.compile()`.", "Programming"),
    ("What is an API?", "An API lets programs communicate, e.g., `requests.get('https://api.example.com')` in Python fetches data from external services.", "Programming"),
    ("How do I use an API?", "Use `requests` in Python, e.g., `import requests; response = requests.get('https://api.example.com/data')` to fetch JSON data.", "Programming"),
    ("What is list comprehension?", "List comprehension creates lists concisely, e.g., `[x**2 for x in range(5)]` yields `[0, 1, 4, 9, 16]` in Python.", "Programming"),
    ("What is a lambda function?", "A lambda function is an anonymous function, e.g., `lambda x: x*2`. Use it for short operations, like `map(lambda x: x*2, [1, 2, 3])`.", "Programming"),
    ("What is version control?", "Version control like Git tracks code changes, e.g., `git commit -m 'Update'` saves snapshots for collaboration and recovery.", "Programming"),
    ("How do I use Git?", "Initialize with `git init`, stage with `git add .`, commit with `git commit -m 'message'`, and push to a repo with `git push origin main`.", "Programming"),
    ("What are modules?", "Modules are reusable code files, e.g., `import math` provides functions like `math.sqrt(16)`. Organize code with `import mymodule`.", "Programming"),
    ("What is a conditional statement?", "Conditionals control flow, e.g., `if x > 0: print('Positive') elif x == 0: print('Zero') else: print('Negative')`.", "Programming"),
    ("What is Python?", "Python is a beginner-friendly language with simple syntax, e.g., `print('Hello, World!')`, used for web, data analysis, and automation.", "Programming"),
    ("What is JavaScript?", "JavaScript adds interactivity to websites, e.g., `console.log('Hello')`. It runs in browsers and supports event-driven coding.", "Programming"),
    ("What is a stack?", "A stack is a LIFO (Last In, First Out) data structure, e.g., implemented with a list in Python: `stack.append(1); stack.pop()`.", "Programming"),
    ("What is a queue?", "A queue is a FIFO (First In, First Out) data structure, e.g., use `from collections import deque; q = deque([1, 2]); q.popleft()`.", "Programming"),
    ("What is a linked list?", "A linked list is a data structure with nodes containing data and pointers to the next node, e.g., `class Node: def __init__(self, data): self.data = data; self.next = None`.", "Programming"),
    ("What is a binary search?", "Binary search finds an item in a sorted list by halving the search space, e.g., `def binary_search(arr, x):` with O(log n) complexity.", "Programming"),
    ("How do I sort a list?", "In Python, use `list.sort()` for in-place sorting or `sorted(list)` for a new sorted list, e.g., `sorted([3, 1, 2])` returns `[1, 2, 3]`.", "Programming"),
    ("What is a class method?", "A class method operates on the class, e.g., `@classmethod def from_string(cls, s):` in Python, using `cls` instead of `self`.", "Programming"),
    ("What is inheritance?", "Inheritance allows a class to inherit attributes and methods, e.g., `class Animal: def speak(self): pass; class Dog(Animal): def speak(self): return 'Woof'`.", "Programming"),
    ("What is polymorphism?", "Polymorphism lets different classes share method names, e.g., `Dog().speak()` and `Cat().speak()` behave differently but are called the same.", "Programming"),
    ("What is a closure?", "A closure is a function that retains access to its outer scope’s variables, e.g., `def outer(x): def inner(y): return x + y; return inner`.", "Programming"),
    ("What is a decorator?", "A decorator modifies a function’s behavior, e.g., `def my_decorator(func): def wrapper(): print('Before'); func(); print('After'); return wrapper`.", "Programming"),
    ("How do I write a list comprehension?", "Write `[expression for item in iterable if condition]`, e.g., `[x for x in range(10) if x % 2 == 0]` yields `[0, 2, 4, 6, 8]`.", "Programming"),
    ("What is a generator?", "A generator yields values one at a time, e.g., `def my_gen(): for i in range(3): yield i` saves memory for large datasets.", "Programming"),
    ("What is a dictionary comprehension?", "A dictionary comprehension creates dictionaries, e.g., `{x: x**2 for x in range(5)}` yields `{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}`.", "Programming"),
    ("What is a try-except block?", "A try-except block catches errors, e.g., `try: x = int('abc') except ValueError: print('Invalid number')` handles invalid inputs.", "Programming"),
    ("What is a breakpoint?", "A breakpoint pauses code execution for debugging, set in IDEs like VS Code or with `import pdb; pdb.set_trace()` in Python.", "Programming"),
    ("What is a package in Python?", "A package is a directory of modules, e.g., `my_package/` with `__init__.py`. Import with `from my_package import module`.", "Programming"),
    ("What is time complexity?", "Time complexity measures an algorithm’s efficiency, e.g., O(n) for linear search, O(log n) for binary search, O(n^2) for bubble sort.", "Programming"),
    ("What is a hash table?", "A hash table stores key-value pairs with a hash function, e.g., Python’s dictionary, offering O(1) average-case lookup time.", "Programming"),
    ("How do I write a class constructor?", "Use `__init__`, e.g., `class Person: def __init__(self, name): self.name = name` to initialize object attributes.", "Programming"),
    ("What is a static method?", "A static method doesn’t use instance or class data, e.g., `@staticmethod def add(x, y): return x + y` in Python.", "Programming"),
    ("What is a loop break?", "A `break` statement exits a loop early, e.g., `for i in range(10): if i == 5: break; print(i)` stops at 5.", "Programming"),
    ("What is a loop continue?", "A `continue` statement skips to the next loop iteration, e.g., `for i in range(5): if i == 2: continue; print(i)` skips 2.", "Programming"),
    ("What is a default parameter?", "A default parameter sets a default value, e.g., `def greet(name='User'): return f'Hello, {name}'` uses 'User' if no argument is given.", "Programming"),
    ("What is a keyword argument?", "A keyword argument specifies parameter names, e.g., `def func(a, b): pass; func(b=2, a=1)` allows flexible argument order.", "Programming"),
    ("What is a docstring?", "A docstring documents a function, e.g., `def add(a, b): '''Returns sum of a and b.''' return a + b` for clarity.", "Programming"),
]


### Creating the Dataset

In [None]:
#Combining Interactions
# Greeting/basic rows first, then the programming Q/A rows.
# NOTE(review): several prompts (e.g. "What is a dictionary?") appear in both
# lists with different answers, so the combined data contains duplicate prompts.
all_pairs = greetings + qa_pairs

In [None]:
#Creating Dataset
# One row per (prompt, response, category) tuple. The "Greeting" column holds the
# user prompt for every row, including the programming questions.
ab = pd.DataFrame(all_pairs, columns=["Greeting", "Response", "Category"])

In [None]:
#Creating a csv File for Reviewal in SideBar
# DataFrame.to_csv returns None when it is given a path, so the result is not
# assigned; `df` is created by reading the file back in the next cell.
ab.to_csv("DataSet 2.0", index=False)

In [None]:
#Reading DataSet
# Load the dataset back from the CSV file written above; `df` is the frame used
# by all later cells.
df=pd.read_csv("DataSet 2.0")

In [None]:
#DataSet 2.0
# Preview the first rows of the freshly loaded dataset.
df.head()

# Pre-processing and Model Set-up

## Defining Model

In [None]:
#Defining Pre-Trained Model
# Sentence-embedding model, loaded by name; later cells use model.encode for the
# prompt corpus, the RAG segments and user queries.
model = SentenceTransformer('all-MiniLM-L12-v2')

## Pre-Processing

In [None]:
#Cleaning Dataframe
# Drop every row containing a missing value, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [None]:
#DataSet 2.0 Post-Cleaning
# Preview the first rows after dropping incomplete rows.
df.head()

In [None]:
#Defining Text Pre-Processing Function
def clean_text(text):
    """Normalise a piece of text for matching.

    Runs of whitespace are collapsed to a single space, the result is
    lower-cased and surrounding whitespace is removed. Any non-string
    input yields an empty string.
    """
    if isinstance(text, str):
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.lower().strip()
    return ""


In [None]:
#Dataframe Pre-Processing
# RespLen is the response length in characters; Flagged marks responses
# shorter than 30 characters.
df['RespLen'] = df['Response'].map(len)
df['Flagged'] = df['RespLen'].lt(30)

In [None]:
# Normalise every prompt with clean_text, then embed the whole corpus in one call.
corpus = [clean_text(prompt) for prompt in df['Greeting']]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

In [None]:
#DataSet 2.0 Post-Pre-Processing
# Display the cleaned prompts (this prints the entire list).
corpus

## Error Handling for RAG

In [None]:
#RAG Backup: In Case Primary RAG File Not Found
#Defining Default RAG Context

def initialize_rag_context(file_path='rag_context.txt'):
    """Return the RAG context text, creating the file from a built-in default if missing.

    Args:
        file_path: Path of the UTF-8 text file that holds the context.

    Returns:
        The stripped contents of `file_path`. When the file cannot be created
        or read, the stripped built-in default context is returned instead, so
        every return path yields text without surrounding whitespace.
    """
    # Regular (non-raw) string: the backslash of the regex example is escaped as
    # `\\d` because a bare `\d` is an invalid escape sequence. Stripped once here
    # so the fallback returns match what is written to / read from the file.
    default_context = """
Programming: Programming is writing instructions in languages like Python or JavaScript to execute tasks, solve problems, or process data. It involves algorithms, data management, and logic.
Variables: Variables store data like numbers or text, e.g., `x = 5` in Python. They’re used to hold and manipulate values, acting as labeled containers in a program.
Data Types: Data types define a variable’s content, e.g., integers (5), floats (3.14), strings ('hello'), booleans (True/False), lists ([1, 2, 3]), or dictionaries ({'key': 'value'}).
Control Structures: Control structures like loops and conditionals direct program flow. If-statements execute based on conditions, e.g., `if x > 0: print('Positive')`. Loops repeat tasks.
Loops: Loops repeat code blocks. For-loops iterate over sequences, e.g., `for i in range(5): print(i)`. While-loops run until a condition is false, e.g., `while x < 5: x += 1`.
Functions: Functions are reusable code blocks for tasks, e.g., `def add(a, b): return a + b`. They improve modularity and readability, called like `add(2, 3)`.
Lists: Lists store multiple items, e.g., `numbers = [1, 2, 3]`. Operations include appending (`numbers.append(4)`), indexing (`numbers[0]`), and slicing (`numbers[1:3]`).
Dictionaries: Dictionaries store key-value pairs, e.g., `person = {'name': 'Alice', 'age': 25}`. Access values with keys, e.g., `person['name']`, for fast data retrieval.
Object-Oriented Programming (OOP): OOP uses objects with attributes and methods, e.g., `class Dog: def bark(self): print('Woof')`. It promotes reusable, modular code.
Error Handling: Error handling manages runtime errors with try-except blocks, e.g., `try: x = 1/0 except ZeroDivisionError: print('Cannot divide by zero')`.
Input/Output: I/O handles user interaction, e.g., `input('Enter name: ')` for input, `print('Hello')` for output. File I/O uses `open('file.txt', 'r')` for reading/writing.
Debugging: Debugging finds and fixes errors using print statements, breakpoints, or IDE debuggers like VS Code to trace code execution and resolve issues.
Algorithms: Algorithms are step-by-step solutions, e.g., bubble sort for ordering lists or binary search for finding items. They optimize tasks like searching or sorting.
Data Structures: Data structures organize data, e.g., arrays, linked lists, stacks (LIFO), or queues (FIFO). They improve efficiency for specific tasks.
Python Basics: Python is a beginner-friendly language with simple syntax, e.g., `print('Hello, World!')`. It supports web development, data analysis, and automation.
JavaScript Basics: JavaScript adds interactivity to websites, e.g., `console.log('Hello')`. It runs in browsers, supporting event-driven programming.
Version Control: Version control with Git tracks code changes, e.g., `git commit -m 'Update'` saves snapshots for collaboration and recovery.
Arrays: Arrays store elements of the same type, e.g., `[1, 2, 3]` in JavaScript. In Python, lists are used similarly, accessed via indices like `array[0]`.
Strings: Strings are character sequences, e.g., `name = 'Alice'`. Operations include concatenation (`'Hello ' + 'World'`) and slicing (`name[0:3]`).
Tuples: Tuples are immutable lists, e.g., `point = (3, 4)`. They’re faster and used for fixed data, accessed like `point[0]`.
Sets: Sets store unique elements, e.g., `unique_nums = {1, 2, 3}`. They support union (`|`) and intersection (`&`) for data comparison.
Recursion: Recursion involves functions calling themselves, e.g., `def fact(n): return 1 if n == 0 else n * fact(n-1)` for factorials.
File Handling: File handling reads/writes files, e.g., `with open('data.txt', 'w') as f: f.write('Hello')` for persistence.
Regular Expressions: Regex matches text patterns, e.g., `re.search(r'\\d+', 'abc123')` finds numbers for validation or extraction.
APIs: APIs enable program communication, e.g., `requests.get('https://api.example.com')` fetches data from external services.
List Comprehension: List comprehension creates lists concisely, e.g., `[x**2 for x in range(5)]` yields `[0, 1, 4, 9, 16]`.
Lambda Functions: Lambda functions are anonymous, e.g., `lambda x: x*2`. They’re used for quick operations in functional programming.
""".strip()
    if not os.path.exists(file_path):
        print(f"⚠️ RAG context file {file_path} not found. Creating with default context.")
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(default_context)
        except Exception as e:
            # Best effort: the notebook keeps working from the in-memory default.
            print(f"Error creating RAG context file: {e}")
            return default_context
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except Exception as e:
        # Unreadable path (e.g. a directory or bad encoding): fall back to the default.
        print(f"Error loading RAG context: {e}")
        return default_context

# Natural Language Processing

In [None]:
#Defining Tokenisation for Token-Based Retrieval from RAG

def tokenize_context(context, max_words=55):
    """Pack the sentences of `context` into segments of roughly `max_words` tokens.

    Sentences are obtained by splitting on '. ' and measured with
    nltk.word_tokenize. Consecutive sentences are added to the current segment
    until the next one would push it past `max_words`; a single sentence that
    is longer than `max_words` becomes a segment of its own.

    Args:
        context: Text to segment.
        max_words: Token budget per segment.

    Returns:
        List of segment strings, each terminated by '.'. Blank input gives [].
    """
    # NOTE(review): only '. ' is treated as a sentence boundary, so a sentence
    # that ends a line ('.' followed by a newline) stays glued to the next line.

    def _close(parts):
        # Restore the separator removed by split and terminate the segment,
        # without doubling the period when the last sentence already has one.
        text = '. '.join(parts)
        return text if text.endswith('.') else text + '.'

    segments = []
    current_segment = []
    word_count = 0
    for sentence in context.split('. '):
        if not sentence.strip():
            # A blank piece (e.g. empty context) would otherwise become a bare '.'.
            continue
        words = nltk.word_tokenize(sentence)
        # Flush only a non-empty segment: an over-long first sentence must not
        # emit an empty '.' segment ahead of itself.
        if current_segment and word_count + len(words) > max_words:
            segments.append(_close(current_segment))
            current_segment = []
            word_count = 0
        current_segment.append(sentence)
        word_count += len(words)
    if current_segment:
        segments.append(_close(current_segment))
    return segments

In [None]:
#Initialising RAG Context
# Rebind rag_context to the stripped text loaded from 'rag_context.txt'
# (initialize_rag_context creates the file from its default if it is missing).
rag_context = initialize_rag_context('rag_context.txt')

In [None]:
#Making RAG Segments
# An empty context yields an empty list without calling the tokenizer.
rag_segments = tokenize_context(rag_context) if rag_context else []

In [None]:
#Making RAG Embeddings
# With no segments, use an empty tensor instead of calling model.encode.
rag_embeddings = model.encode(rag_segments, convert_to_tensor=True) if rag_segments else torch.tensor([])

In [None]:
#RAG Context File Post-Pre-Processing
# Display the loaded context text (this prints the whole string).
rag_context

In [None]:
#RAG Context File Segment Counts
# Number of segments produced by tokenize_context for the loaded context.
segment_count = len(rag_segments)
print(f"RAG Segments Count: {segment_count}")


# Retrieval Function

In [None]:
#Defining Main Retrieval Function
def get_response(user_input, threshold=0.8, silent=False):
    """Answer a user question by cascading through the available sources.

    Order of lookup:
      1. input validation and hard-coded replies (identity, greetings,
         "what is programming");
      2. rows of ``df`` whose ``Category`` is ``'UserCorrected'``
         (fixed similarity cut-off of 0.5);
      3. the main corpus embeddings (cut-off = ``threshold``);
      4. the RAG segments, with a 5% score boost for segments sharing a
         keyword with the query (fixed cut-off of 0.5).

    Every retrieval attempt appends a diagnostic entry to ``debug_log``.

    Args:
        user_input: The raw question typed by the user.
        threshold: Minimum cosine similarity for accepting a match from the
            main corpus. Previously this argument was ignored in favour of a
            hard-coded 0.7, which made ``optimize_threshold`` a no-op.
        silent: When True, suppress the progress message printed before a
            RAG answer is returned.

    Returns:
        tuple[str, str]: ``(response, source)`` where ``source`` is one of
        ``'error'``, ``'hardcoded'``, ``'retrieval'``, ``'rag'`` or
        ``'no_match'``. The response string is never empty.
    """
    try:
        # Input validation: Sanitize and check for valid input
        if not user_input or not isinstance(user_input, str):
            debug_log.append(f"Invalid input: {user_input}")
            return "Please provide a valid question about programming.", "error"

        user_input_clean = clean_text(user_input)
        query_lower = user_input_clean.lower()
        # Input validation: Check for harmful patterns (e.g., code injection)
        if re.search(r'[<>{};]', user_input_clean):
            debug_log.append(f"Potentially harmful input detected: {user_input_clean}")
            return "Invalid input detected. Please ask a programming-related question.", "error"

        # Hardcoded response for identity and greeting queries
        if 'who are you' in query_lower:
            return "I'm ProgBot, your friendly programming tutor bot! Ready to help with coding questions.", 'hardcoded'
        if query_lower in ['hello', 'hi', 'hey']:
            return "Hello, coder! Ready to dive into programming?", 'hardcoded'
        if 'what is programming' in query_lower:
            return "Programming is writing instructions in languages like Python or JavaScript to execute tasks, solve problems, or process data. It involves algorithms, data management, and logic.", 'hardcoded'

        # Encode user input with error handling
        try:
            query_embedding = model.encode(user_input_clean, convert_to_tensor=True)
        except Exception as e:
            debug_log.append(f"Model encoding error: {str(e)}")
            return "Error processing your question. Please try again.", "error"

        # Check user-corrected responses first with lower threshold
        user_corrected = df[df['Category'] == 'UserCorrected']
        if not user_corrected.empty:
            try:
                corrected_corpus = user_corrected['Greeting'].apply(clean_text).tolist()
                corrected_embeddings = model.encode(corrected_corpus, convert_to_tensor=True)
                corrected_similarity = util.pytorch_cos_sim(query_embedding, corrected_embeddings)
                corrected_score, corrected_index = corrected_similarity.max(), corrected_similarity.argmax().item()
                if corrected_score >= 0.5:  # Lowered threshold for user-corrected
                    debug_log.append({
                        'query': user_input,
                        'retrieval_score': corrected_score.item(),
                        'retrieval_match': corrected_corpus[corrected_index],
                        'source': 'user_corrected'
                    })
                    return user_corrected.iloc[corrected_index]['Response'], 'retrieval'
            except Exception as e:
                debug_log.append(f"User-corrected retrieval error: {str(e)}")
                print(f"Error in user-corrected retrieval: {e}")

        # Main Dataframe retrieval
        try:
            if corpus_embeddings is None or len(corpus_embeddings) == 0:
                debug_log.append("Corpus embeddings empty or not initialized")
                raise ValueError("Corpus embeddings not initialized")
            similarity = util.pytorch_cos_sim(query_embedding, corpus_embeddings)
            score, index = similarity.max(), similarity.argmax().item()
            debug_log.append({
                'query': user_input,
                'retrieval_score': score.item(),
                'retrieval_match': df.iloc[index]['Greeting'],
                'source': 'retrieval'
            })
            # Honour the caller-supplied cut-off so threshold tuning has an effect.
            if score >= threshold:
                return df.iloc[index]['Response'], 'retrieval'
        except Exception as e:
            debug_log.append(f"Main retrieval error: {str(e)}")
            print(f"Error in main retrieval: {e}")

        # RAG Fall-Back: Token/Keyword-Based Retrieval
        if rag_segments:
            try:
                # Reuse the notebook-level segment embeddings when they line up with
                # rag_segments; re-encode only if they are missing or out of date.
                segment_embeddings = globals().get('rag_embeddings')
                cached_shape = getattr(segment_embeddings, 'shape', None)
                if cached_shape is None or len(cached_shape) != 2 or cached_shape[0] != len(rag_segments):
                    segment_embeddings = model.encode(rag_segments, convert_to_tensor=True)
                rag_similarity = util.pytorch_cos_sim(query_embedding, segment_embeddings)
                keywords = ['programming', 'code', 'python', 'javascript', 'function', 'loop', 'variable', 'class', 'debug', 'algorithm']
                for keyword in keywords:
                    if keyword in query_lower:
                        keyword_indices = [i for i, seg in enumerate(rag_segments) if keyword in seg.lower()]
                        for idx in keyword_indices:
                            rag_similarity[0][idx] *= 1.05  # Increased boost for keywords
                rag_score, rag_index = rag_similarity.max(), rag_similarity.argmax().item()
                top_indices = rag_similarity[0].argsort(descending=True)[:3]
                top_matches = [{'score': rag_similarity[0][i].item(), 'match': rag_segments[i][:50]} for i in top_indices]
                debug_log.append({
                    'query': user_input,
                    'rag_score': rag_score.item(),
                    'rag_match': rag_segments[rag_index][:50],
                    'top_matches': top_matches,
                    'source': 'rag'
                })
                if rag_score >= 0.5:  # Lowered threshold for RAG
                    if not silent:
                        print("ProgBot debugging your question...")
                    # Only the first 200 characters of the segment are returned.
                    response = rag_segments[rag_index][:200]
                    return response, 'rag'
            except Exception as e:
                debug_log.append(f"RAG retrieval error: {str(e)}")
                print(f"Error in RAG retrieval: {e}")

        # Log unknown query and suggest rephrasing
        debug_log.append(f"No match found for query: {user_input}")
        return "I couldn't find a clear answer. Try rephrasing your programming question!", "no_match"
    except Exception as e:
        debug_log.append(f"General retrieval error: {str(e)}")
        return "An error occurred. Please ask another programming question.", "error"

## Function-based Error Handling

In [None]:
# Initialising unknown-questions log: in-memory list of user-supplied Q&A entries
# (appended to by log_unknown and chat, dumped to 'unknowns_backup.json' on exit).
unknown_log = []

In [None]:
# Initialising feedback log: list of dicts with keys 'Q', 'A', 'Feedback' and 'Source',
# appended to by chat and written to disk by save_feedback.
feedback_log = []

In [None]:
# Initialising file paths used for persistence.
log_file = 'new_training_data.json'   # user-supplied Q&A entries, one JSON object per line
feedback_file = 'feedback_log.json'   # target of save_feedback
debug_file = 'debug_log.json'         # target of save_debug_log

## Dynamic Learning

In [None]:
#Defining Function for Database Expansion and Dynamic Learning
def log_unknown(user_input):
    """Ask the user for the answer to an unanswered question and learn it.

    When the user types an answer, the Q&A pair is appended to ``unknown_log``
    and to ``log_file`` (one JSON object per line), added to ``df`` and
    embedded into ``corpus_embeddings`` so it is retrievable immediately.

    Args:
        user_input: The question that could not be answered.

    Returns:
        The stripped answer typed by the user, or ``None`` when the user
        pressed Enter without typing anything.
    """
    global df, corpus, corpus_embeddings
    print("Sorry, I don't know the answer to that one, feel free to provide the correct response or exit the chat.")
    response = input("Your response (press Enter to skip): ").strip()
    if not response:
        return None

    new_entry = {
        'Greeting': user_input,
        'Response': response,
        'Category': 'UserGenerated',
        'Time': str(datetime.now()),
        'RespLen': len(response),
        'Flagged': len(response) < 20
    }
    unknown_log.append(new_entry)
    try:
        with open(log_file, 'a') as f:
            f.write(json.dumps(new_entry) + '\n')
        df.loc[len(df)] = new_entry
        if corpus_embeddings is not None and len(corpus_embeddings) == len(df) - 1:
            # Embeddings are in step with df: embed only the new question.
            cleaned = clean_text(user_input)
            new_embedding = model.encode(cleaned, convert_to_tensor=True)
            corpus_embeddings = torch.cat([corpus_embeddings, new_embedding.unsqueeze(0)])
            corpus.append(cleaned)
        else:
            # Embeddings missing or out of step with df: rebuild them from df
            # so row i of the embeddings always describes row i of df.
            corpus = df['Greeting'].apply(clean_text).tolist()
            corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
        # retrain_from_log() is intentionally not called here: the entry is already
        # in df, and replaying the whole log would append it (and every older
        # logged entry) a second time.
        print("✅ Added user response to retrieval database.")
    except Exception as e:
        print(f"Error updating database: {e}")
    return response

In [None]:
#Defining Function to Save Log Feedback
def save_feedback():
    """Write ``feedback_log`` to ``feedback_file`` as indented JSON.

    Any failure is reported on stdout instead of being raised.
    """
    try:
        handle = open(feedback_file, 'w')
        with handle:
            json.dump(feedback_log, handle, indent=2)
    except Exception as err:
        print(f"Error saving feedback: {err}")

In [None]:
#Defining Function to Save Debug Log
def save_debug_log():
    """Write ``debug_log`` to ``debug_file`` as indented JSON.

    The log is serialised in memory before the file is opened, so a
    serialisation failure cannot leave a truncated file behind. Entries that
    are not JSON-serialisable are stored via ``str()`` instead of aborting
    the whole dump. Failures are reported on stdout instead of being raised.
    """
    try:
        payload = json.dumps(debug_log, indent=2, default=str)
        with open(debug_file, 'w') as f:
            f.write(payload)
    except Exception as e:
        print(f"Error saving debug log: {e}")

In [None]:
#Defining Function to Retrain Using Logs
def retrain_from_log():
    """Merge the entries stored in ``log_file`` into ``df`` and re-embed the corpus.

    ``log_file`` holds one JSON object per line. Entries whose
    (Greeting, Response, Category) combination is already present in ``df``
    are skipped, so calling this function repeatedly does not duplicate rows.
    Blank lines are ignored and malformed lines are skipped with a warning.
    When the log file does not exist, an empty one is created and nothing
    else happens.

    Side effects: rebinds the module-level ``df``, ``corpus`` and
    ``corpus_embeddings``. Errors are reported on stdout, never raised.
    """
    global df, corpus, corpus_embeddings
    if not os.path.exists(log_file):
        print(f"⚠️ Log file {log_file} not found. Creating empty log file.")
        try:
            with open(log_file, 'w') as f:
                f.write('')
        except Exception as e:
            print(f"Error creating log file: {e}")
        return
    try:
        with open(log_file, 'r') as f:
            lines = [line for line in f if line.strip()]
        if not lines:
            return

        new_data = []
        skipped = 0
        for line in lines:
            try:
                new_data.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1
        if skipped:
            print(f"⚠️ Skipped {skipped} malformed line(s) in {log_file}.")
        if not new_data:
            return

        new_df = pd.DataFrame(new_data)
        # Only add entries that df does not already contain; the log is replayed
        # in full on every call, so without this check rows would pile up.
        key_cols = [c for c in ('Greeting', 'Response', 'Category')
                    if c in df.columns and c in new_df.columns]
        if key_cols:
            new_df = new_df.drop_duplicates(subset=key_cols)
            known = set(df[key_cols].itertuples(index=False, name=None))
            is_fresh = [row not in known
                        for row in new_df[key_cols].itertuples(index=False, name=None)]
            new_df = new_df.loc[is_fresh]
        if not new_df.empty:
            df = pd.concat([df, new_df], ignore_index=True)

        # Rebuild the corpus from df so embeddings and rows stay aligned.
        corpus = df['Greeting'].apply(clean_text).tolist()
        corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
        print("✅ Retrained. Total Q&A pairs:", len(corpus_embeddings))
    except Exception as e:
        print(f"Error during retraining: {e}")

## Threshold Optimisation

In [None]:
#Defining Function to Optimise Threshold
def optimize_threshold(test_cases, thresholds=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    """Pick the retrieval threshold that answers the most test cases correctly.

    Each candidate threshold is passed to ``get_response`` for every test
    case; a case counts as correct when its ``'expected'`` text occurs
    (case-insensitively) in the response. Three built-in cases are always
    evaluated in addition to ``test_cases``.

    Args:
        test_cases: Iterable of dicts with ``'input'`` and ``'expected'`` keys.
            The caller's collection is not modified.
        thresholds: Candidate thresholds, tried in order. On ties the earliest
            candidate wins.

    Returns:
        The best-scoring threshold, or 0.8 when no candidate answers any case
        correctly (or no candidates are given).

    Side effects: appends the failed cases of the winning threshold (or of
    the first candidate when nothing scored) to ``debug_log`` under the key
    ``'optimize_failed_cases'``.
    """
    best_threshold = 0.8
    best_score = 0
    all_cases = list(test_cases or []) + [
        {'input': 'what is a variable', 'expected': 'stores data'},
        {'input': 'what is a loop', 'expected': 'repeats code'},
        {'input': 'what is python', 'expected': 'programming language'}
    ]
    best_failed = None
    for t in thresholds:
        current_failed = []
        for case in all_cases:
            result, _ = get_response(case['input'], threshold=t, silent=True)
            if not (result and case['expected'].lower() in result.lower()):
                current_failed.append({'input': case['input'], 'expected': case['expected'], 'result': result})
        accuracy = (len(all_cases) - len(current_failed)) / len(all_cases)
        if accuracy > best_score:
            best_score = accuracy
            best_threshold = t
            best_failed = current_failed
        elif best_failed is None:
            # Nothing has scored yet: keep the default threshold but remember
            # these failures so a total miss is still visible in the debug log.
            best_failed = current_failed
    if best_failed:
        debug_log.append({'optimize_failed_cases': best_failed})
    return best_threshold

In [None]:
# Showing Cosine Similarity Scores
def plot_similarity_scores(query):
    """Plot the corpus entries most similar to ``query`` as a horizontal bar chart.

    Up to ten entries with the highest cosine similarity between the encoded
    query and ``corpus_embeddings`` are shown, best match at the top. Labels
    are the first 40 characters of the corresponding ``corpus`` text.

    Args:
        query: Text to compare against the corpus.

    Returns:
        None. Prints a notice and returns early when there are no corpus
        embeddings to compare against.
    """
    if corpus_embeddings is None or len(corpus_embeddings) == 0:
        print("No corpus embeddings available; nothing to plot.")
        return

    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0].cpu().numpy()

    top_indices = np.argsort(cosine_scores)[-10:][::-1]
    # Add the ellipsis only when the text was actually cut.
    labels = [corpus[i][:40] + ('...' if len(corpus[i]) > 40 else '') for i in top_indices]
    scores = [cosine_scores[i] for i in top_indices]

    # Bars are placed at numeric positions: identical label strings would
    # otherwise share one categorical slot and be drawn on top of each other.
    positions = np.arange(len(top_indices))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(positions, scores)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel("Cosine Similarity")
    ax.set_title(f"Top {len(top_indices)} Matches for: '{query}'")
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

# Example usage: an in-domain programming query, so the top matches are meaningful.
plot_similarity_scores("how do I write a for loop")

# ProgBot

In [None]:
#ChatBot Function
def _add_to_retrieval_db(question, answer, category, success_message, persist=True):
    """Add one learned Q&A pair to ``df`` and the corpus embeddings.

    With ``persist=True`` the entry is also appended to ``unknown_log`` and to
    ``log_file``. Errors are printed, never raised.
    """
    global df, corpus, corpus_embeddings
    entry = {
        'Greeting': question,
        'Response': answer,
        'Category': category,
        'Time': str(datetime.now()),
        'RespLen': len(answer),
        'Flagged': len(answer) < 20
    }
    if persist:
        unknown_log.append(entry)
    try:
        if persist:
            with open(log_file, 'a') as f:
                f.write(json.dumps(entry) + '\n')
        df.loc[len(df)] = entry
        if corpus_embeddings is not None and len(corpus_embeddings) == len(df) - 1:
            # Embeddings are in step with df: embed only the new question.
            cleaned = clean_text(question)
            new_embedding = model.encode(cleaned, convert_to_tensor=True)
            corpus_embeddings = torch.cat([corpus_embeddings, new_embedding.unsqueeze(0)])
            corpus.append(cleaned)
        else:
            # Embeddings missing or out of step with df: rebuild them from df.
            corpus = df['Greeting'].apply(clean_text).tolist()
            corpus_embeddings = model.encode(corpus, convert_to_tensor=True, show_progress_bar=False)
        # retrain_from_log() is intentionally not called: the entry is already in
        # df, and replaying the whole log would append every logged entry again.
        print(success_message)
    except Exception as e:
        print(f"Error updating database: {e}")


def chat():
    """Run the interactive ProgBot loop until the user confirms ``exit``.

    Commands: ``!help``, ``!examples``, ``!stats``, ``!debug`` and ``exit``.
    Any other input is answered through ``get_response``. Questions without a
    match are handed to ``log_unknown`` so the user can teach the answer.
    For questions containing a programming keyword the user is asked for
    feedback; corrections and confirmed RAG answers are added to the
    retrieval database.

    On exit the feedback log, debug log, ``df`` (as
    'Cleaned_ProgrammingDataset.csv') and ``unknown_log`` (as
    'unknowns_backup.json') are written to disk.
    """
    print("Hi! I'm your ProgBot. Ask anything about general programming. Type 'exit' to quit.")
    keywords = ["programming", "code", "python", "javascript", "function", "loop", "variable", "class", "debug", "algorithm"]
    test_cases = [
        {'input': 'hello', 'expected': 'Hello, coder!'},
        {'input': 'what is a function', 'expected': 'reusable code block'},
        {'input': 'how to write a loop', 'expected': 'repeats code'}
    ]
    threshold = optimize_threshold(test_cases)

    while True:
        user_input = input("You: ")
        command = user_input.strip().lower()
        if not command:
            print("Please ask a question.")
            continue
        if command == '!help':
            print("Commands:\n- !help: Show commands\n- !examples: Show examples\n- !stats: Show feedback stats\n- !debug: Show debug log\n- exit: Quit")
            continue
        if command == '!examples':
            # Examples match the bot's domain (they previously advertised loan/APR questions).
            print("Examples:\n- what is a variable\n- how to write a loop\n- hello")
            continue
        if command == '!stats':
            yes = sum(1 for x in feedback_log if x['Feedback'] == 'yes')
            no = sum(1 for x in feedback_log if x['Feedback'] == 'no')
            total = yes + no
            print(f"Helpful: {yes}, Not helpful: {no}, Helpfulness: {yes/total*100:.2f}%" if total else "No feedback yet.")
            questions = [entry['Q'] for entry in feedback_log]
            print("Top 5 questions:", Counter(questions).most_common(5))
            continue
        if command == '!debug':
            save_debug_log()
            print(f"Debug log saved to {debug_file}. Last 5 entries:")
            for entry in debug_log[-5:]:
                print(entry)
            continue
        if command == 'exit':
            confirm = input("Are you sure you want to exit? (yes/no): ")
            if confirm.strip().lower() == 'yes':
                save_feedback()
                save_debug_log()
                df.to_csv('Cleaned_ProgrammingDataset.csv', index=False)
                with open('unknowns_backup.json', 'w') as f:
                    json.dump(unknown_log, f, indent=2)
                break
            continue

        response, source = get_response(user_input, threshold)

        # get_response always returns a non-empty message, so an unanswered
        # question has to be recognised by its source tag, not by an empty reply.
        if source == 'no_match' or not response:
            learned = log_unknown(user_input)
            if learned:
                feedback_log.append({'Q': user_input, 'A': learned, 'Feedback': 'unknown', 'Source': 'user'})
            continue

        print("ProgBot:", response)
        if source == 'error':
            # An error message is not an answer; don't ask for feedback on it.
            continue
        if not any(word in user_input.lower() for word in keywords):
            continue

        raw_feedback = input("Was this helpful? (yes/no, or provide clarification): ").strip()
        feedback = raw_feedback.lower()
        if not raw_feedback:
            # Nothing typed: neither feedback nor a clarification.
            continue

        if feedback not in ['yes', 'no']:
            print("Treating your input as a clarification.")
            # Store the clarification as typed (not lower-cased).
            _add_to_retrieval_db(user_input, raw_feedback, 'UserCorrected',
                                 "✅ Added clarification to retrieval database.")
            feedback_log.append({'Q': user_input, 'A': response, 'Feedback': 'corrected', 'Source': source})
            print(f"Logged correction: {raw_feedback}")
            continue

        feedback_log.append({'Q': user_input, 'A': response, 'Feedback': feedback, 'Source': source})
        if feedback == 'no':
            correction = input("Would you like to provide the correct answer or exit? (answer/exit): ").strip().lower()
            if correction == 'answer':
                correct_answer = input("Please provide the correct answer: ").strip()
                if correct_answer:
                    _add_to_retrieval_db(user_input, correct_answer, 'UserCorrected',
                                         "✅ Added correction to retrieval database.")
                    feedback_log.append({'Q': user_input, 'A': correct_answer, 'Feedback': 'corrected', 'Source': 'user'})
            elif correction == 'exit':
                print("Exiting feedback loop. You can continue asking questions or type 'exit' to quit.")
        elif source == 'rag':
            # A confirmed RAG answer is promoted into the retrieval database
            # (in memory only; it is not written to log_file).
            _add_to_retrieval_db(user_input, response, 'RAGGenerated',
                                 "✅ Added RAG response to retrieval database.", persist=False)


In [None]:
# Dynamic retraining: fold previously logged Q&A pairs into the retrieval database.
# Challenge: Frequent retraining with large logs may cause performance bottlenecks; consider scheduling or batch processing
try:
    retrain_from_log()
except Exception as retrain_error:
    debug_log.append(f"Error during dynamic retraining: {str(retrain_error)}")
    print(f"Error during dynamic retraining: {retrain_error}. Skipping retraining.")
else:
    debug_log.append("Dynamic retraining initiated successfully")

In [None]:
# Main: run the interactive chat session.
# The start is logged before chat() blocks; logging it afterwards recorded the
# "start" only once the session had already ended.
# Challenge: Long-running chat sessions may consume significant memory with large logs or embeddings
debug_log.append("Chatbot session started at " + str(datetime.now()))
try:
    chat()
    debug_log.append("Chatbot session ended at " + str(datetime.now()))
except Exception as e:
    debug_log.append(f"Error starting chatbot session: {str(e)}")
    print(f"Error starting chatbot: {e}. Please restart the application.")
finally:
    # Persist the entries added above; chat() saves the log before they exist.
    save_debug_log()