In [None]:
# Install required packages
!pip install transformers datasets torch scikit-learn nltk


In [None]:
# Install required packages
!pip install transformers datasets torch pandas nltk

In [12]:
import os
import pandas as pd
import torch
import nltk
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fetch the NLTK "punkt" data package when this cell runs (NLTK reports when
# the package is already up to date).
# NOTE(review): nothing else in the visible code uses nltk -- confirm whether
# this download is still required.
nltk.download("punkt")

# =============================================================================
# ✅ Step 1: Clean the Dataset Before Processing
# =============================================================================
def clean_dataset(csv_path):
    """Load the chatbot CSV and return only its usable question/answer rows.

    Processing order:
      1. Keep only the columns the chatbot works with.
      2. Drop rows whose ``input_text`` or ``output_text`` is missing.
      3. Drop rows that repeat an ``input_text``, keeping the first one.

    Missing values are removed *before* de-duplication. Doing it the other
    way round would lose a question completely whenever its first occurrence
    has no answer but a later occurrence does.

    Args:
        csv_path: Location of the dataset, passed straight to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: A new frame limited to the required columns, with
        incomplete rows and repeated questions removed. The original row
        index labels are preserved.

    Raises:
        KeyError: If a required column is absent from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Only these columns are carried forward; anything else in the CSV is ignored.
    required_columns = ["category", "product_name", "product_description", "input_text", "output_text"]
    df_cleaned = df[required_columns].copy()

    # Incomplete rows go first so they can never "win" the de-duplication below.
    df_cleaned = df_cleaned.dropna(subset=["input_text", "output_text"])

    # Among complete rows, the first occurrence of each question is kept.
    df_cleaned = df_cleaned.drop_duplicates(subset=["input_text"], keep="first")

    print(f"âœ… Dataset cleaned! {df_cleaned.shape[0]} rows available after cleaning.")

    return df_cleaned

# =============================================================================
# ✅ Step 2: Decision Tree Node Class
# =============================================================================
class DecisionTreeNode:
    """A node of the chatbot lookup tree.

    Each node holds the text printed when the node is entered (``prompt``),
    an optional canned answer (``response``) and a mapping from lower-cased
    user input to child nodes (``children``).
    """

    def __init__(self, prompt, response=None):
        """Create a node.

        Args:
            prompt: Text printed when ``traverse`` starts on this node.
            response: Canned answer printed when this node is selected as a
                child; ``None`` when the node has no answer of its own.
        """
        self.prompt = prompt
        self.response = response
        # Keys are lower-cased decisions, values are DecisionTreeNode objects.
        self.children = {}

    def add_child(self, decision, node):
        """Register ``node`` as the child reached by ``decision``.

        The key is lower-cased so lookups from ``traverse`` (which
        lower-cases user input) are case-insensitive.
        """
        self.children[decision.lower()] = node

    def traverse(self):
        """Run the interactive question/answer loop for this node.

        Prints the node's prompt, then repeatedly reads a line from the user
        (stripped and lower-cased) and answers it:

        * ``exit`` or ``back`` print a short message and end the loop;
        * a blank line is ignored and the user is asked again;
        * input matching a child prints that child's ``response``;
        * input found in the module-level ``dataset_responses`` prints the
          stored answer;
        * anything else is passed to ``model_response_generator`` together
          with the module-level ``model`` and ``tokenizer``.

        The loop stays on this node; it does not descend into children.
        If the input stream is closed (``EOFError``) the loop ends as if the
        user had typed ``exit``.

        Returns:
            None
        """
        print("\n" + self.prompt)

        while True:
            try:
                decision = input("\nðŸ‘¤ You: ").strip().lower()
            except EOFError:
                # No more input can ever arrive; leave cleanly instead of
                # crashing with a traceback.
                print("ðŸ‘‹ Exiting Chatbot. Goodbye!")
                return

            if not decision:
                # A blank line carries no question; do not send it to the model.
                continue

            if decision == "exit":
                print("ðŸ‘‹ Exiting Chatbot. Goodbye!")
                return
            if decision == "back":
                print("ðŸ”™ Returning to the main menu...")
                return

            # Echo the question so it is visible next to the answer in
            # captured output.
            print(f"\nðŸ‘¤ You: {decision}")

            if decision in self.children:
                print(f"ðŸ¤– {self.children[decision].response}")
            elif decision in dataset_responses:
                # Prefer a stored dataset answer over a generated one.
                print(f"ðŸ¤– {dataset_responses[decision]}")
            else:
                print("ðŸ¤– I couldn't find an exact match. Let me generate a response for you...")
                response = model_response_generator(model, tokenizer, decision)
                print(f"ðŸ¤– {response}")

# =============================================================================
# ✅ Step 3: Model Response Generator Function
# =============================================================================
def model_response_generator(model, tokenizer, user_query):
    """Generate a reply to ``user_query`` with a causal language model.

    Args:
        model: Model exposing ``generate`` (loaded in this file with
            ``AutoModelForCausalLM``).
        tokenizer: Tokenizer matching ``model``; it is called on the query and
            its ``decode`` and ``eos_token_id`` are used.
        user_query: The user's question as plain text.

    Returns:
        str: The newly generated text with surrounding whitespace removed, or
        a fixed apology message when nothing was generated.
    """
    inputs = tokenizer(user_query, return_tensors="pt", truncation=True, padding=True)

    # Put the inputs where the model actually lives; fall back to the
    # CUDA-if-available guess only when the model does not expose its device.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = inputs.to(device)

    input_ids = inputs["input_ids"]
    prompt_length = input_ids.shape[-1]

    # Inference only: no gradients need to be tracked.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids,
            # Passed explicitly so padding positions are not attended to.
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=64,
            pad_token_id=tokenizer.eos_token_id,
        )

    # A causal LM returns prompt + continuation; decode only the continuation
    # so the reply does not start by repeating the user's question.
    new_token_ids = generated_ids[0][prompt_length:]
    response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    return response if response else "ðŸ¤– Sorry, I didn't understand that."

# =============================================================================
# ✅ Step 4: Build Decision Tree from Cleaned CSV
# =============================================================================
def build_decision_tree_from_csv(df_cleaned):
    """Build the chatbot lookup tree from the question/answer columns.

    Every row of ``df_cleaned`` contributes one child of the root, keyed by
    the stripped, lower-cased ``input_text`` and answering with the stripped
    ``output_text``. The same mapping is also stored in the module-level
    ``dataset_responses`` dict, which is reset on every call.

    Cell values are converted with ``str`` before stripping so that answers
    pandas parsed as numbers do not crash the build. Rows with a missing
    question or answer are skipped. When two rows produce the same key after
    normalisation, the first one is kept, matching the keep-first rule used
    when the dataset is cleaned.

    Args:
        df_cleaned: DataFrame providing ``input_text`` and ``output_text``
            columns.

    Returns:
        DecisionTreeNode: The root node, with one child per distinct question.
    """
    # Shared lookup table read by DecisionTreeNode.traverse.
    global dataset_responses
    dataset_responses = {}

    root_prompt = "ðŸ¤– Welcome to the Chatbot Decision System!\nAsk a question below:"
    root = DecisionTreeNode(root_prompt)

    for raw_question, raw_answer in zip(df_cleaned["input_text"], df_cleaned["output_text"]):
        if pd.isna(raw_question) or pd.isna(raw_answer):
            # Nothing usable in this row.
            continue

        input_text = str(raw_question).strip().lower()
        output_text = str(raw_answer).strip()

        if input_text in dataset_responses:
            # Same question after normalisation: the earlier row wins.
            continue

        dataset_responses[input_text] = output_text
        branch_node = DecisionTreeNode(f"ðŸ‘¤ {input_text}", response=output_text)
        root.add_child(input_text, branch_node)

    return root

# =============================================================================
# ✅ Step 5: Load Model, Clean Dataset, Build Tree, and Start Chatbot
# =============================================================================
# Locations of the fine-tuned model directory and the chatbot dataset.
model_path = r"C:\Users\siddh\Downloads\Master Thesis\Chatbot 2\fine_tuned_model"
csv_path = r"C:\Users\siddh\Downloads\Master Thesis\Chatbot 2\chatbot_data.csv"

# Use the GPU when torch reports one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer: local files only, slow (non-fast) implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
    use_fast=False,
)

# Reuse the end-of-sequence token for padding when no pad token is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Fine-tuned causal language model, switched to evaluation mode and moved to
# the selected device.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
)
model.eval()
model.to(device)

# Dataset -> cleaned frame -> lookup tree.
df_cleaned = clean_dataset(csv_path)
decision_tree = build_decision_tree_from_csv(df_cleaned)

# Hand control to the interactive loop.
decision_tree.traverse()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


âœ… Dataset cleaned! 60 rows available after cleaning.

ðŸ¤– Welcome to the Chatbot Decision System!
Ask a question below:

ðŸ‘¤ You: do you offer free shipping
ðŸ¤– standard shipping takes 57 days

ðŸ‘¤ You: where can i download my receipt
ðŸ¤– invoices can be downloaded from your account

ðŸ‘¤ You: can i cancel my order
ðŸ¤– to cancel visit your account settings or contact support

ðŸ‘¤ You: is there a student discount
ðŸ¤– you can upgrade your subscription in the account section

ðŸ‘¤ You: when will my product arrive
ðŸ¤– your order is on the way check tracking for updates
ðŸ‘‹ Exiting Chatbot. Goodbye!
