In [None]:
import pandas as pd
import re
import json


# File path to your dataset
file_path = r"C:\\Users\\HP\\Downloads\\train.csv"

# Load the dataset
data = pd.read_csv(file_path)

# Function to clean text
def clean_text(text):
    # Remove special characters, multiple spaces, and make lowercase
    text = re.sub(r"[^a-zA-Z0-9?.!,\s]", "", str(text))
    text = re.sub(r"\s+", " ", text)
    return text.strip().lower()

# Clean the Question and Answer columns
data['Question'] = data['Question'].apply(clean_text)
data['Answer'] = data['Answer'].apply(clean_text)

# Prepare dataset for fine-tuning
fine_tuning_data = [
    {"question": row['Question'], "answer": row['Answer']}
    for _, row in data.iterrows()
]

# Save to JSON for fine-tuning
output_path = "C:\\Users\\HP\\Downloads\\new_fine_tuning_data.json"
with open(output_path, "w") as f:
    json.dump(fine_tuning_data, f, indent=4)


output_path = r"C:\Users\HP\Downloads\new_fine_tuning_data.json"
print(f"Fine-tuning dataset saved to {output_path}")


  '''


Fine-tuning dataset saved to C:\Users\HP\Downloads\new_fine_tuning_data.json


In [17]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import json

# Step 1: Load Dataset
json_path = "C:/Users/HP/Downloads/dataset.json"
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the dataset to extract 'question' and 'answer' columns
questions = data['questions']
data_dict = {
    'question': [item['question'] for item in questions],
    'answer': [item['answer'] for item in questions]
}

# Create the dataset from the data
dataset = Dataset.from_dict(data_dict)

# Step 2: Load Pre-trained Model and Tokenizer
model_name = "t5-base"  # t5 base for best results
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Step 3: Preprocess Data
def preprocess_data(examples):
    inputs = [f"question: {q}" for q in examples["question"]]
    targets = [f"answer: {a}" for a in examples["answer"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_data = dataset.map(preprocess_data, batched=True)

# Split into training and validation datasets
data_split = tokenized_data.train_test_split(test_size=0.1, seed=42)
train_dataset = data_split["train"]
eval_dataset = data_split["test"]

# Step 4: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",               # Directory to save checkpoints
    evaluation_strategy="epoch",          # Evaluate at the end of each epoch
    learning_rate=2e-5,                   # Use a lower learning rate for quicker training
    per_device_train_batch_size=8,        # Batch size for training 
    per_device_eval_batch_size=8,         # Batch size for evaluation
    num_train_epochs=50,                   # Reduce epochs for faster results
    save_steps=500,                       # Save checkpoint every 500 steps
    save_total_limit=1,                   # Save only the most recent checkpoint
    logging_dir="./logs",                  # Directory for logs
    logging_steps=50,                     # Log every 50 steps
    fp16=torch.cuda.is_available(),       
    dataloader_num_workers=2              # Fewer workers for faster data loading
)

# Step 5: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

# Step 6: Train the Model
print("Training the model...")
trainer.train()

# Step 7: Save the Fine-Tuned Model
model.save_pretrained("./new_fine_tuned_model")
tokenizer.save_pretrained("./new_fine_tuned_model")
print("Model fine-tuned and saved!")




Map:   0%|          | 0/79 [00:00<?, ? examples/s]

  trainer = Trainer(


Training the model...


Epoch,Training Loss,Validation Loss
1,No log,9.934616
2,No log,4.886226
3,No log,1.577015
4,No log,0.762497
5,No log,0.723395
6,5.400100,0.703343
7,5.400100,0.663743
8,5.400100,0.600286
9,5.400100,0.545678
10,5.400100,0.505268


Model fine-tuned and saved!


In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the model and tokenizer

model_path = "C:/Users/HP/Downloads/new_fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!


In [8]:
import tkinter as tk
from tkinter import scrolledtext, messagebox
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sentence_transformers import SentenceTransformer, util
import heapq
import json
from nltk.corpus import wordnet
import nltk
import threading
import time

# Ensure NLTK data is downloaded
nltk.download("wordnet")

# Load Fine-Tuned T5 Model
model_path = "./new_fine_tuned_model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Load Pre-trained SentenceTransformer for Semantic Similarity
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Dataset file location
dataset_file = r"C:\Users\HP\Downloads\dataset.json"

# Function to load the knowledge base
def load_knowledge_base(dataset_file):
    """Load the knowledge base dynamically from a JSON file."""
    with open(dataset_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    if 'questions' in data and isinstance(data['questions'], list) and all('question' in entry and 'answer' in entry for entry in data['questions']):
        return data['questions']
    else:
        raise ValueError("The dataset format is incorrect. It must have a 'questions' key with a list of dictionaries containing 'question' and 'answer' fields.")

# Function: Generate Initial Response using T5
def generate_response(question, model, tokenizer, max_length=50):
    inputs = tokenizer(f"question: {question}", return_tensors="pt", truncation=True, max_length=256)
    outputs = model.generate(inputs["input_ids"], max_length=max_length, num_beams=5, early_stopping=True, no_repeat_ngram_size=2)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Function: A* Search for Knowledge Retrieval
def a_star_search(query, dataset_file, threshold=0.5):
    knowledge_base = load_knowledge_base(dataset_file)
    questions = [entry["question"] for entry in knowledge_base]
    kb_embeddings = embedding_model.encode(questions, convert_to_tensor=True)
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    query_embedding = query_embedding / query_embedding.norm()  # Normalize query embedding

    pq = []
    for idx, entry in enumerate(knowledge_base):
        g = util.pytorch_cos_sim(query_embedding, kb_embeddings[idx])[0][0].item()
        if g >= threshold:
            f = 1 - g  # Cost function
            heapq.heappush(pq, (f, idx, entry))

    if not pq:
        return "No relevant match found in the knowledge base."
    
    _, _, best_entry = heapq.heappop(pq)
    return best_entry["answer"]



def generate_variations(response):
    words = response.split()
    variations = []
    
    # List of excluded words/phrases to preserve
    excluded_words = [
        'customer', 'support', 'team', 'item', 'order', 'return', 'ship', 'contact', 'assist', 'necessary', 'steps'
    ]

    for i, word in enumerate(words):
        if word.lower() in excluded_words:  # Skip excluded words
            continue
        
        synonyms = wordnet.synsets(word)
        if synonyms:
            for synonym in synonyms[:3]:  # Limit to top 3 synonyms
                new_word = synonym.lemmas()[0].name()
                # Ensure the new word is different and makes sense in context
                if new_word != word and new_word not in excluded_words:
                    new_words = words[:]
                    new_words[i] = new_word  # Replace the word with the synonym
                    variation = " ".join(new_words)
                    
                    # Check semantic similarity to ensure context
                    variation_score = util.pytorch_cos_sim(
                        embedding_model.encode(response, convert_to_tensor=True),
                        embedding_model.encode(variation, convert_to_tensor=True)
                    )[0][0].item()
                    
                    if variation_score > 0.8:  # Accept only contextually similar variations
                        variations.append(variation)
    return variations

def hill_climbing(query, response1, response2, embedding_model, max_iterations=10):
    def score(response):
        query_embedding = embedding_model.encode(query, convert_to_tensor=True)
        response_embedding = embedding_model.encode(response, convert_to_tensor=True)
        return util.pytorch_cos_sim(query_embedding, response_embedding)[0][0].item()

    response1_score = score(response1)
    response2_score = score(response2)

    if response1_score >= response2_score:
        current_response = response1
        current_score = response1_score
    else:
        current_response = response2
        current_score = response2_score

    for _ in range(max_iterations):
        neighbors = generate_variations(current_response)
        best_neighbor = current_response
        best_score = current_score
        
        for neighbor in neighbors:
            neighbor_score = score(neighbor)
            if neighbor_score > best_score:
                best_neighbor = neighbor
                best_score = neighbor_score

        if best_score > current_score:
            current_response = best_neighbor
            current_score = best_score
        else:
            break

    return current_response


def chatbot_pipeline(user_query, dataset_file, model, tokenizer):
    retrieved_response = a_star_search(user_query, dataset_file, threshold=0.5)
    if not retrieved_response:
        retrieved_response = "No relevant match found in the knowledge base."

    fine_tuned_response = generate_response(user_query, model, tokenizer)
    optimized_response = hill_climbing(user_query, retrieved_response, fine_tuned_response, embedding_model)

    return {
        "retrieved_response": retrieved_response,
        "fine_tuned_response": fine_tuned_response,
        "optimized_response": optimized_response,
    }


def create_gui():
    window = tk.Tk()
    window.title("E-Commerce Chatbot")

    # Header area
    header = tk.Label(window, text="Welcome to E-Commerce Chatbot", font=("Helvetica", 16, "bold"), bg="#4CAF50", fg="white", padx=10, pady=10)
    header.pack(fill=tk.X)

    # Chat history area
    chat_history = scrolledtext.ScrolledText(window, width=80, height=20, wrap=tk.WORD, font=("Arial", 12), bg="#f0f0f0")
    chat_history.pack(padx=10, pady=10)
    chat_history.config(state=tk.DISABLED)

    # User input area
    user_input_frame = tk.Frame(window)
    user_input_frame.pack(pady=10)

    user_input = tk.Entry(user_input_frame, width=70, font=("Arial", 12))
    user_input.grid(row=0, column=0, padx=10)

    # Dynamic status label
    status_label = tk.Label(window, text="", font=("Arial", 10), fg="blue")
    status_label.pack()

    def update_status(message):
        status_label.config(text=message)
        window.update_idletasks()

    # Function to handle user input and responses
    def send_message():
        user_message = user_input.get()
        if not user_message:
            messagebox.showwarning("Input Error", "Please enter a query!")
            return

        if user_message.lower() == "exit":
            if messagebox.askyesno("Exit", "Are you sure you want to exit?"):
                window.quit()
            return

        # Display user message
        chat_history.config(state=tk.NORMAL)
        chat_history.insert(tk.END, f"You: {user_message}\n")
        chat_history.config(state=tk.DISABLED)
        chat_history.yview(tk.END)

        user_input.delete(0, tk.END)

        # Background task for chatbot pipeline
        def process_response():
            update_status("Processing your query...")
            time.sleep(0.5)  # Simulate loading time

            result = chatbot_pipeline(user_message, dataset_file, model, tokenizer)

            chat_history.config(state=tk.NORMAL)
            chat_history.insert(tk.END, "--- Retrieved Response from Knowledge (A*) ---\n")
            chat_history.insert(tk.END, result["retrieved_response"] + "\n\n")
            chat_history.insert(tk.END, "--- Generated Response from Fine-Tuned Model ---\n")
            chat_history.insert(tk.END, result["fine_tuned_response"] + "\n\n")
            chat_history.insert(tk.END, "--- Final Optimized Response ---\n")
            chat_history.insert(tk.END, result["optimized_response"] + "\n\n")
            chat_history.config(state=tk.DISABLED)
            chat_history.yview(tk.END)

            update_status("Response generated successfully!")

        threading.Thread(target=process_response).start()

    # Buttons
    send_button = tk.Button(user_input_frame, text="Send", width=15, font=("Arial", 12), bg="#4CAF50", fg="white", command=send_message)
    send_button.grid(row=0, column=1, padx=10)

    exit_button = tk.Button(window, text="Exit", width=10, font=("Arial", 12), bg="#f44336", fg="white", command=lambda: window.quit() if messagebox.askyesno("Exit", "Are you sure you want to exit?") else None)
    exit_button.pack(pady=10)

    window.mainloop()

# Start the GUI chat
create_gui()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
