In [1]:
pip install transformers sentence-transformers faiss-cpu pandas torch nltk


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.meta

In [2]:
import os
import pandas as pd
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# =============================================================================
# ✅ Step 1: Load Dataset (Keeping All Columns)
# =============================================================================
def load_dataset(csv_path):
    """Read the CSV at ``csv_path`` and return it as a DataFrame.

    No columns are dropped or renamed. A three-line summary (success banner,
    row count, column names) is printed before returning.

    Args:
        csv_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(csv_path)

    # Build the whole summary first, then emit it with a single print call.
    summary_lines = [
        "\n✅ Dataset Loaded Successfully!",
        f"📊 Total Rows: {len(frame.index)}",
        f"📊 Columns: {list(frame.columns)}\n",
    ]
    print("\n".join(summary_lines))

    return frame

# =============================================================================
# ✅ Step 2: Build FAISS Index for Fast Retrieval
# =============================================================================
def build_faiss_index(df):
    """Embed every ``input_text`` value and store the vectors in a FAISS index.

    Vector ``i`` of the returned index corresponds to row position ``i`` of
    ``df`` and to element ``i`` of the returned text list, so a FAISS result id
    can be used directly as a positional row lookup.

    Args:
        df: DataFrame with an ``input_text`` column.

    Returns:
        A tuple ``(index, model, input_texts)``: the populated
        ``faiss.IndexFlatL2``, the ``SentenceTransformer`` used to embed the
        texts, and the list of strings that were embedded.

    Raises:
        KeyError: If ``df`` has no ``input_text`` column.
        ValueError: If ``df`` has no rows (there is nothing to index).
    """
    # Missing cells are read by pandas as float NaN, which the encoder cannot
    # tokenize. Replace them with "" rather than dropping the rows so that
    # index positions stay aligned with the DataFrame rows.
    input_texts = df["input_text"].fillna("").astype(str).tolist()
    if not input_texts:
        raise ValueError("Cannot build a FAISS index: the dataset has no rows.")

    model = SentenceTransformer("all-MiniLM-L6-v2")

    # ✅ Convert input_texts to unit-length embeddings. With normalised vectors
    # the squared L2 distance returned by IndexFlatL2 is 2 - 2*cosine, which
    # gives the distance a fixed, interpretable range.
    embeddings = model.encode(
        input_texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    # ✅ Create FAISS index (exact search, one vector per DataFrame row)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    print("✅ FAISS index built successfully.")
    return index, model, input_texts

# =============================================================================
# ✅ Step 3: Model Response Generator Function
# =============================================================================
def model_response_generator(model, tokenizer, user_query):
    """Generate a reply to ``user_query`` with the fine-tuned language model.

    Args:
        model: Generation-capable model exposing ``generate`` (the notebook
            loads it with ``AutoModelForCausalLM``).
        tokenizer: Tokenizer matching ``model``.
        user_query: The user's message as a string.

    Returns:
        The newly generated text with surrounding whitespace removed, or a
        fixed apology string when the query is blank or nothing was generated.
    """
    fallback = "🤖 Sorry, I didn't understand that."

    # A blank query tokenizes to nothing useful; do not call generate on it.
    if not user_query or not user_query.strip():
        return fallback

    # Put the inputs wherever the model actually lives. Fall back to the
    # CUDA-if-available guess only for objects that expose no `device`.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # A single sequence needs no padding, and requesting it fails for
    # tokenizers that define no pad token.
    inputs = tokenizer(user_query, return_tensors="pt", truncation=True).to(device)
    prompt_ids = inputs["input_ids"]

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        generated_ids = model.generate(
            prompt_ids,
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=64,
            pad_token_id=tokenizer.eos_token_id,
        )

    output_ids = generated_ids[0]
    # Decoder-only models return prompt + continuation; keep only the
    # continuation so the reply does not echo the user's own message.
    is_encoder_decoder = getattr(getattr(model, "config", None), "is_encoder_decoder", False)
    if not is_encoder_decoder:
        output_ids = output_ids[prompt_ids.shape[1]:]

    response = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
    return response if response else fallback

# =============================================================================
# ✅ Step 4: Test FAISS Model Before Saving
# =============================================================================
def test_faiss_chatbot(faiss_index, embedding_model, dataset, df, model, tokenizer):
    """Test FAISS chatbot before saving the model."""
    print("\n💬 Chatbot Ready for Testing! Type 'exit' to stop.")

    while True:
        user_input = input("\n👤 You: ")
        if user_input.lower() == "exit":
            print("👋 Exiting Chatbot Testing.")
            break

        # ✅ Convert user query to embedding
        user_embedding = embedding_model.encode([user_input], convert_to_numpy=True)

        # ✅ Search for the closest match in FAISS index
        distances, index = faiss_index.search(user_embedding, 1)  # Retrieve top-1 match
        similarity_score = 1 - (distances[0][0] / 100)  # Convert L2 distance to similarity (approximation)
        matched_question = dataset[index[0][0]]

        # ✅ Retrieve the corresponding response
        matched_row = df[df["input_text"] == matched_question].iloc[0]
        response = matched_row["output_text"]
        customer_name = matched_row.get("Customer Name", "Unknown")
        product_name = matched_row.get("product_name", "N/A")
        product_description = matched_row.get("product_description", "No description available.")

        # ✅ If similarity is below 80%, use model-generated response
        if similarity_score < 0.80:
            print("\n🤖 I couldn't find a strong match. Generating a response with AI...")
            response = model_response_generator(model, tokenizer, user_input)

        print(f"""
        👤 **Customer Name:** {customer_name}
        📦 **Product:** {product_name}
        📝 **Description:** {product_description}
        🤖 **Response:** {response}
        """)

# =============================================================================
# ✅ Step 5: Run FAISS Training & Testing
# =============================================================================
if __name__ == "__main__":
    # Input locations: the raw CSV and the directory holding the fine-tuned
    # checkpoint (both are read from local disk only).
    csv_path = "/content/full_chatbot_data.csv"
    model_path = "/content/Fine_Tuned"

    # 1) Data
    df = load_dataset(csv_path)

    # 2) Retrieval side: embeddings + FAISS index over input_text
    faiss_index, embedding_model, input_texts = build_faiss_index(df)

    # 3) Generation side: tokenizer and causal LM, switched to eval mode and
    #    moved to the GPU when one is available.
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True).eval()
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    # 4) Interactive smoke test. Nothing is written to disk here; the
    #    artifacts are persisted separately once the test session is over.
    test_faiss_chatbot(
        faiss_index=faiss_index,
        embedding_model=embedding_model,
        dataset=input_texts,
        df=df,
        model=model,
        tokenizer=tokenizer,
    )



✅ Dataset Loaded Successfully!
📊 Total Rows: 100000
📊 Columns: ['Ticket ID', 'Customer Name', 'Customer Email', 'Customer Age', 'Customer Gender', 'category', 'product_name', 'product_description', 'input_text', 'output_text']



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ FAISS index built successfully.

💬 Chatbot Ready for Testing! Type 'exit' to stop.

👤 You: do you offer free shipping

        👤 **Customer Name:** Trevor Lawson
        📦 **Product:** International Delivery
        📝 **Description:** Global shipping with customs clearance
        🤖 **Response:** standard shipping takes 57 days
        

👤 You: where can i download my receipt

        👤 **Customer Name:** Andrea Mills
        📦 **Product:** Microsoft 365
        📝 **Description:** Subscription-based productivity suite
        🤖 **Response:** invoices can be downloaded from your account
        

👤 You: do you accept paypal

        👤 **Customer Name:** Nicole Reed
        📦 **Product:** Amazon Order
        📝 **Description:** E-commerce purchase with different payment options
        🤖 **Response:** your payment was received successfully
        

👤 You: exit
👋 Exiting Chatbot Testing.


In [3]:
import os
import faiss
import pandas as pd

# ✅ Output layout: one folder, four artifacts
save_folder = "chatbot_model"
faiss_index_path = os.path.join(save_folder, "faiss_chatbot_index.index")
sentence_transformer_path = os.path.join(save_folder, "sentence_transformer_model")
chatbot_model_path = os.path.join(save_folder, "saved_chatbot_model")
dataset_path = os.path.join(save_folder, "final_chatbot_data.csv")

# Re-running the cell is fine: an existing folder is reused.
os.makedirs(save_folder, exist_ok=True)

# ✅ FAISS index (retrieval vectors)
faiss.write_index(faiss_index, faiss_index_path)
print(f"✅ FAISS index saved: {faiss_index_path}")

# ✅ Sentence-transformer used to embed queries for the FAISS search
embedding_model.save(sentence_transformer_path)
print(f"✅ Sentence Transformer Model saved: {sentence_transformer_path}/")

# ✅ Fine-tuned chatbot: tokenizer and weights share one directory
tokenizer.save_pretrained(chatbot_model_path)
model.save_pretrained(chatbot_model_path)
print(f"✅ Fine-Tuned Chatbot Model saved: {chatbot_model_path}/")

# ✅ Dataset copy, so the UI can map index hits back to rows
df.to_csv(dataset_path, index=False)
print(f"✅ Final dataset saved: {dataset_path}")

# ✅ Absolute location to point the Streamlit app at
print(f"\n🚀 All files are saved in: {os.path.abspath(save_folder)}")


✅ FAISS index saved: chatbot_model/faiss_chatbot_index.index
✅ Sentence Transformer Model saved: chatbot_model/sentence_transformer_model/
✅ Fine-Tuned Chatbot Model saved: chatbot_model/saved_chatbot_model/
✅ Final dataset saved: chatbot_model/final_chatbot_data.csv

🚀 All files are saved in: /content/chatbot_model


In [4]:
import shutil

# ✅ The archive is named after the folder it packs
save_folder = "chatbot_model"
zip_filename = f"{save_folder}.zip"

# ✅ Pack the folder's contents into <save_folder>.zip
shutil.make_archive(base_name=save_folder, format="zip", root_dir=save_folder)
print(f"✅ Folder zipped successfully: {zip_filename}")


✅ Folder zipped successfully: chatbot_model.zip
