In [1]:
# Mount Google Drive at /content/drive so files stored under MyDrive are
# reachable through the local filesystem (the pipeline below reads its
# classifier, retrieval and LLM artifacts from /content/drive/MyDrive/...).
# Requires a Google Colab runtime, which provides the `google.colab` package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Complete setup for Google Colab
!pip install -q transformers peft sentence-transformers faiss-cpu accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
import pandas as pd
import time
import pickle
import faiss
from sentence_transformers import SentenceTransformer
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Section banner: a 60-character rule above and below the title.
print("=" * 60, "End-to-End Pipeline System", "=" * 60, sep="\n")

# Build complete pipeline class
class HybridChatbot:
    """Hybrid chatbot that routes each query to retrieval or to an LLM.

    Pipeline (see ``respond``):
      1. A pickled classifier + TF-IDF vectorizer labels the query
         (0 = deterministic, 1 = indeterministic).
      2. Deterministic queries are answered by nearest-neighbour search in a
         FAISS index over sentence embeddings, returning a stored response.
      3. All other queries are answered by a LoRA-adapted ``microsoft/phi-2``.

    Files expected under ``base_dir`` (upload them to your Drive first):
      - models/classifier/logistic_regression.pkl
      - models/classifier/tfidf_vectorizer.pkl
      - models/retrieval/faiss_index.bin
      - models/retrieval/deterministic_qa_pairs.csv  (needs a 'response' column)
      - checkpoints/phi2_lora/final_model/           (adapter + tokenizer)
    """

    def __init__(self, base_dir='/content/drive/MyDrive/NLP_Project'):
        """Load the classifier, the retrieval system and the fine-tuned LLM.

        Args:
            base_dir: Project root containing the ``models/`` and
                ``checkpoints/`` folders listed in the class docstring.
                Defaults to the original Google Drive location.
        """
        # Accept str or path-like, and avoid a doubled slash when joining.
        base_dir = str(base_dir).rstrip('/')

        print("\n1. Loading all components...")

        # Classifier
        print("   - Loading classifier...")
        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling. Only load artifacts you produced yourself.
        with open(f'{base_dir}/models/classifier/logistic_regression.pkl', 'rb') as f:
            self.classifier = pickle.load(f)
        with open(f'{base_dir}/models/classifier/tfidf_vectorizer.pkl', 'rb') as f:
            self.tfidf = pickle.load(f)

        # Retrieval system
        print("   - Loading retrieval system...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.retrieval_index = faiss.read_index(f'{base_dir}/models/retrieval/faiss_index.bin')
        self.retrieval_data = pd.read_csv(f'{base_dir}/models/retrieval/deterministic_qa_pairs.csv')

        # LLM
        print("   - Loading fine-tuned LLM...")
        adapter_dir = f"{base_dir}/checkpoints/phi2_lora/final_model"
        # NOTE: recent transformers releases warn that `torch_dtype` is
        # deprecated in favour of `dtype`; it is kept here so the call still
        # works on releases that only know `torch_dtype`.
        base_model = AutoModelForCausalLM.from_pretrained(
            "microsoft/phi-2",
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16
        )
        self.llm_model = PeftModel.from_pretrained(base_model, adapter_dir)
        self.llm_tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

        print("   ✓ All components loaded!\n")

    def classify_query(self, query):
        """Returns 0 for deterministic, 1 for indeterministic"""
        query_tfidf = self.tfidf.transform([query])
        return self.classifier.predict(query_tfidf)[0]

    def retrieve_response(self, query, k=1):
        """Semantic search for deterministic queries.

        Args:
            query: User query text.
            k: Number of neighbours requested from the index. Only the best
                (first) hit is returned.

        Returns:
            ``(response, distance)``: the stored 'response' of the nearest
            entry and its distance to the query as a plain ``float``.

        Raises:
            IndexError: If the index reports no neighbour for the query.
        """
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
        distances, indices = self.retrieval_index.search(query_embedding.astype('float32'), k)

        best_idx = int(indices[0][0])
        # FAISS fills missing neighbours with -1; without this guard
        # `iloc[-1]` would silently return the LAST row of the table.
        if best_idx < 0:
            raise IndexError("Retrieval index returned no neighbour for the query")

        return self.retrieval_data.iloc[best_idx]['response'], float(distances[0][0])

    def generate_response(self, query, max_tokens=150):
        """LLM generation for indeterministic queries.

        Args:
            query: User query text.
            max_tokens: Maximum number of newly generated tokens.

        Returns:
            The assistant's first reply, stripped of surrounding whitespace.
        """
        prompt = f"Customer: {query}\nAssistant:"
        inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = self.llm_model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=self.llm_tokenizer.eos_token_id
            )

        # Decode only the continuation. The generated sequence starts with the
        # prompt tokens, so slicing them off avoids re-parsing the prompt (and
        # any "Assistant:" text the user may have typed in the query).
        reply = self.llm_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # The model may keep going and invent further dialogue turns; keep
        # only the first assistant reply instead of the last one.
        for marker in ("\nCustomer:", "\nAssistant:"):
            cut = reply.find(marker)
            if cut != -1:
                reply = reply[:cut]

        return reply.strip()

    def respond(self, query):
        """Main pipeline: classify → route → respond.

        Returns:
            dict with keys 'query', 'route' ("RETRIEVAL" or "LLM_GENERATION"),
            'response', 'latency_ms' and 'confidence' (a float for the
            retrieval route, ``None`` for the LLM route).
        """
        # perf_counter is monotonic, so the latency cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()

        # Step 1: Classify
        is_deterministic = self.classify_query(query) == 0
        route = "RETRIEVAL" if is_deterministic else "LLM_GENERATION"

        # Step 2: Get response
        if is_deterministic:
            response, distance = self.retrieve_response(query)
            # Map distance to a score: 1.0 at distance 0, falling as it grows.
            # NOTE(review): assumes a non-negative distance metric (e.g. L2)
            # - confirm against how the index was built.
            confidence = float(1.0 / (1.0 + distance))
        else:
            response = self.generate_response(query)
            confidence = None

        latency = (time.perf_counter() - start_time) * 1000

        return {
            'query': query,
            'route': route,
            'response': response,
            'latency_ms': latency,
            'confidence': confidence
        }

End-to-End Pipeline System


In [4]:
# Build the pipeline once; this loads every model and is the slow step.
chatbot = HybridChatbot()

# Smoke test: push a single sample query through the full pipeline.
sample_query = "I'm unhappy with the order I received"
sample_result = chatbot.respond(sample_query)
print(sample_result)


1. Loading all components...
   - Loading classifier...
   - Loading retrieval system...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

   - Loading fine-tuned LLM...


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

   ✓ All components loaded!

{'query': "I'm unhappy with the order I received", 'route': 'LLM_GENERATION', 'response': "I'm sorry to hear that you're unhappy with the order you received. Your satisfaction is of utmost importance to us, and I'm here to assist you in resolving this issue. Could you please provide me with some specific details about the problems you encountered with the order? This will help me understand the situation better and find a suitable solution for you. Thank you for bringing this to our attention, and I assure you that we will do our best to address your concerns. How can I assist you further? \n\nPlease let me know if there's anything else I can do to help. I appreciate your patience and understanding. Together, we will work towards resolving this matter and ensuring your satisfaction.", 'latency_ms': 10384.51337814331, 'confidence': None}
