In [5]:
# Cell 1: Install dependencies with quiet mode
!pip install -q together chromadb rank-bm25 sentence-transformers pypdf2 pdfplumber nltk tqdm langchain python-dotenv

In [2]:
import os
os.environ["TOGETHER_API_KEY"] = ""


In [4]:
# Cell 2: NLTK setup with all required downloads
import nltk

# Download all required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # Specifically download punkt_tab

True

In [5]:
# Cell 3: Import libraries and initialize components
import os
import re
import chromadb
import numpy as np
from tqdm import tqdm
import pdfplumber
from rank_bm25 import BM25Okapi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Initialize NLTK components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Initialize environment
load_dotenv()
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")

AttributeError: partially initialized module 'torch' has no attribute 'fx' (most likely due to a circular import)

In [12]:
# Cell 4: Enhanced text cleaning functions
def clean_text(text):
    """Clean document text with normalization and filtering"""
    if not text:
        return ""

    # Basic normalization
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s.,;:!?()-]', '', text)  # Keep common punctuation

    # Advanced cleaning
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words

    # Token processing
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(tokens).strip()

def clean_query(user_input):
    """Clean user query with specialized normalization"""
    if not user_input:
        return ""

    # Preserve question structure
    user_input = re.sub(r'[^\w\s.,;:!?()-]', '', user_input)  # Remove special chars
    user_input = user_input.lower().strip()

    # Preserve key question words
    tokens = word_tokenize(user_input)
    question_words = {'who', 'what', 'when', 'where', 'why', 'how'}
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words or token in question_words]

    return ' '.join(tokens)

In [13]:
# Cell 5: PDF processing and text extraction
def extract_pages(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return [(i+1, page.extract_text()) for i, page in tqdm(enumerate(pdf.pages), desc="Extracting pages") if page.extract_text()]

pages = extract_pages("all_vitbot_data.pdf")
print(f"Extracted {len(pages)} pages")

Extracting pages: 246it [01:01,  4.02it/s]

Extracted 216 pages





In [14]:
# Cell 6: Text cleaning and preprocessing
cleaned_pages = [(num, clean_text(text)) for num, text in pages]
print("Sample cleaned text:", cleaned_pages[0][1][:100] + "...")

Sample cleaned text: academic regulation version . . preamble present day student much different student past many way . ...


In [15]:
# Cell 7: Text chunking
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

chunks = []
for page_num, text in cleaned_pages:
    chunks.extend([(page_num, chunk) for chunk in splitter.split_text(text)])

print(f"Created {len(chunks)} chunks")
print("Sample chunk:", chunks[0][1][:100] + "...")

# Prepare data for indexing
chunk_texts = [chunk[1] for chunk in chunks]
metadata = [{"page_num": chunk[0]} for chunk in chunks]

Created 421 chunks
Sample chunk: academic regulation version . . preamble present day student much different student past many way . ...


In [16]:
# Cell 8 (Revised): Initialize retrieval systems with proper error handling
try:
    # Initialize embedding model
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path="/content/vibot_db")
    collection = chroma_client.get_or_create_collection(name="vibot_data")

    # Add documents to ChromaDB
    if chunk_texts:
        collection.add(
            documents=chunk_texts,
            metadatas=metadata,
            ids=[f"id_{i}" for i in range(len(chunk_texts))]
        )
        print(f"Indexed {len(chunk_texts)} documents in ChromaDB")
    else:
        print("Warning: No documents to index!")

    # Initialize BM25
    if chunk_texts:
        tokenized_corpus = [doc.split(" ") for doc in chunk_texts]
        bm25 = BM25Okapi(tokenized_corpus)
        print("BM25 initialized")
    else:
        print("Warning: No documents for BM25!")

except Exception as e:
    print(f"Error initializing retrieval systems: {str(e)}")
    # Create dummy objects to prevent future errors
    class DummyCollection:
        def query(self, **kwargs):
            return {'ids': [[]]}
    collection = DummyCollection()
    bm25 = None

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 32.4MiB/s]


Indexed 421 documents in ChromaDB
BM25 initialized


In [17]:
# Cell 9 (Revised): Enhanced hybrid retrieval with RRF and error handling
def hybrid_search(query, k=10):
    try:
        # Clean the query first
        cleaned_query = clean_query(query)
        if not cleaned_query:
            return []

        results = []

        # Vector search
        try:
            query_embedding = embed_model.encode(cleaned_query).tolist()
            vector_results = collection.query(
                query_embeddings=[query_embedding],
                n_results=k
            )
            vector_ids = [int(id.split('_')[1]) for id in vector_results['ids'][0]]
            results.extend(vector_ids)
        except Exception as e:
            print(f"Vector search error: {str(e)}")

        # BM25 search if initialized
        if bm25:
            try:
                tokenized_query = cleaned_query.split(" ")
                bm25_scores = bm25.get_scores(tokenized_query)
                bm25_indices = np.argsort(bm25_scores)[::-1][:k]
                results.extend(bm25_indices.tolist())
            except Exception as e:
                print(f"BM25 search error: {str(e)}")

        # If we have results, apply RRF fusion
        if results:
            # Use set to get unique indices
            unique_indices = set(results)

            # Simple ranking if we only have one set of results
            if len(unique_indices) == len(results):
                return list(unique_indices)[:k]

            # RRF fusion
            rrf_scores = {}
            for rank, idx in enumerate(results):
                rrf_scores[idx] = rrf_scores.get(idx, 0) + 1/(60 + rank + 1)

            # Sort by RRF score
            sorted_indices = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:k]
            return [idx for idx, _ in sorted_indices]

        return []

    except Exception as e:
        print(f"Hybrid search error: {str(e)}")
        return []

In [25]:
# Cell 1: Install latest Together AI package
!pip install -q together --upgrade

In [40]:
# Cell 2: Modern API setup
import together
import os

# Set API key properly (use Colab secrets or environment variable)
os.environ["TOGETHER_API_KEY"] = "tgp_v1_pfYs7skW5m9MM0j55YsqOkzPR0ILywU6qY4pRCzMilU"  # Replace with your actual key

# Initialize client (new recommended way)
client = together.Together(api_key=os.getenv("TOGETHER_API_KEY"))

# Verify connection
try:
    models = client.models.list()
    print("API connection successful! Available models:")
    print([m.id for m in models])
except Exception as e:
    print(f"API connection failed: {str(e)}")
    print("Please verify your API key is correct and active")

API connection successful! Available models:
['Alibaba-NLP/gte-modernbert-base', 'arcee-ai/AFM-4.5B-Preview', 'arcee-ai/arcee-blitz', 'arcee-ai/caller', 'arcee-ai/coder-large', 'arcee-ai/maestro-reasoning', 'arcee-ai/virtuoso-large', 'arcee-ai/virtuoso-medium-v2', 'arcee_ai/arcee-spotlight', 'BAAI/bge-base-en-v1.5-vllm', 'black-forest-labs/FLUX.1-canny', 'black-forest-labs/FLUX.1-depth', 'black-forest-labs/FLUX.1-dev', 'black-forest-labs/FLUX.1-dev-lora', 'black-forest-labs/FLUX.1-kontext-max', 'black-forest-labs/FLUX.1-kontext-pro', 'black-forest-labs/FLUX.1-pro', 'black-forest-labs/FLUX.1-redux', 'black-forest-labs/FLUX.1-schnell', 'black-forest-labs/FLUX.1-schnell-Free', 'black-forest-labs/FLUX.1.1-pro', 'cartesia/sonic', 'cartesia/sonic-2', 'deepseek-ai/DeepSeek-R1', 'deepseek-ai/DeepSeek-R1-0528-tput', 'deepseek-ai/DeepSeek-R1-Distill-Llama-70B', 'deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free', 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', 'deepseek-ai/DeepSeek-R1-Distill-Qwen-14B

In [46]:
# Cell 3: Updated response generation
def generate_response(user_input):
    try:
        # Clean input
        cleaned_input = clean_query(user_input)

        # Retrieve context (your existing RAG implementation)
        relevant_indices = hybrid_search(user_input, k=5)
        context = "\n\n".join([chunk_texts[idx] for idx in relevant_indices]) if relevant_indices else ""

        # Enhanced prompt template
        prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are Vibot, the official VIT college assistant. Rules:
        1. For identity questions: "I'm Vibot, VIT's AI assistant"
        2. For FFCS: "Fully Flexible Credit System (FFCS) is VIT's course registration system"
        3. Only answer using context below
        4. If unsure: "I don't have that information about VIT"

        Context: {context}<|eot_id|><|start_header_id|>user<|end_header_id|>
        {user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

        # Modern API call
        try:
            response = client.chat.completions.create(
                model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=512,
                temperature=0.3
            )
            return response.choices[0].message.content.strip()
        except together.AuthenticationError:
            return "Error: Invalid API key"
        except Exception as e:
            return f"API Error: {str(e)}"

    except Exception as e:
        return f"System Error: {str(e)}"

In [47]:
# Cell 11 (Revised): Robust chat interface with additional safeguards
print("Vibot: Hi! I'm Vibot, your VIT college assistant. How can I help you today?")
print("Vibot: Type 'exit' to end our conversation.\n")

while True:
    try:
        user_input = input("You: ").strip()
        if not user_input:
            print("Vibot: Please enter your question.")
            continue

        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("Vibot: Goodbye! Have a great day.")
            break

        # Show cleaned version for debugging
        cleaned = clean_query(user_input)
        print(f"[System] Processed query: '{cleaned}'")

        response = generate_response(user_input)
        print(f"\nVibot: {response}\n")

    except KeyboardInterrupt:
        print("\nVibot: Session ended.")
        break
    except Exception as e:
        print(f"\nVibot: Error processing request - {str(e)}")

Vibot: Hi! I'm Vibot, your VIT college assistant. How can I help you today?
Vibot: Type 'exit' to end our conversation.

You: explain FFCS in detail
[System] Processed query: 'explain ffcs detail'

Vibot: Fully Flexible Credit System (FFCS) is VIT's course registration system. It is a student-centric system that allows students to plan their study at their own pace, choosing from multiple slots offered for each course. The FFCS academic regulation is applicable to students admitted from the academic year onwards, and the academic program under FFCS shall be decided by the Academic Council.

Key features of FFCS include:

1. **Flexible pacing**: Students can study at their own pace, choosing to fast-track or slow down their learning as needed.
2. **Multiple slot offerings**: Each course is offered in multiple slots, allowing students to choose the one that best fits their schedule.
3. **Student-centric**: The system is designed to be student-centric, focusing on experiential learning an