# Imports

In [20]:
import os
import re
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Data Loading

In [2]:
def load_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    return text

In [3]:
def clean_text(text):
    # Remove extra newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()

In [6]:
def chunk_text(text, max_tokens=100):
    """
    Splits text into chunks of ~max_tokens (approx. by word count for simplicity).
    In real scenarios, you'd use a tokenizer for precise token counts.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_tokens):
        chunk = ' '.join(words[i:i + max_tokens])
        chunks.append(chunk)
    return chunks

In [7]:
file_path = "sample.txt" 
text = load_txt(file_path)
text = clean_text(text)
chunks = chunk_text(text)

print(f"✅ Loaded {len(chunks)} chunks.")
print("🔹 First chunk preview:\n", chunks[0])

✅ Loaded 43 chunks.
🔹 First chunk preview:
 Hello! This is a sample text document created to test your Language Model project. As an AI assistant, I can help you manage your schedule and remind you of any upcoming meetings or tasks you have. Do you have any meetings today? Or perhaps you need help organizing your to-do list? If you have work to do, it's important to prioritize tasks based on deadlines and importance. I can help you create reminders for your work, so you never miss a deadline. Talking about preferences, do you like a particular type of cuisine or food? For example, I enjoy discussing


# Generate Embeddings + Build Vector DB with FAISS

In [9]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
# Convert each chunk into a vector
embeddings = model.encode(chunks, show_progress_bar=True)

# Check one embedding
print(f"Shape of one embedding: {embeddings[0].shape}")

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Shape of one embedding: (384,)


In [14]:
# Each vector has 384 dimensions for MiniLM model
dimension = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)  # L2 = Euclidean distance

# Convert to numpy array and add to index
embedding_matrix = np.array(embeddings).astype('float32')
index.add(embedding_matrix)

print(f"✅ Added {index.ntotal} vectors to the FAISS index.")

✅ Added 43 vectors to the FAISS index.


In [15]:
# Simple Python list can work for now
chunk_id_to_text = {i: chunk for i, chunk in enumerate(chunks)}

# Semantic Search + LLM Answering

In [16]:
# ✅ Step 4a: Embed the question
def embed_query(question, model):
    return model.encode([question])[0]  # Single vector

In [17]:
# ✅ Step 4b: Use FAISS to find similar chunks
def search_faiss(query_vector, index, top_k=3):
    query_vector = np.array(query_vector).astype('float32').reshape(1, -1)
    distances, indices = index.search(query_vector, top_k)
    return indices[0]  # Top-k chunk indices

In [18]:
# ✅ Step 4c: Build the prompt for the LLM
def build_prompt(question, chunk_indices, chunk_map):
    prompt = "You are an AI assistant. Use the following context to answer the question.\n\nContext:\n"
    for idx in chunk_indices:
        prompt += chunk_map[idx] + "\n"
    prompt += f"\nQuestion: {question}\nAnswer:"
    return prompt

# Load the LLM and Generate a Response

In [21]:
# ✅ Load the Phi-2 model
model_id = "microsoft/phi-2"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if device=="cuda" else torch.float32)
model.to(device)
model.eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_layernorm): LayerNorm((2560,), eps=1