# Imports

In [19]:
import os
import re
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Data Loading

In [20]:
def load_txt(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [21]:
def clean_text(text):
    """Normalise whitespace in ``text``.

    Runs of consecutive newlines are collapsed to a single newline, runs of
    spaces/tabs are collapsed to a single space, and leading/trailing
    whitespace is stripped from the result.
    """
    single_newlines = re.sub(r'\n+', '\n', text)
    single_spaces = re.sub(r'[ \t]+', ' ', single_newlines)
    return single_spaces.strip()

In [22]:
def chunk_text(text, max_tokens=100):
    """
    Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    Token counts are approximated by whitespace-separated words for
    simplicity; a real tokenizer would be needed for precise token counts.

    Args:
        text: The text to split. Any run of whitespace separates words.
        max_tokens: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings, each with its words re-joined by single
        spaces. The list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``max_tokens`` is less than 1. Zero would otherwise
            fail inside ``range`` with an unrelated message, and a negative
            value would silently yield no chunks at all.
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]

In [24]:
# Read the sample document, normalise its whitespace, then split it into
# word-based chunks for embedding.
file_path = "DATA_POOL/sample_txt.txt"
text = clean_text(load_txt(file_path))
chunks = chunk_text(text)

print(f"✅ Loaded {len(chunks)} chunks.")
print("🔹 First chunk preview:\n", chunks[0])

✅ Loaded 43 chunks.
🔹 First chunk preview:
 Hello! This is a sample text document created to test your Language Model project. As an AI assistant, I can help you manage your schedule and remind you of any upcoming meetings or tasks you have. Do you have any meetings today? Or perhaps you need help organizing your to-do list? If you have work to do, it's important to prioritize tasks based on deadlines and importance. I can help you create reminders for your work, so you never miss a deadline. Talking about preferences, do you like a particular type of cuisine or food? For example, I enjoy discussing


# Generate Embeddings + Build Vector DB with FAISS

In [25]:
# Sentence-embedding model; the same instance is used later to embed both the
# document chunks and the user's question.
model_embedd = SentenceTransformer('all-MiniLM-L6-v2')

In [26]:
# Convert each chunk into a vector (the whole `chunks` list is encoded in one
# call, with a progress bar)
embeddings = model_embedd.encode(chunks, show_progress_bar=True)

# Check one embedding: print the shape of the first chunk's vector
print(f"Shape of one embedding: {embeddings[0].shape}")

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Shape of one embedding: (384,)


In [27]:
# Take the index dimensionality from the embeddings themselves
# (384 for this MiniLM model).
dimension = embeddings[0].shape[0]

# Stack the embeddings into a float32 numpy matrix before handing them to FAISS.
embedding_matrix = np.array(embeddings).astype('float32')

# Flat index that compares vectors by L2 (Euclidean) distance.
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

print(f"✅ Added {index.ntotal} vectors to the FAISS index.")

✅ Added 43 vectors to the FAISS index.


In [36]:
# A plain dict is enough for now: it maps each chunk's position in `chunks`
# (the ids the search step returns) back to the chunk text.
chunk_id_to_text = dict(enumerate(chunks))
chunk_id_to_text

{0: "Hello! This is a sample text document created to test your Language Model project. As an AI assistant, I can help you manage your schedule and remind you of any upcoming meetings or tasks you have. Do you have any meetings today? Or perhaps you need help organizing your to-do list? If you have work to do, it's important to prioritize tasks based on deadlines and importance. I can help you create reminders for your work, so you never miss a deadline. Talking about preferences, do you like a particular type of cuisine or food? For example, I enjoy discussing",
 1: "food preferences with users. Do you prefer spicy foods, sweet treats, or something savory? It's also useful to talk about your hobbies and interests. Do you enjoy reading books, watching movies, or perhaps playing video games? These insights help me personalize responses and suggestions better. Now, let's talk about health. Do you have a workout routine? Staying active is important for overall well-being, and I can sugges

# Semantic Search + LLM Answering

In [37]:
# ✅ Step 4a: Embed the question
def embed_query(question, model):
    """Encode ``question`` with ``model`` and return its single embedding.

    The question is wrapped in a one-element list for ``model.encode`` and
    the first (only) item of the result is returned.
    """
    encoded_batch = model.encode([question])
    return encoded_batch[0]  # Single vector

In [38]:
# ✅ Step 4b: Use FAISS to find similar chunks
def search_faiss(query_vector, index, top_k=3):
    """Return the ids of the chunks nearest to ``query_vector``.

    Args:
        query_vector: A single embedding (any array-like); it is cast to
            float32 and reshaped to one row before searching.
        index: An object exposing ``search(matrix, k)`` that returns
            ``(distances, indices)``, e.g. a FAISS index.
        top_k: Number of neighbours to request from the index.

    Returns:
        A 1-D numpy array of neighbour ids for the single query, in the
        order the index returned them. It may hold fewer than ``top_k``
        entries: FAISS pads the result with ``-1`` when the index contains
        fewer than ``top_k`` vectors, and those placeholders are dropped so
        callers never look up a non-existent chunk id.
    """
    query_matrix = np.asarray(query_vector, dtype='float32').reshape(1, -1)
    # Distances are not needed by callers; only the neighbour ids are returned.
    _distances, indices = index.search(query_matrix, top_k)
    hits = np.asarray(indices[0])
    return hits[hits >= 0]  # Top-k chunk indices, without "-1" padding

In [39]:
# ✅ Step 4c: Build the prompt for the LLM
def build_prompt(question, chunk_indices, chunk_map):
    """Assemble the LLM prompt from the retrieved chunks and the question.

    The prompt consists of a fixed instruction header, then the text of each
    chunk in ``chunk_indices`` (looked up in ``chunk_map``, one per line),
    then the question followed by an ``Answer:`` cue.
    """
    header = "You are an AI assistant. Use the following context to answer the question.\n\nContext:\n"
    context = "".join(chunk_map[idx] + "\n" for idx in chunk_indices)
    footer = f"\nQuestion: {question}\nAnswer:"
    return header + context + footer

# Load the LLM and Generate a Response

In [40]:
# ✅ Load the Phi-2 model
model_id = "microsoft/phi-2"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the weights as float16 on GPU and float32 on CPU.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16 if device=="cuda" else torch.float32)
model.to(device)
# Switch to evaluation mode; as the cell's last expression this also displays
# the model architecture.
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_layernorm): LayerNorm((2560,), eps=1

In [41]:
# ✅ Generate answer from prompt
def generate_answer(prompt, max_new_tokens=150):
    """Generate a completion for ``prompt`` with the notebook's global LLM.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Sampling is
    enabled (temperature 0.7, top-k 50, top-p 0.9), so repeated calls with
    the same prompt may return different text.

    Args:
        prompt: Full prompt text fed to the model.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        Only the newly generated text, with surrounding whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # The generated sequence starts with the prompt tokens; remember how many
    # there are so they can be dropped by token count. Slicing the decoded
    # string by len(prompt) is unreliable because decoding the prompt tokens
    # does not always reproduce the prompt text character for character.
    prompt_length = inputs["input_ids"].shape[1]
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.9,
            # Explicit padding id avoids the "Setting `pad_token_id` to
            # `eos_token_id`" warning on every call.
            pad_token_id=tokenizer.eos_token_id,
        )
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


In [43]:
# 🔍 Sample question
question = "Can you help me with scheduling meetings?"

# 🔎 Embed + search + build prompt
# The question is embedded with the same model as the chunks, the three
# nearest chunk ids are fetched from the index, and their text is placed in
# the prompt as context.
query_vector = embed_query(question, model_embedd)
top_indices = search_faiss(query_vector, index, top_k=3)
print(top_indices)  # show which chunk ids were retrieved
final_prompt = build_prompt(question, top_indices, chunk_id_to_text)

# 💬 Generate LLM response
answer = generate_answer(final_prompt)
print("🤖 LLM Answer:\n", answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[16 32  0]
🤖 LLM Answer:
 Sure! I can help you manage your schedule and create reminders for upcoming meetings. What type of meetings do you have scheduled?


In [17]:
# List the regular files in the data directory with their lower-cased
# extensions. The loop variables are deliberately not called `file_path`, so
# running this cell does not overwrite the path used by the loading cell.
# Entries are sorted because os.listdir returns them in arbitrary order.
data_dir = r"DATA_POOL"
for entry_name in sorted(os.listdir(data_dir)):
    entry_path = os.path.join(data_dir, entry_name)
    if os.path.isfile(entry_path):
        ext = os.path.splitext(entry_name)[1].lower()
        print(f"Found file: {entry_name} (Type: {ext})")

Found file: sample_json.json (Type: .json)
Found file: sample_pdf.pdf (Type: .pdf)
Found file: sample_txt.txt (Type: .txt)
