In [1]:
# parameters :
# chunking , different chunking for each type
# embedding modules
# reranking module



# ✅ 1. embed_documents(texts: List[str]) -> List[List[float]]
# Takes a list of strings (your text chunks).
# Returns a list of embedding vectors (each a list or NumPy array of floats).
# The number of output vectors must equal the number of input texts.
# ✅ 2. embed_query(text: str) -> List[float]

# Create task from the query and then retrieve based on it

# so you got to use the retrieve from task as a tool

# paraphrase-MiniLM-L6-v2 , Instructor-XL / Instructor-Large,  Instructor-XL / Instructor-Large
# retrieve answers in the first place
# cross-encoder/ms-marco-MiniLM-L-6-v2 => reranker for q - a relevance
# maybe use bge embeddings with BAAI/bge-reranker-large


# ACNE or dual encoder :
# E5-large-v2 → trained with explicit “query: ... passage: ...” format



# use a pipeline and loop again if not good answer?


# pipelines to try :
# ACNE dual encoders : Approximate Nearest Neighbor Negative Contrastive Estimation for Dense Text Retrieval (ANCE)”, RocketQA, GTR, E5 v2
# generate a search query : User Query → LLM reformulates ("Find passages describing X, not asking about it")  : Self-rag, RRF + LLM-guided retriever (2024
# assymetric encoder : different encoder for each of the query and the knowledge : ColBERT, TART, T5-embed, Dragon+ (2024)
# ColBERTv2 — represents documents as multiple token embeddings (instead of one mean vector), improving granularity.

# Bonus: You can use the LLM’s feedback to iteratively refine retrieval → this is how Self-RAG achieves much higher precision.

In [None]:
# E5
# we will use this one and if so
# have to add query before each query and a passage in the split docs

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/e5-small-v2")
texts = [
    "query: What is the capital of France?",
    "passage: Paris is the capital and most populous city of France.",
    "What are you doing here in France?"
]
embeddings = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

class E5Embedder:
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).tolist()
    def embed_query(self, text: str) -> List[float]:
        return model.encode([text], convert_to_numpy=True, normalize_embeddings=True)[0].tolist()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(embeddings)

# Display results
print("Pairwise cosine similarity:\n", np.round(similarity_matrix, 3))

In [None]:
from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd
import torch

# Load model
tokenizer = TapasTokenizer.from_pretrained("google/tapas-large-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-large-finetuned-wtq")



In [None]:
model.save("models/tapas-large-finetuned-wtq")

In [9]:
# Example table
data = {
    "Team": ["Arsenal", "Chelsea", "Liverpool"],
    "Points": [80, 75, 70],
    "Matches": [38, 38, 38]
}
table = pd.DataFrame.from_dict(data)

# Question
query = "How many points did Arsenal score?"

# Tokenize
inputs = tokenizer(table=table, queries=query, return_tensors="pt")

# Forward pass
outputs = model(**inputs)
logits = outputs.logits
logits_agg = outputs.logits_aggregation

# Decode answer
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs, logits, logits_agg
)
answers = []
for coordinates in predicted_answer_coordinates:
    if len(coordinates) == 1:
        answers.append(table.iat[coordinates[0]])
print("Answer:", answers)

TypeError: expected string or bytes-like object

In [None]:
SAVE_DIR = "./models/zephyr-7b-alpha-local"
tokenizer.save_pretrained(SAVE_DIR)
model.save_pretrained(SAVE_DIR)

In [2]:
from utils import *

try:
    result = 10 / 0
except Exception as e:
    write_error_log("Division by zero in computation block", e)


In [None]:
MODEL_PATH = "./zephyr-7b-alpha-local"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto"
)

In [None]:
r = textualize("./docs/Global_education.csv")

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("intfloat/e5-small-v2")

# example cache
query_cache = {
    "What is the capital of France?": {
        "embedding": embedder.encode("What is the capital of France?", normalize_embeddings=True),
        "retrieved_docs": [{"id": "doc_12"}, {"id": "doc_4"}]
    }
}

def get_cached_query_result(new_query, threshold=0.9):
    new_emb = embedder.encode(new_query, normalize_embeddings=True)
    for cached_q, entry in query_cache.items():
        sim = util.cos_sim(new_emb, entry["embedding"])[0][0].item()
        if sim > threshold:
            print(f"Cache hit! Similar to: {cached_q} (sim={sim:.3f})")
            return entry["retrieved_docs"]
    return None


In [None]:
from pinecone import Pinecone

pc = Pinecone(api_key="YOUR_KEY")
index = pc.Index("your-index-name")

# Retrieve vector and metadata by ID
response = index.fetch(ids=["doc_12", "doc_4"])
for id, record in response['vectors'].items():
    print(id, record['metadata'], record['values'])
for doc in results:
    print(doc.metadata)


In [None]:
{ "query": "What is the capital of France?", "retrieved_docs": [ {"id": "doc_12", "score": 0.92}, {"id": "doc_4", "score": 0.88} ] }

In [13]:
from sentence_transformers import CrossEncoder
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Example query + passages
query = "What is a padding oracle attack?"
passages = [
    "A padding oracle attack exploits cryptographic padding schemes.",
    "An SQL injection attack targets database queries.",
]

# Compute scores
scores = model.predict([(query, passage) for passage in passages])
print(scores)
x = list(zip(passages, scores))
x

[ 8.584671  -3.5387917]


[('A padding oracle attack exploits cryptographic padding schemes.', 8.584671),
 ('An SQL injection attack targets database queries.', -3.5387917)]

In [16]:
scored_docs = sorted(x, key=lambda x: x[1], reverse=True)
l = [x[0] for x in scored_docs]
l

['A padding oracle attack exploits cryptographic padding schemes.',
 'An SQL injection attack targets database queries.']

In [None]:
# 3awzeen function lel pdfs or text files el zeyada
# 3awzeenha telisten kol ma 7ad ye upload ay pdf aw youtube link to the file uploader we te read and index the file content
# then 3awzeen ne5aly el stremlit app 1 for youtube and 1 for 

In [None]:
# next 
# finish the streamlit app
# 1 page for home and another page for chatbot that will have predefined pdfs to be shown and allow to updload another pdf to add to the pinecone
# the allow to ask questions and retrieve and add the reference
# to add :
# history and context
# caching
# check first by an llm whethter the question needs retrieval ro not
# evaluate the answer and its relevance to the reference and might rewrite the query