In [None]:
!pip install pandas feedparser tqdm faiss-cpu symspellpy rapidfuzz openai==1.* scikit-learn

In [9]:
import os, re, json
import pandas as pd
import numpy as np
import faiss
from openai import OpenAI
from rapidfuzz import process, fuzz
import urllib.parse
import urllib.request
import feedparser
import csv


In [7]:
# Base URL of the arXiv query endpoint; fetch() appends URL-encoded parameters to it.
API = "http://export.arxiv.org/api/query"

# Search phrase; fetch() sends it as "all:<QUERY>" in the search_query parameter.
QUERY = "machine learning"
# CSV file the downloaded records are written to and later read back from.
csv_path = "arxiv_min.csv"

def fetch(start: int, max_results: int):
    """Download one page of search results and return the parsed feed.

    Args:
        start: Offset of the first result to request.
        max_results: Number of results to request in this page.

    Returns:
        The object produced by ``feedparser.parse`` for the response body.
    """
    query_string = urllib.parse.urlencode({
        "search_query": f"all:{QUERY}",
        "start": start,
        "max_results": max_results,
    })
    # 60 s timeout so a stalled connection does not hang the notebook forever.
    with urllib.request.urlopen(f"{API}?{query_string}", timeout=60) as response:
        payload = response.read()
    return feedparser.parse(payload)

In [10]:
# Page through the search results until TARGET_RECORDS entries are collected
# or the feed returns an empty page, then persist title/authors/summary to CSV.
TARGET_RECORDS = 350  # number of abstracts kept in the corpus
PAGE_SIZE = 100       # entries requested per fetch() call

records = []
start = 0
while len(records) < TARGET_RECORDS:
    feed = fetch(start, PAGE_SIZE)
    entries = feed.entries
    if not entries:
        # No more results: stop instead of requesting empty pages forever.
        break
    for e in entries:
        # Read title the same defensive way as summary so an entry without a
        # title does not abort the whole download.
        title = (getattr(e, "title", "") or "").strip()
        summary = (getattr(e, "summary", "") or "").strip()
        try:
            authors = [a.name for a in e.authors]
        except (AttributeError, KeyError, TypeError):
            # Fall back to the single "author" field when the list is missing or malformed.
            authors = [e.author] if hasattr(e, "author") else []
        records.append({
            "title": title,
            # Skip empty/None names so join() cannot fail or emit ", ,".
            "authors": ", ".join(a for a in authors if a),
            "summary": summary,
        })
    # Advance by the number of entries actually returned, which may be < PAGE_SIZE.
    start += len(entries)

# The last page can overshoot the target; trim to exactly TARGET_RECORDS.
records = records[:TARGET_RECORDS]

with open(csv_path, "w", encoding="utf-8", newline="") as f:
    w = csv.DictWriter(f, fieldnames=["title", "authors", "summary"])
    w.writeheader()
    w.writerows(records)

In [11]:
# Load the saved corpus, replace missing cells with empty strings and keep only
# rows that have both a non-blank title and a non-blank summary.
df = pd.read_csv(csv_path).fillna({"title": "", "authors": "", "summary": ""})
has_summary = df["summary"].str.strip() != ""
has_title = df["title"].str.strip() != ""
df = df[has_summary & has_title].reset_index(drop=True)

In [12]:
# Whitespace-delimited word count of every summary, followed by summary statistics.
df["len_words"] = df["summary"].map(
    lambda abstract: len(str(abstract).split())
)
print(df["len_words"].describe())

count    350.000000
mean     157.202857
std       53.237065
min       30.000000
25%      118.250000
50%      156.000000
75%      190.750000
max      439.000000
Name: len_words, dtype: float64


In [13]:
# Endpoint of the OpenAI-compatible API used for embeddings and chat.
BASE_URL = "https://api.vsegpt.ru/v1/"
# SECURITY: the key is read from the environment instead of being committed with
# the notebook. Any key that was previously hardcoded here must be treated as
# leaked and revoked. Set it before starting the kernel:
#     export VSEGPT_API_KEY=...
# When the variable is unset API_KEY is None; the OpenAI 1.x client is expected
# to reject a missing key at construction time (verify with the installed version).
API_KEY = os.environ.get("VSEGPT_API_KEY")
EMB_MODEL = "emb-openai/text-embedding-3-small"  # model name passed to embeddings.create
CHAT_MODEL = "gpt-4o-mini"                       # model name passed to chat.completions.create
CSV_IN = "arxiv_min.csv"
INDEX_PATH = "faiss.index"  # FAISS index file written by save_store / read by load_store
META_PATH = "meta.json"     # chunk metadata stored next to the index
OUT_ANS = "answers.csv"

In [14]:
# Single shared API client used by embed_texts() and chat().
client = OpenAI(
    base_url=BASE_URL,
    api_key=API_KEY,
)

In [30]:
def word_chunks(text, chunk_words=150, overlap=15):
    """Split text into overlapping chunks of whitespace-delimited words.

    Args:
        text: Input string; any run of whitespace separates words.
        chunk_words: Maximum number of words per chunk (must be >= 1).
        overlap: Number of trailing words of a chunk repeated at the start of
            the next one (must satisfy 0 <= overlap < chunk_words).

    Returns:
        List of chunk strings with words joined by single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If the parameters would prevent the window from advancing.
            Without this check overlap >= chunk_words could loop forever.
    """
    if chunk_words < 1:
        raise ValueError("chunk_words must be >= 1")
    if not 0 <= overlap < chunk_words:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_words")

    ws = text.split()  # splits on whitespace runs and drops empty strings
    n = len(ws)
    step = chunk_words - overlap  # always >= 1 thanks to the validation above
    out = []
    for i in range(0, n, step):
        j = min(i + chunk_words, n)
        out.append(" ".join(ws[i:j]))
        if j == n:
            # The last window reached the end; stop so no redundant tail chunk is emitted.
            break
    return out

In [16]:
# Chunking and retrieval settings.
CHUNK_WORDS = 150  # words per chunk; build_chunks passes this to word_chunks
OVERLAP = 15       # overlap (in words) build_chunks passes to word_chunks
TOP_K = 6          # default number of hits requested by search()
def build_chunks(df):
    """Split every summary into word chunks and attach the source metadata.

    Args:
        df: DataFrame with "summary", "title" and "authors" columns.

    Returns:
        List of dicts with keys "text", "doc_id" (the row's index label),
        "chunk_id" (position of the chunk within its summary), "title" and
        "authors".
    """
    rows = []
    columns = zip(df.index, df["summary"], df["title"], df["authors"])
    for doc_id, summary, title, authors in columns:
        pieces = word_chunks(summary, CHUNK_WORDS, OVERLAP)
        rows.extend(
            {"text": piece, "doc_id": doc_id, "chunk_id": chunk_id, "title": title, "authors": authors}
            for chunk_id, piece in enumerate(pieces)
        )
    return rows

In [17]:
def vocab_from(df, meta, limit_words_per_chunk=60):
    """Collect the sorted vocabulary of the corpus for query spell-correction.

    Tokens are lower-cased runs of Latin/Cyrillic letters, digits and hyphens.

    Args:
        df: DataFrame whose "title" and "authors" columns contribute tokens.
        meta: Iterable of chunk dicts; the first ``limit_words_per_chunk``
            words of each "text" value contribute tokens.
        limit_words_per_chunk: Number of leading words taken from each chunk.

    Returns:
        Sorted list of unique tokens.
    """
    separators = re.compile(r"[^a-z0-9а-яё\-]+")

    def tok(s):
        # Replace every run of non-token characters by a space, then split.
        return separators.sub(" ", s.lower()).split()

    vocabulary = set()
    for column in ("title", "authors"):
        for value in df[column].tolist():
            vocabulary.update(tok(value))
    for chunk in meta:
        head = chunk["text"].split()[:limit_words_per_chunk]
        vocabulary.update(tok(" ".join(head)))
    return sorted(vocabulary)


In [18]:
def embed_texts(texts, batch=128):
    """Embed texts with the configured model and L2-normalise each vector.

    Args:
        texts: List of strings to embed.
        batch: Number of texts sent per API request.

    Returns:
        float32 array with one unit-length row per input text.
    """
    parts = []
    for offset in range(0, len(texts), batch):
        response = client.embeddings.create(
            model=EMB_MODEL,
            input=texts[offset:offset + batch],
        )
        rows = [item.embedding for item in response.data]
        parts.append(np.array(rows, dtype=np.float32))
    matrix = np.vstack(parts)
    # The small epsilon keeps an all-zero vector from dividing by zero.
    norms = np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-12
    matrix /= norms
    return matrix

In [19]:
def build_faiss(X):
    """Create a flat inner-product FAISS index holding the rows of X.

    Args:
        X: 2-D array of vectors; the second dimension sets the index dimensionality.

    Returns:
        The populated ``faiss.IndexFlatIP`` instance.
    """
    dim = X.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(X)
    return index

In [20]:
def save_store(index, meta):
    """Persist the FAISS index to INDEX_PATH and the chunk metadata to META_PATH.

    Args:
        index: FAISS index to write.
        meta: JSON-serialisable chunk metadata, stored as UTF-8 without ASCII escaping.
    """
    faiss.write_index(index, INDEX_PATH)
    with open(META_PATH, "w", encoding="utf-8") as meta_file:
        json.dump(meta, meta_file, ensure_ascii=False)

In [21]:
def load_store():
    """Load the FAISS index from INDEX_PATH and the chunk metadata from META_PATH.

    Returns:
        Tuple ``(index, meta)``: the FAISS index and the decoded JSON metadata.
    """
    idx = faiss.read_index(INDEX_PATH)
    # Context manager closes the file deterministically (the handle was previously leaked).
    with open(META_PATH, "r", encoding="utf-8") as f:
        meta = json.load(f)
    return idx, meta

In [22]:
def correct_query(q, vocab):
    """Normalise a query and replace unknown tokens by their closest vocabulary word.

    The query is lower-cased and everything except Latin/Cyrillic letters,
    digits and hyphens is treated as a separator. Tokens already present in
    ``vocab`` are kept; any other token is replaced by the best fuzzy match
    (QRatio score >= 85) or kept unchanged when no candidate reaches the cutoff.

    Args:
        q: Raw user query.
        vocab: Collection of known lower-case tokens.

    Returns:
        The corrected tokens joined by single spaces, or ``q`` itself when the
        query contains no tokens at all.
    """
    toks = re.sub(r"[^a-z0-9а-яё\-]+", " ", q.lower()).split()
    # Build the membership set once: vocab is usually a list, and scanning it
    # for every token would be linear in the vocabulary size.
    known = set(vocab)
    out = []
    for t in toks:
        if t in known:
            out.append(t)
            continue
        hit = process.extractOne(t, vocab, scorer=fuzz.QRatio, score_cutoff=85)
        out.append(hit[0] if hit else t)
    return " ".join(out) if out else q

In [23]:
def search(index, meta, query, k=TOP_K):
    """Return the metadata of the k chunks most similar to the query.

    Args:
        index: FAISS index to search.
        meta: Chunk metadata list, indexable by the ids the index returns.
        query: Query text; it is embedded with embed_texts.
        k: Number of neighbours to request.

    Returns:
        List of copies of the matching metadata dicts, each extended with
        "rank" (1-based position in the result list) and "score".
    """
    query_vec = embed_texts([query])
    scores, ids = index.search(query_vec, k)
    results = []
    pairs = zip(ids[0].tolist(), scores[0].tolist())
    for position, (meta_idx, score) in enumerate(pairs, start=1):
        if meta_idx >= 0:  # negative ids are skipped
            results.append({**meta[meta_idx], "rank": position, "score": float(score)})
    return results

In [24]:
def chat(prompt):
    """Send a single-turn user prompt to the chat model and return its reply.

    Args:
        prompt: Full prompt text, sent as one user message.

    Returns:
        The reply text with surrounding whitespace removed; an empty string
        when the model returns no text content.
    """
    r = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
        max_tokens=800
    )
    # message.content is optional in the response and may be None;
    # guard so .strip() cannot raise AttributeError.
    content = r.choices[0].message.content
    return (content or "").strip()

In [32]:
def answer_rag(q, hits):
    """Build a context-grounded prompt from the retrieved chunks and ask the chat model.

    Args:
        q: The user's question, inserted verbatim into the prompt.
        hits: Retrieved chunk dicts with "title", "authors" and "text" keys.

    Returns:
        The answer text returned by chat().
    """
    # One context entry per hit: a header line, then at most 160 words of the chunk.
    pieces = [
        f"- {hit['title']} — {hit['authors']}\n" + " ".join(hit["text"].split()[:160])
        for hit in hits
    ]
    ctx = "\n\n".join(pieces)
    prompt = f"""
    You are a scientific assistant. Use THIS CONTEXT (fragments of arXiv abstracts) to answer.
    If something is not found in the context, say "not found in the provided corpus".
    Answer briefly, clearly, and include references to the article titles from the context.

    User question: {q}

    CONTEXT:
    {ctx}

    Instructions:
    - Provide key ideas and results, mention methods/datasets/metrics if available.
    - At the end, list up to 3 article titles and authors from the context that you based the answer on.
    """
    return chat(prompt)

In [35]:
# Build and persist the vector store only when one of the cache files is missing;
# afterwards always work with the copy loaded from disk.
# NOTE(review): the check only tests file existence, so a cache built from an older
# corpus is reused as-is; delete both files after re-fetching the data.
if not os.path.exists(INDEX_PATH) or not os.path.exists(META_PATH):
    meta = build_chunks(df)
    X = embed_texts([m["text"] for m in meta])
    index = build_faiss(X)
    save_store(index, meta)
index, meta = load_store()
vocab = vocab_from(df, meta)

# Demo questions run through the pipeline. The first one contains misspellings
# ("Federeted learnnng"), presumably on purpose to exercise correct_query.
queries = [
    "Federeted learnnng with differential privacy: key ideas and challenges",
    "Key concepts and architectures of transformer models in NLP",
    "Methods for improving robustness and calibration of deep neural networks",
    "Techniques for fine-tuning large language models",
    "Causal representation learning: main goals, methods, and common pitfalls"
]

# For every demo query: spell-correct it, retrieve the top chunks, ask the model,
# and print the answer together with the ranked source titles.
for q in queries:
    q2 = correct_query(q, vocab)
    hits = search(index, meta, q2, TOP_K)
    # Retrieval uses the corrected query; the model is shown the user's original wording.
    a_rag = answer_rag(q, hits)

    print("Вопрос:", q)
    # correct_query always lower-cases and strips punctuation, so comparing q2 with
    # the raw query would flag almost every query as "corrected". Compare against the
    # same normalisation instead so the line appears only when a word really changed.
    q_norm = " ".join(re.sub(r"[^a-z0-9а-яё\-]+", " ", q.lower()).split())
    if q2 != q and q2 != q_norm:
        print("Исправленный запрос:", q2)
    print("\n RAG: \n", a_rag)
    print("\nТоп статьи:")
    for h in hits:
        print(f"  [{h['rank']}] {h['title']} ({h['score']:.3f})")

Вопрос: Federeted learnnng with differential privacy: key ideas and challenges
Исправленный запрос: federated learning with differential privacy key ideas and challenges

 RAG: 
 Federated learning combined with differential privacy presents unique challenges and opportunities. Federated learning allows decentralized training of models without sharing sensitive data, which is crucial in sensitive domains like healthcare. However, ensuring patient privacy while maintaining model accuracy is complex. 

One study, "Federated and Differentially Private Learning for Electronic Health Records" by Stephen R. Pfohl et al., explores the efficacy of centralized versus federated learning in private and non-private settings, particularly for clinical prediction tasks. It highlights that while applying differentially private stochastic gradient descent is straightforward in centralized settings, it becomes significantly more challenging in federated contexts.

Another relevant work, "Federated Lear