<a href="https://colab.research.google.com/github/Seraph1604/AAI_RAG/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U sentence-transformers faiss-cpu
!pip install pymupdf
!pip install --upgrade torch torchvision torchaudio
!pip install --upgrade sentence-transformers
!pip install google-generativeai

Collecting torch
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Using cached torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Using cached torchaudio-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cusparselt-cu12==0.6.2 (from torch)
  Using cached nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting triton==3.2.0 (from torch)
  Using cached triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl (766.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl (150.1 MB)
Using cached triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (253.2 MB)
Using cached torchv



In [None]:
def parse_text(text, chunk_size=750, overlap=50):
    """Split ``text`` into fixed-width character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so the
    last ``overlap`` characters of one chunk are repeated at the start of the
    next one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order (empty for empty ``text``).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # An overlap >= chunk_size gives a zero or negative stride (range() would
    # fail or silently yield nothing); a negative overlap would skip text
    # between chunks. Both lose data, so reject them up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

In [None]:
import fitz  # PyMuPDF
import re

def extract_text_pdf(pdf_path):
    """Read a PDF and return its text as one cleaned, single-line string.

    The text of every page is concatenated in page order, every run of
    whitespace is collapsed to a single space, and every character that is not
    a word character, whitespace, comma, period or hyphen is removed.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The cleaned text of the whole document.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        # join() avoids the quadratic cost of repeated string concatenation.
        text = "".join(page.get_text() for page in doc)

    # For str patterns `\s` already matches "\n" and the non-breaking space
    # "\xa0", so one substitution covers all whitespace normalisation.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s,.-]', '', text)
    return text


In [None]:
# Extract the cleaned text of both source PDFs in one pass.
pdf_paths = ('docs/doc1.pdf', 'docs/doc2.pdf')
text1, text2 = map(extract_text_pdf, pdf_paths)

In [None]:
# Corpus of text snippets that the retrievers further down index and look up
# by position.
docs = []

In [None]:
import pandas as pd
# Each row of the card table becomes one retrievable document.
df = pd.read_csv('docs/cards.csv')

# Render a row as space-separated "column: value" pairs.
card_docs = [
    " ".join(f"{col}: {value}" for col, value in row.items())
    for _, row in df.iterrows()
]

# Rebind `docs` instead of appending to it, so re-running this cell cannot
# duplicate the card rows in the corpus.
docs = list(card_docs)

# Plain-text dump of the card rows only, one row per line.
text3 = "\n".join(card_docs)


In [None]:
# The corpus is: one entry per card row (len(df) of them), then the chunks of
# the first PDF, then the chunks of the second PDF. Rebuilding from the card
# prefix, rather than `docs += ...`, keeps this cell idempotent: re-running it
# cannot append the same chunks twice and shift document positions.
docs = (
    docs[:len(df)]
    + parse_text(text1, chunk_size=500, overlap=10)
    + parse_text(text2, chunk_size=250, overlap=5)
)

In [None]:
# Number of documents in the corpus (displayed as the cell result).
len(docs)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch

from pathlib import Path

# Sentence encoder used for both the corpus and the queries.
model1 = SentenceTransformer("multi-qa-mpnet-base-cos-v1")

# Encoding the corpus is expensive, so the result is cached on disk.
EMBEDDINGS_PATH = Path("embeddings.npy")

embeddings = None
if EMBEDDINGS_PATH.exists():
    embeddings = torch.tensor(np.load(EMBEDDINGS_PATH))
    # Row i of the cache must describe docs[i]; a cache built from a corpus of
    # a different size is stale and would make retrieval return wrong texts.
    if embeddings.shape[0] != len(docs):
        embeddings = None

if embeddings is None:
    # No usable cache: encode the corpus and store it for later runs.
    embeddings = model1.encode(docs, convert_to_tensor=True).cpu()
    np.save(EMBEDDINGS_PATH, embeddings.numpy())

In [None]:
# Shape of the embedding tensor (displayed as the cell result).
embeddings.shape

In [None]:
import torch
import faiss

# Build an exact (brute-force) L2 index over the corpus embeddings.
corpus_vectors = embeddings.cpu().numpy()
dimension = corpus_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(corpus_vectors)

def search(query, top_k=5):
    """Return the positions of the corpus entries nearest to ``query``.

    The query is encoded with the module-level ``model1`` and looked up in the
    module-level FAISS ``index``.

    Args:
        query: Query string.
        top_k: Maximum number of neighbours to return.

    Returns:
        1-D NumPy array of index positions ordered from nearest to farthest.
        It holds fewer than ``top_k`` entries when the index is smaller than
        ``top_k`` and is empty when nothing can be returned.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which callers would otherwise use as a valid
    # "last element" position.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return np.empty(0, dtype=np.int64)

    query_embedding = model1.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)

    hits = indices[0]
    # Defensive: drop any remaining -1 padding.
    return hits[hits >= 0]



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Lexical retriever: fit a TF-IDF vocabulary on the corpus and keep the
# resulting document-term matrix for similarity queries.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)

def find_similar(query, documents, tfidf_matrix, top_n=5):
    """Return the positions of the documents most similar to ``query`` by TF-IDF.

    The query is vectorised with the module-level ``vectorizer`` and compared
    against ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Query string.
        documents: Unused; kept so existing calls keep working.
        tfidf_matrix: Document-term matrix produced by ``vectorizer``.
        top_n: Maximum number of positions to return.

    Returns:
        1-D NumPy array of row positions, highest similarity first. Rows with
        zero similarity are left out, so the array may hold fewer than
        ``top_n`` entries or be empty.
    """
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    ranked = similarities.argsort()[::-1][:top_n]
    # A zero score means the document shares no term with the query; returning
    # it would pass unrelated text on to the caller.
    return ranked[similarities[ranked] > 0]

In [None]:
import google.generativeai as genai

import os

# Prefer a key supplied through the environment, so the secret does not have
# to live next to the documents; fall back to the first line of the key file.
api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if not api_key:
    # utf-8-sig also accepts a file saved with a byte-order mark.
    with open('docs/api.txt', 'r', encoding='utf-8-sig') as file:
        api_key = file.readline().strip()

# Fail here with a clear message rather than on the first generation request.
if not api_key:
    raise ValueError(
        "No API key found: set GOOGLE_API_KEY or put the key on the first line of docs/api.txt"
    )

genai.configure(api_key = api_key)
model = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
import numpy as np

In [None]:
query = "что обязуется сделать банк?"

# Hybrid retrieval: take candidates from the embedding index and from TF-IDF,
# then merge them.
results = search(query, 5)
resTFIDF = find_similar(query, docs, tfidf_matrix)
combined_results = np.concatenate((results, resTFIDF))
# np.unique removes duplicates and returns the positions in ascending order.
unique_results = np.unique(combined_results)

# FAISS reports missing neighbours as -1; a negative position would silently
# select a document from the end of the list, so skip it.
question_list = [docs[i] for i in unique_results if i >= 0]

# Put the retrieved context on its own line so the first document is not glued
# onto the end of the question.
question = 'Каждое предложение ответа пиши в новой строчке. Ответь на вопрос: '+ query + '\n' + '|'.join(question_list)
response = model.generate_content(question)
print(response.text)

Банк обязуется соблюдать конфиденциальность информации о клиенте.

Банк обязуется предоставить клиенту кредит в соответствии с кредитным договором.

Банк обязуется в течение 30 календарных дней подать в Росреестр заявление о погашении регистрационной записи об ипотеке (при полном исполнении клиентом обязательств по кредитному договору).

Банк обязуется возвратить сумму денежных средств и начисленные проценты в порядке и на условиях, предусмотренных договором вклада/договором накопительного счета.

