In [2]:
%%capture
!pip install -q pyngrok dagshub mlflow python-dotenv langchain langchain-openai langchain-together langchain-community pypdf pinecone streamlit langchain-pinecone langchain-huggingface

In [None]:
#@markdown **You need to sign up for [DagsHub](https://dagshub.com/user/sign_up), then enter the name of the repository you'd like to create, and your username and email.**

#@markdown Enter the repository name for the project:
REPO_NAME= "komrag" #@param {type:"string"}

#@markdown Enter the username of your DagsHub account:
USER_NAME = "OsipovStas" #@param {type:"string"}

#@markdown Enter the email for your DagsHub account:
# NOTE(review): EMAIL is not read by any cell visible in this notebook.
EMAIL = "stasstels@gmail.com" #@param {type:"string"}

#@markdown ---

import IPython

In [4]:
from dotenv import load_dotenv

# Load variables from ../app/.env into the process environment; override=True lets
# values from the file replace variables that are already set.
load_dotenv("../app/.env", override=True)

True

In [None]:
# Colab-only variant: read the DagsHub token from Colab's user secrets.
# Outside Colab this import fails; use the os.environ variant in the next cell instead.
from google.colab import userdata
DH_TOKEN = userdata.get('LLM_MATH_COMP_DH_TOKEN')


In [None]:
import os

# Local variant of the Colab secrets cell above: read the same token from the
# process environment (raises KeyError when the variable is not set).
DH_TOKEN = os.environ['LLM_MATH_COMP_DH_TOKEN']

In [None]:
import mlflow
import dagshub
import os
# Account, repository and token all come from the configuration / token cells above.
username = str(USER_NAME)
repository = str(REPO_NAME)
token = str(DH_TOKEN)

# Export the credentials under the MLFLOW_TRACKING_* names for the MLflow client.
os.environ.update(
    MLFLOW_TRACKING_USERNAME=username,
    MLFLOW_TRACKING_PASSWORD=token,
)

# Register the token with the DagsHub client, then bind this session to the repository.
dagshub.auth.add_app_token(token)
dagshub.init(repo_owner=USER_NAME, repo_name=REPO_NAME)

# Point MLflow at the repository's tracking endpoint on DagsHub.
mlflow.set_tracking_uri("https://dagshub.com/{}/{}.mlflow".format(USER_NAME, REPO_NAME))

In [None]:
# LangChain tracing settings, exported through the environment.
# The API key comes from Colab user secrets (`userdata`), so this cell is Colab-only.
os.environ.update(
    {
        'LANGCHAIN_API_KEY': userdata.get("LANGCHAIN_API_KEY"),
        'LANGCHAIN_TRACING_V2': "true",
        'LANGCHAIN_ENDPOINT': "https://api.smith.langchain.com",
        'LANGCHAIN_PROJECT': "komrag",
    }
)

In [None]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object for the repository's DagsHub storage bucket.
# The owner/repo pair is built from USER_NAME / REPO_NAME (set in the configuration
# cell) rather than hard-coded, consistent with the dagshub.init / save_notebook cells.
s3 = get_repo_bucket_client(f"{USER_NAME}/{REPO_NAME}")

# Upload file
s3.upload_file(
    Bucket=REPO_NAME,  # name of the repo
    Filename="./life_begin.pdf",  # local path of file to upload
    Key="life_begin.pdf",  # remote path where to upload the file
)

In [None]:
from dagshub import get_repo_bucket_client
# Get a boto3.client object for the repository's DagsHub storage bucket.
# The owner/repo pair is built from USER_NAME / REPO_NAME (set in the configuration
# cell) rather than hard-coded, consistent with the dagshub.init / save_notebook cells.
s3 = get_repo_bucket_client(f"{USER_NAME}/{REPO_NAME}")


s3.download_file(
    Bucket=REPO_NAME,  # name of the repo
    Key="life_begin.pdf",  #  remote path from where to download the file
    Filename="life_begin.pdf",  # local path where to download the file
)

In [None]:
from dagshub.notebook import save_notebook

# Save this notebook to the DagsHub repository at the given in-repo path.
save_notebook(repo=f"{USER_NAME}/{REPO_NAME}", path="./notebooks/exploratory-local.ipynb")

# LLM Configuration

In [None]:
# Model identifier passed to AzureChatOpenAI in the next cell.
model="gpt-4o-mini-2024-07-18"

In [None]:
from langchain_openai import AzureChatOpenAI

# Option A: Azure OpenAI chat model with temperature=0 and timeout=120.
# NOTE(review): the ChatTogether cell below rebinds `chat`; run only the backend you intend to use.
chat = AzureChatOpenAI(model=model, temperature=0, timeout=120)

In [None]:
from langchain_together import ChatTogether

# Option B: Together AI chat model. This rebinds `chat`, replacing the Azure client
# if both cells are run. The API key is read from Colab user secrets (`userdata`).
# choose from our 50+ models here: https://docs.together.ai/docs/inference-models
chat = ChatTogether(
    together_api_key=userdata.get("TOGETHER_KEY"),
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
    timeout=120
)

In [None]:
# Smoke test: check that the selected chat model responds and can answer in Russian.
chat.invoke("Hello, do you know russian? Write a response in it")

In [None]:
from langchain_openai import AzureOpenAIEmbeddings

# Option A: Azure OpenAI embeddings. dimensions=768 is the same value as the
# dimension of the Pinecone index created in the Indexing section.
embeddings = AzureOpenAIEmbeddings(model="text-embedding-3-small-1", dimensions=768)

In [None]:
from langchain_together import TogetherEmbeddings

# Option B: Together AI embeddings. This rebinds `embeddings`, replacing the Azure
# client if both cells are run. The API key is read from Colab user secrets (`userdata`).
embeddings = TogetherEmbeddings(
    together_api_key=userdata.get("TOGETHER_KEY"),
    model="togethercomputer/m2-bert-80M-8k-retrieval",
)



In [None]:
# Sanity check: length of one embedding vector. The Pinecone index in the Indexing
# section is created with dimension=768, so this should match that value.
len(embeddings.embed_query("Приветы, как дела?"))

# Loading

In [None]:
# Local path of the PDF to load (the same file name the download cell above writes).
file_path = "life_begin.pdf"

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Collect everything the loader yields into `pages`.
# alazy_load() is consumed with a top-level `async for`, so this cell relies on
# IPython's support for async code at cell level and will not run as a plain script.
loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [None]:
# Number of documents loaded from the PDF.
len(pages)

# Text cleanup

In [None]:
# Prompt for the OCR-cleanup chain. The single placeholder {text} is filled with the
# raw page text; the model is instructed to return only the cleaned text.
template="""
You will receive a text in Russian extracted using an OCR tool.
Your task is to clean and format this text to enhance readability.
The text may contain various artifacts introduced during the OCR process, such as:

 - Special characters or symbols that do not belong.
 - Inconsistent casing of letters (e.g., random uppercase or lowercase letters).
 - Incorrect spacing within words or between words (e.g., words glued together or unnecessary spaces).
 - Hyphenation errors, such as words split across lines.
 - Incorrect punctuation or misplaced commas and periods.
 - New line characters

Please ensure the final output is free from these artifacts and is well-formatted.
Pay attention to punctuation, paragraph structure, and overall coherence.
The goal is to produce a clean, easy-to-read text.
Output ONLY the cleaned text.


### TEXT ###

{text}

"""

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Build the chat prompt for OCR cleanup from the template string above.
# NOTE(review): the name `prompt` is rebound later by the hub.pull(...) cell.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Cleanup pipeline: fill the prompt, call the chat model, parse the reply to a plain string.
chain = prompt | chat | StrOutputParser()

In [None]:
# Try the cleanup chain on a single page (index 30) before processing the whole document.
print(chain.invoke({"text": pages[30].page_content}))

In [None]:
def clean_text(text, max_attempts=3):
    """Clean OCR text with the cleanup chain, retrying on failure.

    Args:
        text: Raw page text to clean.
        max_attempts: How many times to call the chain before giving up
            (default 3, the previously hard-coded value).

    Returns:
        A ``(result, ok)`` tuple. When the chain succeeds, ``result`` is the
        cleaned text and ``ok`` is True. When every attempt fails, ``result``
        is the original ``text`` unchanged and ``ok`` is False. Blank input is
        returned as-is with ``ok=True`` without calling the model.
    """
    # Nothing to clean: do not spend an LLM call on an empty page, where the
    # model has no text to work with.
    if not text or not text.strip():
        return (text, True)

    for _ in range(max_attempts):
        try:
            return (chain.invoke({"text": text}), True)
        except Exception as e:
            # Deliberate best-effort: report the failure and try again.
            print(f"Error cleaning text: {e}")
    return (text, False)

In [None]:
from tqdm import tqdm

# Pages whose cleanup failed after all retries; they keep their raw OCR text.
failures = []
for page in tqdm(pages, desc="Cleaning text", unit="page"):
    cleaned, ok = clean_text(page.page_content)
    if ok:
        # Overwrite the page text in place with the cleaned version.
        page.page_content = cleaned
        continue
    failures.append(page)

In [None]:
# Spot-check one page (index 3) after the cleanup pass.
print(pages[3].page_content)

# Splitting

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the pages into chunks using chunk_size=1000 and chunk_overlap=200.
# NOTE(review): `splits` is not referenced by the later cells shown here — the Indexing
# section calls index_all_docs(pages). Confirm whether chunks or whole pages should be indexed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
splits = text_splitter.split_documents(pages)

len(splits)

In [None]:
# Inspect one chunk (index 8) produced by the splitter.
splits[8]

# Indexing

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Local variant: create the Pinecone client from the PINECONE_API_KEY environment
# variable (raises KeyError when it is not set).
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

In [None]:
from pinecone import Pinecone, ServerlessSpec


# Colab variant: the API key comes from Colab user secrets (`userdata`).
# Use the imported `Pinecone` class directly; the bare module name `pinecone`
# is never imported in this notebook, so `pinecone.Pinecone` raised NameError.
pc = Pinecone(api_key=userdata.get('PINECONE_KEY'))

In [None]:
# Show the indexes that exist before the create-if-missing cell runs.
pc.list_indexes()

In [None]:
import time

index_name = "komrag"  # change if desired

# Names of the indexes reported by the Pinecone client.
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the index only when it is not there yet, then wait for it to become ready.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the index reports ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

index = pc.Index(index_name)

In [None]:
# Show the indexes again after the create-if-missing cell has run.
pc.list_indexes()

In [None]:
from langchain_pinecone import PineconeVectorStore

# Vector store used for writing: wraps the Pinecone index with the selected embedding model.
# index_batch() below adds documents through this object.
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [None]:
import hashlib

def generate_id(content):
    """Return the SHA-256 hex digest of ``content``.

    A ``str`` is encoded as UTF-8 before hashing; any other value is handed to
    hashlib unchanged and therefore must be bytes-like.
    """
    payload = content.encode('utf-8') if isinstance(content, str) else content
    return hashlib.sha256(payload).hexdigest()

In [None]:
def index_all_docs(docs, batch_size=100):
    """Index ``docs`` in fixed-size batches, showing a progress bar.

    Args:
        docs: Sequence of documents to index. It is sliced, so it must support
            ``len()`` and slicing.
        batch_size: Number of documents handed to each ``index_batch`` call
            (default 100, the previously hard-coded value).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative step would
            otherwise produce an empty range and silently index nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    for start in tqdm(range(0, len(docs), batch_size), desc="Indexing", unit="batch"):
        index_batch(docs[start:start + batch_size])

def index_batch(docs):
    """Add one batch of documents to the vector store.

    Each document's id is the hash of its ``page_content`` as computed by
    ``generate_id``, so identical content always maps to the same id.
    """
    content_ids = list(map(generate_id, (doc.page_content for doc in docs)))
    vector_store.add_documents(documents=docs, ids=content_ids)

In [None]:
# NOTE(review): this indexes the whole `pages`, not the `splits` produced in the
# Splitting section — confirm that page-level indexing is intended.
index_all_docs(pages)

# Retrieve

In [None]:
from langchain_pinecone import PineconeVectorStore

# Vector store used for retrieval. Built from the same index and embeddings as
# `vector_store` in the Indexing section, under a second name.
vectorstore = PineconeVectorStore(index=index, embedding=embeddings)

In [None]:
# Similarity-search retriever configured with k=6.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Try it on a sample question (Russian: "At what temperature should a child be bathed?").
retrieved_docs = retriever.invoke("В какой температуре купать ребенка?")

len(retrieved_docs)

In [None]:
# Inspect the last of the retrieved documents (index 5); raises IndexError if fewer than six came back.
print(retrieved_docs[5].page_content)

# Generation

In [None]:
from langchain import hub

# Pull the "rlm/rag-prompt" prompt from the LangChain hub.
# NOTE(review): this rebinds `prompt`, which earlier held the OCR-cleanup prompt; re-running the
# `chain = prompt | chat | ...` cell after this one would build the cleanup chain from the RAG prompt.
prompt = hub.pull("rlm/rag-prompt")

# Render the prompt with placeholder values to inspect the messages it produces.
example_messages = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()

example_messages

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# RAG pipeline: retrieve documents for the question and format them as context,
# fill the prompt, call the chat model, and parse the reply to a plain string.
# The chat model in this notebook is bound to `chat` (see "LLM Configuration");
# no cell defines `llm`, so the original reference raised NameError.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chat
    | StrOutputParser()
)


In [None]:

# Stream the answer and print each chunk as it arrives (end="" keeps the chunks on one line).
# The question is Russian for "At what temperature should a child be bathed?".
for chunk in rag_chain.stream("При какой температуре купать ребенка?"):
    print(chunk, end="", flush=True)

# NGROK

In [None]:
from pyngrok import ngrok

# Configure pyngrok with the auth token from the NGROK_KEY environment variable
# (raises KeyError when it is not set).
ngrok.set_auth_token(os.environ['NGROK_KEY'])


t=2024-10-20T14:39:57+0300 lvl=warn msg="invalid tunnel configuration" pg=/api/tunnels id=b7fac91c62981aa6 err="yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"


In [8]:
import os
# Confirm the key is configured without echoing the secret itself: printed values
# are stored in the notebook's saved output and leak when the file is shared.
print("NGROK_KEY is set" if os.environ.get('NGROK_KEY') else "NGROK_KEY is NOT set")

[REDACTED: ngrok auth token removed from saved output — rotate this key]


In [11]:
# Start ngrok tunnel
public_url = ngrok.connect(addr='localhost:8501')
print(f"Streamlit app is live at {public_url}")

Streamlit app is live at NgrokTunnel: "https://994d-2602-815-0-4-00-b18.ngrok-free.app" -> "http://localhost:8501"
