In [4]:
import openai
import os
import json
import shutil

from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Generate a RayLLM serve config. The generator is interactive, so run it in
# your terminal rather than in this notebook cell (as bare text in a Python
# cell the command is a SyntaxError and stops "Run All"):
#
#     rayllm gen-config

In [None]:
# Deploy the Serve app to production as an Anyscale service.
# The -f flag points at the serve config file; serve_mistral_7b.yaml is
# presumably the file produced by `rayllm gen-config` in step 1 - confirm the
# file name matches what the generator wrote.
!anyscale service deploy -f serve_mistral_7b.yaml

In [None]:
def prompt_llm(service_url: str, prompt: str, model: str = "mistralai/Mistral-7B-Instruct-v0.1", temperature: float = 0, **kwargs):
    """Send a single-turn chat prompt to an OpenAI-compatible service.

    Args:
        service_url: Base URL of the deployed service (with or without a
            trailing slash). It must be an http(s) URL and must not contain
            "/routes"; "v1" is appended to it to form the API base URL.
        prompt: Text sent as the user message.
        model: Model identifier forwarded to the chat completions endpoint.
        temperature: Sampling temperature forwarded to the endpoint.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.chat.completions.create``.

    Returns:
        The object returned by ``client.chat.completions.create``.

    Raises:
        ValueError: If ``service_url`` is not an http(s) URL or contains
            "/routes".
    """
    # Fail early with a clear message instead of a confusing connection error,
    # e.g. when a variable name is passed as a quoted string by mistake.
    if not service_url.lower().startswith(("http://", "https://")):
        raise ValueError(f"service_url must be an http(s) URL, got {service_url!r}")

    # Ensure the URL has a trailing forward slash so "v1" can be appended.
    if not service_url.endswith("/"):
        service_url += "/"

    if "/routes" in service_url:
        raise ValueError("service_url must end with '.com'")

    # Build the client from the URL passed by the caller, not from a
    # notebook-level global.
    # NOTE(review): ANYSCALE_API_KEY is not defined in the visible cells; it
    # must exist at notebook scope before this function is called.
    client = openai.OpenAI(
        base_url=service_url + "v1",
        api_key=ANYSCALE_API_KEY,
    )

    # Call the chat completions endpoint.
    return client.chat.completions.create(
        model=model,
        messages=[
            # Prime the model with a system message - a common best practice.
            {"role": "system", "content": "You are a helpful assistant."},
            # Send the prompt with the "user" role.
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        **kwargs,
    )

In [None]:
service_url = "https://wheel-of-time-rag-3136s.cld-8lqvbtr41isy21zu.s.anyscaleuserdata.com/"
prompt = "Tell me something about Wheel of Time"
# Pass the URL variable itself, not the quoted name "service_url".
response = prompt_llm(service_url, prompt)
print(response.choices[0].message.content)

## Create RAG

In [None]:
# Root directory of the notebook's input data; the EPUB library is read from
# DATA_DIR / 'library' further down.
#
# Disabled alternative: when ANYSCALE_ARTIFACT_STORAGE is set, copy data/ to
# /mnt/cluster_storage/ and read from there instead.
# if os.environ.get("ANYSCALE_ARTIFACT_STORAGE"):
#     DATA_DIR = Path("/mnt/cluster_storage/")
#     shutil.copytree(Path("data/"), DATA_DIR, dirs_exist_ok=True)
# else:
#     DATA_DIR = Path("./data/")

DATA_DIR = Path("./data/")

In [None]:
import os
from ebooklib import epub
from bs4 import BeautifulSoup

def parse_epub_content(epub_file: Path) -> str:
    """Extract the text of every XHTML document in an EPUB file.

    Args:
        epub_file: Location of the EPUB, forwarded as-is to
            ``epub.read_epub`` (callers in this notebook also pass plain
            string paths).

    Returns:
        One string containing the text of each XHTML item, in the order
        ``book.get_items()`` yields them, joined with newlines.
    """
    book = epub.read_epub(epub_file)

    # Keep only HTML/XHTML items (skips images, stylesheets, etc.) and strip
    # the markup from each one.
    sections = [
        BeautifulSoup(item.get_body_content(), 'html.parser').get_text()
        for item in book.get_items()
        if item.media_type == 'application/xhtml+xml'
    ]

    # Join all sections into a single string.
    return '\n'.join(sections)

def parse_epub_directory(directory_path: Path) -> dict:
    """Parse every ``.epub`` file directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A dict mapping each EPUB file name to the text returned by
        ``parse_epub_content``, inserted in sorted file-name order.
    """
    epub_texts = {}

    # os.listdir returns entries in arbitrary order; sort so the result (and
    # anything derived from it) is reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith('.epub'):
            epub_path = os.path.join(directory_path, filename)
            epub_texts[filename] = parse_epub_content(epub_path)

    return epub_texts

In [None]:
# Parse a single book as a quick sanity check. The whole library is parsed
# with parse_epub_directory in the following cell, so the other volumes do not
# need to be loaded one by one here.
book01 = parse_epub_content('data/library/01 - The Eye of the World.epub')

# Show only a short preview: printing an entire novel floods the cell output
# and bloats the saved notebook.
print(book01[:1000])

In [None]:
# The EPUB files are expected under the 'library' subfolder of DATA_DIR.
directory_path = DATA_DIR / 'library'
# Maps each EPUB file name to the full extracted text of that book.
epub_data = parse_epub_directory(directory_path)

### Split books into chunks

In [None]:
# Target chunk length, expressed in tokens.
chunk_size = 128
# Rough tokens-per-word ratio, used to turn the token budget into a word budget.
words_to_tokens = 1.2
chunk_size_in_words = int(chunk_size // words_to_tokens)

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size_in_words,
    chunk_overlap=0,
    # Measure length in whitespace-separated words instead of characters.
    length_function=lambda piece: len(piece.split()),
)

# One record per chunk, tagged with the EPUB file name it came from.
chunks = []

for title, text in epub_data.items():
    chunks.extend(
        {"text": chunk, "book_title": title}
        for chunk in splitter.split_text(text)
    )

# for idx, row in df.iterrows():
#     for chunk in splitter.split_text(row["text"]):
#         chunks.append(
#             {
#                 "text": chunk,
#                 "section_url": row["section_url"],
#                 "page_url": row["page_url"],
#             }
#         )