In [43]:
import boto3
import openai
import os
import json
import psutil
import ray
import shutil
import tempfile

import pandas as pd

from bs4 import BeautifulSoup
from ebooklib import epub
from io import BytesIO
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Basic LLM example

In [None]:
# Step 1: generate a serve config.
# This command is meant to be run interactively in a terminal, not in this
# notebook (as bare text in a code cell it is a Python SyntaxError), so it is
# kept here as an instruction only:
#
#   rayllm gen-config

In [None]:
# Deploy the serve app to production with a given service name.
# Reference the serve file created in step 1
# (`-f` points at that YAML file; here it is serve_mistral_7b.yaml).
!anyscale service deploy -f serve_mistral_7b.yaml

In [None]:
def prompt_llm(
    service_url: str,
    prompt: str,
    model: str = "mistralai/Mistral-7B-Instruct-v0.1",
    temperature: float = 0,
    **kwargs,
):
    """Send a single user prompt to a chat-completions endpoint and return the reply.

    Args:
        service_url: Base URL of the deployed service. A trailing "/" is
            appended when missing, and "v1" is added after it to form the
            client's base URL.
        prompt: Text sent as the "user" message.
        model: Model identifier passed to the chat completions call.
        temperature: Sampling temperature passed to the chat completions call.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.chat.completions.create``.

    Returns:
        The object returned by ``client.chat.completions.create``; the reply
        text is read elsewhere in this notebook via
        ``.choices[0].message.content``.

    Raises:
        ValueError: If ``service_url`` contains "/routes".
    """
    # Reject URLs that include a "/routes" segment before doing anything else.
    if "/routes" in service_url:
        raise ValueError("service_url must end with '.com'")

    # Ensure the URL has a trailing slash so "v1" can be appended directly.
    if not service_url.endswith("/"):
        service_url += "/"

    # Initialize a client to perform API requests.
    # The base URL is built from the `service_url` argument; previously an
    # undefined global was used here and the argument was silently ignored.
    # NOTE(review): ANYSCALE_API_KEY must be defined at module level before
    # this function is called; it is not defined in the cells visible here.
    client = openai.OpenAI(
        base_url=service_url + "v1",
        api_key=ANYSCALE_API_KEY,
    )

    # Call the chat completions endpoint
    chat_completion = client.chat.completions.create(
        model=model,
        messages=[
            # Prime the system with a system message - a common best practice
            {"role": "system", "content": "You are a helpful assistant."},
            # Send the user message with the proper "user" role and "content"
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        **kwargs,
    )

    return chat_completion

In [None]:
service_url = "https://wheel-of-time-rag-3136s.cld-8lqvbtr41isy21zu.s.anyscaleuserdata.com/"
prompt = "Tell me something about Wheel of Time"
# Pass the `service_url` variable itself; the literal string "service_url"
# is not a URL and would be used verbatim as the endpoint.
response = prompt_llm(service_url, prompt)
print(response.choices[0].message.content)

## Create RAG

### Download epubs from S3

In [None]:
import boto3
import os

def download_s3_bucket(bucket_name, local_dir='/data'):
    """Download every object of an S3 bucket into a local directory.

    Object keys are reproduced as relative paths below ``local_dir``, and any
    missing intermediate directories are created.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        local_dir: Destination directory (string or path-like); created when
            it does not exist.

    Returns:
        None. Files are written to disk and one line is printed per download.
    """
    # Create an S3 client
    s3 = boto3.client('s3')

    # Ensure local directory exists (exist_ok avoids a check-then-create race)
    os.makedirs(local_dir, exist_ok=True)

    # A single list_objects_v2 call returns at most 1000 keys, so page through
    # the listing instead of reading only the first response.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        # An empty bucket (or empty page) has no 'Contents' entry at all.
        for obj in page.get('Contents', []):
            s3_key = obj['Key']

            # Keys ending in "/" are folder placeholders, not files; trying to
            # download them would target a directory path.
            if s3_key.endswith('/'):
                continue

            local_path = os.path.join(local_dir, s3_key)

            # Create local directory structure if needed
            parent_dir = os.path.dirname(local_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Download the file
            s3.download_file(bucket_name, s3_key, local_path)
            print(f"Downloaded {s3_key} to {local_path}")

# Usage
# Source bucket and the local directory that receives its contents.
BUCKET_NAME = 'rag-wheel-of-time'
LOCAL_DATA_DIR = Path("./data/")

# Mirror the bucket into the local data directory.
download_s3_bucket(bucket_name=BUCKET_NAME, local_dir=LOCAL_DATA_DIR)

### Parse epubs to text

In [None]:
def parse_epub_content(epub_file: Path) -> str:
    """Extract the text of an EPUB file as one string.

    NOTE(review): a later cell in this notebook defines another function with
    this same name and a different input; whichever cell ran last wins.

    Args:
        epub_file: Location of the EPUB file, handed to ``epub.read_epub``
            (this notebook also calls it with plain path strings).

    Returns:
        The text of every item whose media type is
        ``application/xhtml+xml``, in the order ``book.get_items()`` yields
        them, joined with newlines. The original annotation said ``list``,
        but a single string has always been returned.
    """
    book = epub.read_epub(epub_file)

    # Only HTML/XHTML items carry readable text; other items are skipped.
    sections = [
        BeautifulSoup(item.get_body_content(), 'html.parser').get_text()
        for item in book.get_items()
        if item.media_type == 'application/xhtml+xml'
    ]

    # Join all sections into a single string
    return '\n'.join(sections)

def parse_epub_directory(directory_path: Path) -> dict:
    """Parse each ``.epub`` file found directly inside a directory.

    Args:
        directory_path: Directory to scan; entries are taken from
            ``os.listdir`` and only names ending in ``.epub`` are parsed.

    Returns:
        A dict mapping each matching file name to whatever
        ``parse_epub_content`` returns for that file.
    """
    return {
        entry: parse_epub_content(os.path.join(directory_path, entry))
        for entry in os.listdir(directory_path)
        if entry.endswith('.epub')
    }

In [None]:
# Smoke test on a single book. The other titles stay commented out here; the
# whole library is parsed in bulk with parse_epub_directory.
# book00 = parse_epub_content('data/library/00 - New Spring.epub')
book01 = parse_epub_content('data/library/01 - The Eye of the World.epub')
# book02 = parse_epub_content('data/library/02 - The Great Hunt.epub')
# book03 = parse_epub_content('data/library/03 - The Dragon Reborn.epub')
# book04 = parse_epub_content('data/library/04 - The Shadow Rising.epub')
# book05 = parse_epub_content('data/library/05 - The Fires of Heaven.epub')
# book06 = parse_epub_content('data/library/06 - Lord of Chaos.epub')
# book07 = parse_epub_content('data/library/07 - A Crown of Swords.epub')
# book08 = parse_epub_content('data/library/08 - The Path of Daggers.epub')
# book09 = parse_epub_content('data/library/09 - Winter\'s Heart.epub')
# book10 = parse_epub_content('data/library/10 - Crossroads of Twilight.epub')
# book11 = parse_epub_content('data/library/11 - Knife of Dreams.epub')
# book12 = parse_epub_content('data/library/12 - The Gathering Storm.epub')
# book13 = parse_epub_content('data/library/13 - Towers of Midnight.epub')
# book14 = parse_epub_content('data/library/14 - A Memory of Light.epub')

# Show only a preview: printing the full text of a book would flood the
# notebook output.
print(book01[:1000])

In [None]:
# Parse every EPUB in the "library" folder of the local data directory.
directory_path = LOCAL_DATA_DIR.joinpath('library')
epub_data = parse_epub_directory(directory_path=directory_path)

### Split books into chunks

In [None]:
# Chunk budget, expressed in tokens.
chunk_size = 128
# Heuristic ratio used to turn the token budget into a word budget
# (the token count is divided by this value).
words_to_tokens = 1.2
chunk_size_in_words = int(chunk_size // words_to_tokens)

# Chunk length is measured in whitespace-separated words, with no overlap
# between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=chunk_size_in_words,
    length_function=lambda piece: len(piece.split()),
)

# One record per chunk, tagged with the book it came from.
chunks = []
for title, text in epub_data.items():
    chunks.extend(
        {"text": chunk, "book_title": title}
        for chunk in splitter.split_text(text)
    )

In [None]:
# Show the first ten chunk records as a sanity check.
print(chunks[:10])

### Generate embeddings from chunks

In [None]:
# Load the 'thenlper/gte-large' sentence-embedding model, explicitly on CPU.
model = SentenceTransformer('thenlper/gte-large', device='cpu')

In [None]:
# Read the EPUB files straight from the S3 bucket as binary blobs instead of
# listing the local copy under the data directory.
books = ray.data.read_binary_files(paths="s3://rag-wheel-of-time/")

2024-10-31 13:49:10,201	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-10-31_12-48-01_891017_2361/logs/ray-data
2024-10-31 13:49:10,201	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ExpandPaths] -> TaskPoolMapOperator[ReadFiles]


Running 0: 0.00 row [00:00, ? row/s]

- ExpandPaths 1: 0.00 row [00:00, ? row/s]

- ReadFiles 2: 0.00 row [00:00, ? row/s]

15

In [25]:
# Count the rows in the dataset (the recorded output of this cell is 15).
books.count()

2024-10-31 14:46:20,348	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-10-31_12-48-01_891017_2361/logs/ray-data
2024-10-31 14:46:20,348	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ExpandPaths] -> TaskPoolMapOperator[ReadFiles]


Running 0: 0.00 row [00:00, ? row/s]

- ExpandPaths 1: 0.00 row [00:00, ? row/s]

- ReadFiles 2: 0.00 row [00:00, ? row/s]

15

In [16]:
# Debugging aid: turn on the Ray Data context flag that logs internal stack
# traces to stdout.
ray.data.DataContext.get_current().log_internal_stack_trace_to_stdout = True

In [None]:
def parse_epub_content(epub_bytes) -> dict:
    """Extract the text of an EPUB whose raw bytes are held in a row.

    This is applied row by row with ``books.map`` in this notebook.

    NOTE(review): this reuses the name of the path-based
    ``parse_epub_content`` defined in an earlier cell and replaces it once
    this cell has run.

    Args:
        epub_bytes: Mapping holding the raw EPUB file content under the
            ``'bytes'`` key (despite the name, this is not a bytes object).

    Returns:
        A dict with the single key ``'book_content'``: the text of every item
        whose media type is ``application/xhtml+xml``, joined with newlines.
        The original annotation said ``str``, but a dict has always been
        returned.
    """
    # read_epub is given a file name here, so the in-memory bytes are written
    # to a temporary file that is removed when the `with` block exits.
    with tempfile.NamedTemporaryFile(delete=True, suffix='.epub') as temp_file:
        temp_file.write(epub_bytes['bytes'])  # Write bytes to the temporary file
        temp_file.flush()  # Ensure all data is on disk before it is re-read by name

        book = epub.read_epub(temp_file.name)  # Read the temporary file

        # Only HTML/XHTML items carry readable text; other items are skipped.
        sections = [
            BeautifulSoup(item.get_body_content(), 'html.parser').get_text()
            for item in book.get_items()
            if item.media_type == 'application/xhtml+xml'
        ]

    # Join all sections and wrap them in a single-key row
    return {'book_content': '\n'.join(sections)}

# Apply the parser to each row of the dataset, then pull two parsed rows back
# for inspection (the logged execution plan shows a limit=2 operator).
books_parsed = books.map(parse_epub_content)
result = books_parsed.take(2)

2024-10-31 15:21:13,820	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-10-31_12-48-01_891017_2361/logs/ray-data
2024-10-31 15:21:13,820	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ExpandPaths] -> TaskPoolMapOperator[ReadFiles] -> TaskPoolMapOperator[Map(parse_epub_content)] -> LimitOperator[limit=2]


Running 0: 0.00 row [00:00, ? row/s]

- ExpandPaths 1: 0.00 row [00:00, ? row/s]

- ReadFiles 2: 0.00 row [00:00, ? row/s]

- Map(parse_epub_content) 3: 0.00 row [00:00, ? row/s]

- limit=2 4: 0.00 row [00:00, ? row/s]

In [52]:
# Preview the first 5000 characters of the second parsed book.
print(result[1]['book_content'][:5000])








 
“The Eye of the World is the best of its genre.”
—The Ottawa Citizen
 
“A splendid tale of heroic fantasy, vast in scope, colorful in detail, and convincing in its presentation of human character and personality.”
—L. Sprague De Camp
 
“This richly detailed fantasy presents fully realized, complex adventure. Recommended.”
—Library Journal
 
“This one is as solid as a steel blade and glowing with the true magic. Robert Jordan deserves congratulations.”
—Fred Saberhagen
 
“One hell of a story. [It] kept me up past my bedtime for three nights running—and it’s been a long time since a novel’s done that.”
—Baird Searles,Isaac Asimov’s Science Fiction Magazine
 
“A future collector’s item. Jordan has brought out a completely new allegory in a fantasy concept that goes even beyond this massive story, working with an artist’s eye and the sense of responsibility of a serious historian.”
—Gordon R. Dickson
 
“Classic oppositions are brought into play: initiates versus innocents, good ve