In [9]:
import json
import os
import re
import time

import chromadb
import pandas as pd
import spacy
from docx import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from sentence_transformers import SentenceTransformer

class Chat:
    """Retrieval-augmented question answering over a single .docx document.

    On construction the document is read, split into overlapping character
    chunks, embedded with a SentenceTransformer model and stored in a
    persistent Chroma collection. ``run`` then embeds a question, retrieves
    the closest chunks and asks an OpenAI chat model to answer from them.
    """

    def __init__(self, file_path, api_key=None, device="cuda:0"):
        """Index ``file_path`` and prepare the OpenAI client.

        Args:
            file_path: Path of the .docx file to index.
            api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
                environment variable is used instead.
            device: Device string handed to SentenceTransformer. Defaults to
                ``"cuda:0"``; pass e.g. ``"cpu"`` on a machine without a GPU.

        Raises:
            ValueError: If no API key is available, or if the document
                produces no text chunks to index.
        """
        # SECURITY: never hardcode credentials in a notebook. A key that has
        # been committed in plain text must be treated as leaked and revoked.
        api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not api_key:
            # Fail before the expensive read/embed/index work below.
            raise ValueError(
                "No OpenAI API key: pass api_key=... or set the "
                "OPENAI_API_KEY environment variable."
            )
        self.client = OpenAI(api_key=api_key)

        self.model_id = "sentence-transformers/paraphrase-MiniLM-L3-v2"
        self.device = device
        # Not read anywhere in this class; kept because external code may use it.
        self.dim = 384
        self.file_path = file_path

        full_text = self.read_docx(self.file_path)
        splitted_txt = self.splitter(full_text)
        if not splitted_txt:
            raise ValueError(f"No text could be extracted from {file_path!r}.")

        self.model = self._encode()
        encoded_text = self.model.encode(splitted_txt, show_progress_bar=True).tolist()
        ids = [str(i) for i in range(len(encoded_text))]

        chroma_client = chromadb.PersistentClient(path="./cto-demo_1")
        # The store is persistent, so on any run after the first the "book"
        # collection already exists; create_collection would raise.
        self.collection = chroma_client.get_or_create_collection(
            name="book",
            metadata={"hnsw:space": "cosine"}
        )
        # Drop whatever a previous run left behind so the collection always
        # mirrors the document given to this instance.
        stale_ids = self.collection.get()["ids"]
        if stale_ids:
            self.collection.delete(ids=stale_ids)
        self.collection.add(
            documents=splitted_txt,
            embeddings=encoded_text,
            ids=ids
        )

        self.system = """
                I'll provide you with a JSON object that contains a question and the context related to it:
                {"question": the question, "context": the context}
                Please generate the answer of the provided question based on the context above.
                """

    def run(self, question, n_results=3):
        """Answer ``question`` from the indexed document.

        Args:
            question: The question text.
            n_results: How many of the closest chunks to use as context.

        Returns:
            The text content of the chat model's first choice.
        """
        question_embed = self.model.encode(question)
        results = self.collection.query(
            query_embeddings=question_embed.tolist(),
            n_results=n_results,
        )
        # 'documents' holds one list of chunks per query; there is one query.
        top_paragraph = ' '.join(results['documents'][0])
        # The system prompt promises a JSON object, so serialise properly:
        # quotes and newlines inside the question or context get escaped.
        prompt = json.dumps(
            {"question": question, "context": top_paragraph},
            ensure_ascii=False,
        )

        return self._get_open_ai_answer(prompt)

    def read_docx(self, file_path):
        """Return the text of every paragraph in the .docx, joined by newlines."""
        doc = Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)

    def splitter(self, txt, chunk_size=1000, chunk_overlap=200):
        """Split ``txt`` into overlapping chunks measured in characters.

        Args:
            txt: The text to split.
            chunk_size: Maximum chunk length, in characters.
            chunk_overlap: Number of characters shared by neighbouring chunks.

        Returns:
            The list of chunk strings from RecursiveCharacterTextSplitter.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
        )

        return text_splitter.split_text(txt)

    def brain_dataset(self, text, threshold=100, dataset_name="dataset"):
        """Turn raw text into a one-column paragraph dataset and write it to CSV.

        The text is stripped of every character except ASCII letters, digits,
        ``.``, ``%`` and whitespace, split on newlines, de-duplicated (first
        occurrence wins, order preserved) and blank lines are dropped.
        Paragraphs with more than ``threshold`` whitespace-separated words are
        replaced by their spaCy sentences, and the named entities of those
        long paragraphs are collected.

        Args:
            text: Raw input text.
            threshold: Word count above which a paragraph is split into sentences.
            dataset_name: Output path without extension; ``.csv`` is appended.

        Returns:
            A tuple ``(dataset, ner_results)``: the DataFrame that was written
            (column ``paragraph_info``) and a list of ``(entity_text, label)``
            pairs from every long paragraph (empty if there was none).
        """
        cleaned_text = re.sub(r'[^a-zA-Z0-9.%\s]', '', text)
        # dict.fromkeys de-duplicates like set() but keeps a stable order.
        paragraphs = [
            para for para in dict.fromkeys(cleaned_text.split("\n"))
            if para.strip() != ""
        ]

        nlp = None  # loaded lazily: only needed when a long paragraph shows up
        info_list = []
        ner_results = []
        for paragraph in paragraphs:
            if len(paragraph.split()) > threshold:
                if nlp is None:
                    nlp = spacy.load('en_core_web_sm')
                doc = nlp(paragraph)
                info_list.extend(sentence.text for sentence in doc.sents)
                ner_results.extend((ent.text, ent.label_) for ent in doc.ents)
            else:
                info_list.append(paragraph)

        df = pd.DataFrame(info_list, columns=['paragraph_info'])
        # to_csv returns None when given a path, so return the frame itself.
        df.to_csv(f"{dataset_name}.csv", index=False)

        return df, ner_results

    def _encode(self):
        """Load and return the SentenceTransformer model on ``self.device``."""
        return SentenceTransformer(self.model_id, device=self.device)

    def _get_open_ai_answer(self, prompt):
        """Send the system prompt plus ``prompt`` to the chat model.

        Returns:
            The message content of the first returned choice.
        """
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo-1106",
            temperature=0,
            messages=[
                {"role": "system", "content": self.system},
                {"role": "user", "content": prompt}
            ]
        )

        return response.choices[0].message.content

    # Backward-compatible alias for the original, misspelled method name.
    _get_apen_ai_answer = _get_open_ai_answer


In [10]:
# Build the assistant for the book: the constructor reads the .docx, splits it
# into chunks, embeds them (progress bar below) and stores them in Chroma.
answer_me = Chat("Master Machine Learning Algorithms - Discover how they work by Jason Brownlee (z-lib.org).docx")

Batches: 100%|██████████| 12/12 [00:00<00:00, 26.93it/s]


In [12]:
# Ask a question: run() embeds it, retrieves the closest chunks from the
# collection and has the chat model answer from that context.
x = answer_me.run("what is the main topic of the book?")
# Bare expression on the last line so the notebook displays the answer.
x

'The main topic of the book is to teach machine learning algorithms from scratch, specifically focusing on the type of machine learning where models are built to make predictions on new data, known as predictive modeling. The book is intended for developers and does not assume a background in statistics, probability, linear algebra, or machine learning. It is recommended to read the book linearly from start to finish, working through the tutorials provided to gain a practical understanding of the concepts and algorithms described.'