# 1. Artykuły Naukowe

Import potrzebnych bibliotek do załadowania i przetworzenia .pdf

In [16]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

Ładowanie plików PDF, przygotujemy funkcje do ładowania pojedynczego pliku PDF

In [17]:
def load_pdf(article):
    loader = PyPDFLoader(article)
    pages = loader.load()
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        separators=["\n\n", "\n", "", " "]
    )
    docs = text_splitter.split_documents(pages)
    
    return docs

Teraz wczytamy wszystkie nasze pliki PDF

In [18]:
articles = ["f1-article-1.pdf", "f1-article-2.pdf", "f1-article-3.pdf"]

docs = []

for article in articles:
    docs_one_file = load_pdf(article)
    docs.extend(docs_one_file)
    
for idx,doc in enumerate(docs):
  print(f"[{idx}].{doc}\n\n")
    

Ignoring wrong pointing object 19 0 (offset 0)


[0].page_content='J. Quant. Anal. Sports 2023; 19(4): 273–293
ResearchArticle
Erik-Jan van Kesteren* and Tom Bergkamp
BayesiananalysisofFormulaOneraceresults:
disentanglingdriverskillandconstructor
advantage
https://doi.org/10.1515/jqas-2022-0021
Received March 15, 2022; accepted June 2, 2023;' metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'LaTeX with hyperref package + hypdvips', 'creationdate': '2023-10-27T10:40:46+05:30', 'subject': '', 'author': 'Erik-Jan van Kesteren', 'moddate': '2023-10-27T10:42:27+05:30', 'title': 'Bayesian analysis of Formula One race results: disentangling driver skill and constructor advantage', 'source': 'f1-article-1.pdf', 'total_pages': 21, 'page': 0, 'page_label': '273'}


[1].page_content='advantage
https://doi.org/10.1515/jqas-2022-0021
Received March 15, 2022; accepted June 2, 2023;
published online July 25, 2023
Abstract: SuccessfulperformanceinFormulaOneisdeter-
minedbycombinationofboththedriver’sskillandrace-car
constructor 

# 2. Wektorowa Baza Danych i Embeddingi



Import bibliotek potrzebnych do stworzenia Wektorowej Bazy Danych 

In [19]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS

Zainicjujmy teraz nasz model embeddings

In [66]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)

I stwórzmy bazę danych

In [21]:
db = FAISS.from_documents(docs, embeddings)
db.save_local("faiss_index_hf")

# 3. LLM

Import wymaganych bibliotek

In [22]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from langchain.chains import RetrievalQA

In [68]:
model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=1024
)

llm = HuggingFacePipeline(pipeline=pipe)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(),
)


Device set to use cpu


# 4. Generowanie odpowiedzi w pętli

Teraz spróbuję przeprowadzić z modelem rozmowę na temat f1

In [70]:
end_of_conversation = False
while end_of_conversation == False:
    quote = str(input("Wprowadź wiadomość:"))
    print(f"User: {quote}")
    result = qa_chain.invoke(quote)
    print(f"AI Assistant: {result['result']} \n\n")
    if input("Continue conversation?(y/n)") == "y":
        pass
    else:
        end_of_conversation = True
    

User: what is f1?
AI Assistant: a sport of genuine global appeal. Established in 1950, F1 has also grown into a huge business enterprise, with sponsorship and commercialism drawn to the sport by the 527 million 


User: driver or car matter more?
AI Assistant: car 


User: which team won most amount of races?
AI Assistant: Nico Rosberg Mercedes 6 3 Sebastian Vettel Ferrari 2 4 Kimi Räikkönen Ferrari 4 5 Valtteri Bottas Williams 12 6 Felipe Massa Williams 13 7 Daniil Kvyat Red Bull 9 8 Daniel Ricciardo Red Bull 7 9 Sergio Pérez Force India 16 10 Nico Hülkenberg Force India 




Model poradził sobie dobrze z zadanymi pytaniami, w 3 troche dziwnie wyrzucił ale to pewnie przez dane w pdf a nie jego więc git

# 5. Brak Cyberpsychozy

Dodamy teraz obsługę żeby model wyrzucał nie wiem a nie dostawał cyberpsychozy

In [71]:
from langchain import PromptTemplate

In [72]:
template = """If the context does not contain the answer to the question, reply only with "I don't know".
Do not try to guess or copy text.

Context: {context}

Question: {question}

"""

prompt_template = PromptTemplate(template=template, input_variables=['context', 'question'])

In [73]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": prompt_template}
)

In [75]:
end_of_conversation = False
while end_of_conversation == False:
    quote = str(input("Wprowadź wiadomość:"))
    print(f"User: {quote}")
    result = qa_chain.invoke(quote)
    print(f"AI Assistant: {result['result']} \n\n")
    if input("Continue conversation?(y/n)") == "y":
        pass
    else:
        end_of_conversation = True

User: which driver won wdc in 2024?
AI Assistant: I don't know 




Udało się dodać obsługę cyberpsychozy, więc jest git na więcej już nie mam czasu bo mnie modele pokonały bo mi 90% nie dzialalo poprawnie xd