In [7]:
!pip install faiss-cpu transformers langchain pdfplumber -q
!pip install -U langchain-huggingface -q

In [5]:
import os
import pdfplumber
import faiss
import numpy as np
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline

# Schritt 1: PDF-Dokument 'tsl-10k-report.pdf' extrahieren
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Schritt 2: Text in Chunks aufteilen (für Embeddings)
def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

# Schritt 3: Embeddings erstellen
def create_embeddings(chunks):
    embeddings_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
    embeddings = embeddings_model.embed_documents(chunks)
    return np.array(embeddings), embeddings_model

# Schritt 4: FAISS Vektordatenbank erstellen
def create_faiss_index(embeddings):
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2-Distanz für Ähnlichkeitsvergleich
    index.add(embeddings)
    return index

# Schritt 5: Frage verarbeiten und Antwort finden
def find_answer_in_pdf(question, chunks, index, embeddings_model):
    # Frage als Embedding
    question_embedding = embeddings_model.embed_query(question)

    # Ähnlichste Chunks finden
    D, I = index.search(np.array([question_embedding]), k=3)
    closest_chunks = [chunks[i] for i in I[0]]

    # Gib die abgerufenen Chunks aus
    print("Abgerufene Chunks für die Frage:")
    for i, chunk in enumerate(closest_chunks):
        print(f"Chunk {i+1}:")
        print(chunk)
        print()

    # Combine chunks into a single context
    context = " ".join(closest_chunks)

    return context, closest_chunks

# Schritt 6: Extraktive Antwort mit BERT basierend auf dem Kontext
def generate_answer(question, context):
    # Verwende BERT für Frage-Antwort-Aufgaben
    qa_pipeline = pipeline('question-answering', model='bert-large-uncased-whole-word-masking-finetuned-squad')

    # Frage beantworten mit BERT
    result = qa_pipeline({
        'question': question,
        'context': context
    })

    return result['answer']

# Hauptfunktion zum Ausführen des Projekts
def main(pdf_path, question):
    # 1. Extrahiere Text aus dem PDF
    text = extract_text_from_pdf(pdf_path)
    
    # 2. Text in Chunks aufteilen
    chunks = split_text_into_chunks(text)
    
    # 3. Embeddings erstellen
    embeddings, embeddings_model = create_embeddings(chunks)
    
    # 4. FAISS Index erstellen
    index = create_faiss_index(embeddings)
    
    # 5. Frage beantworten und Chunks anzeigen
    context, closest_chunks = find_answer_in_pdf(question, chunks, index, embeddings_model)
    
    # 6. Antwort generieren basierend auf dem abgerufenen Kontext mit BERT
    answer = generate_answer(question, context)
    
    return answer, closest_chunks

# Setze Umgebungsvariable für Windows Symlink-Cache-Problem
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

# Ausführen
pdf_path = "tsl-10k-report.pdf"  # Pfad zu deinem PDF-Dokument
question = "Who is the CEO of the company?"
answer, retrieved_chunks = main(pdf_path, question)

# Ausgabe der Antwort
print("Antwort auf die Frage:")
print(answer)


Abgerufene Chunks für die Frage:
Chunk 1:
.5 Achieved
$ 35.0 Achieved $ 3.0 Achieved
$ 55.0 Achieved $ 4.5 Achieved
$ 75.0 Achieved $ 6.0 Achieved
$ 100.0 - $ 8.0 Achieved
$ 125.0 - $ 10.0 Achieved
$ 150.0 - $ 12.0 Achieved
$ 175.0 - $ 14.0 Achieved
Stock-based compensation under the 2018 CEO Performance Award represented a non-cash expense and was recorded as a Selling, general, and
administrative operating expense in our consolidated statements of operations. In each quarter since the grant of the 2018 CEO Performance Award, we
had recognized expense, generally on a pro-rated basis, for only the number of tranches (up to the maximum of 12 tranches) that corresponded to the
number of operational milestones that had been achieved or had been determined probable of being achieved in the future, in accordance with the
following principles.
On the grant date, a Monte Carlo simulation was used to determine for each tranche (i) a fixed amount of expense for such tranche and (ii) the
future 

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Antwort auf die Frage:
Elon Musk


In [6]:
import os
import pdfplumber
import faiss
import numpy as np
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import pipeline

# Schritt 1: PDF-Dokument 'tsl-10k-report.pdf' extrahieren
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Schritt 2: Text in Chunks aufteilen (für Embeddings)
def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

# Schritt 3: Embeddings erstellen
def create_embeddings(chunks):
    embeddings_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
    embeddings = embeddings_model.embed_documents(chunks)
    return np.array(embeddings), embeddings_model

# Schritt 4: FAISS Vektordatenbank erstellen
def create_faiss_index(embeddings):
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2-Distanz für Ähnlichkeitsvergleich
    index.add(embeddings)
    return index

# Schritt 5: Frage verarbeiten und Antwort finden
def find_answer_in_pdf(question, chunks, index, embeddings_model):
    # Frage als Embedding
    question_embedding = embeddings_model.embed_query(question)

    # Ähnlichste Chunks finden
    D, I = index.search(np.array([question_embedding]), k=3)
    closest_chunks = [chunks[i] for i in I[0]]

    # Gib die abgerufenen Chunks aus
    print("Abgerufene Chunks für die Frage:")
    for i, chunk in enumerate(closest_chunks):
        print(f"Chunk {i+1}:")
        print(chunk)
        print()

    # Combine chunks into a single context
    context = " ".join(closest_chunks)

    return context, closest_chunks

# Schritt 6: Generative Antwort mit T5 basierend auf dem Kontext
def generate_answer(question, context):
    # Verwende T5 für generative Frage-Antwort-Aufgaben
    generator = pipeline('text2text-generation', model='t5-base')

    # Kombiniere die Frage und den Kontext in einem Prompt
    prompt_template = f"question: {question} context: {context}"

    # Generiere mehrere Antworten (begrenzt auf 200 Tokens für neue Tokens)
    result = generator(prompt_template, max_length=200, num_return_sequences=1)

    return result[0]['generated_text']

# Hauptfunktion zum Ausführen des Projekts
def main(pdf_path, question):
    # 1. Extrahiere Text aus dem PDF
    text = extract_text_from_pdf(pdf_path)
    
    # 2. Text in Chunks aufteilen
    chunks = split_text_into_chunks(text)
    
    # 3. Embeddings erstellen
    embeddings, embeddings_model = create_embeddings(chunks)
    
    # 4. FAISS Index erstellen
    index = create_faiss_index(embeddings)
    
    # 5. Frage beantworten und Chunks anzeigen
    context, closest_chunks = find_answer_in_pdf(question, chunks, index, embeddings_model)
    
    # 6. Antwort generieren basierend auf dem abgerufenen Kontext mit T5
    answer = generate_answer(question, context)
    
    return answer, closest_chunks

# Setze Umgebungsvariable für Windows Symlink-Cache-Problem
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

# Ausführen
pdf_path = "tsl-10k-report.pdf"  # Pfad zu deinem PDF-Dokument
question = "Who is the CEO of the company"
answer, retrieved_chunks = main(pdf_path, question)

# Ausgabe der Antwort
print("Antwort auf die Frage:")
print(answer)


Abgerufene Chunks für die Frage:
Chunk 1:
ect the registrant’s ability to record, process, summarize and report financial information; and
(b) Any fraud, whether or not material, that involves management or other employees who have a significant role in the registrant’sinternal control over financial reporting.
Date: January 26, 2024 /s/ Elon Musk
Elon Musk
Chief Executive Officer
(Principal Executive Officer)
Exhibit 31.2
CERTIFICATIONS
I, Vaibhav Taneja, certify that:
1. I have reviewed this Annual Report on Form 10-K of Tesla, Inc.;
2. Based on my knowledge, this report does not contain any untrue statement of a material fact or omit to state a material fact necessary to make the
statements made, in light of the circumstances under which such statements were made, not misleading with respect to the period covered by this
report;
3. Based on my knowledge, the financial statements, and other financial information included in this report, fairly present in all material respects the
fin

In [41]:
import os
import pdfplumber
import faiss
import numpy as np
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings

# Schritt 1: PDF-Dokument 'tsl-10k-report.pdf' extrahieren
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Schritt 2: Text in Chunks aufteilen (für Embeddings)
def split_text_into_chunks(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

# Schritt 3: Embeddings erstellen
def create_embeddings(chunks):
    embeddings_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
    embeddings = embeddings_model.embed_documents(chunks)
    return np.array(embeddings), embeddings_model

# Schritt 4: FAISS Vektordatenbank erstellen
def create_faiss_index(embeddings):
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2-Distanz für Ähnlichkeitsvergleich
    index.add(embeddings)
    return index

# Schritt 5: Frage verarbeiten und Antwort finden
def find_answer_in_pdf(question, chunks, index, embeddings_model):
    # Frage als Embedding
    question_embedding = embeddings_model.embed_query(question)

    # Ähnlichste Chunks finden
    D, I = index.search(np.array([question_embedding]), k=3)
    closest_chunks = [chunks[i] for i in I[0]]

    # Gib die abgerufenen Chunks aus
    print("Abgerufene Chunks für die Frage:")
    for i, chunk in enumerate(closest_chunks):
        print(f"Chunk {i+1}:")
        print(chunk)
        print()

    # Combine chunks into a single context
    context = " ".join(closest_chunks)

    return context, closest_chunks

# Schritt 7: Generative Antwort mit GPT-J basierend auf dem Kontext
def generate_answer(question, context):
    # Verwende GPT-J durch Hugging Face Transformers
    generator = pipeline('text-generation', model='EleutherAI/gpt-j-6B')

    # Kombiniere Frage und Kontext in einem Prompt
    prompt_template = f"Question: {question}\nContext: {context}\nAnswer:"

    # Generiere die Antwort
    result = generator(prompt_template, max_length=200, temperature=0.7, num_return_sequences=1)
    
    return result[0]['generated_text']

# Hauptfunktion zum Ausführen des Projekts
def main(pdf_path, question):
    # 1. Extrahiere Text aus dem PDF
    text = extract_text_from_pdf(pdf_path)
    
    # 2. Text in Chunks aufteilen
    chunks = split_text_into_chunks(text)
    
    # 3. Embeddings erstellen
    embeddings, embeddings_model = create_embeddings(chunks)
    
    # 4. FAISS Index erstellen
    index = create_faiss_index(embeddings)
    
    # 5. Frage beantworten und Chunks anzeigen
    context, closest_chunks = find_answer_in_pdf(question, chunks, index, embeddings_model)
    
    # 6. Antwort generieren basierend auf dem abgerufenen Kontext mit GPT-J
    answer = generate_answer(question, context)
    
    return answer, closest_chunks

# Ausführen
pdf_path = "tsl-10k-report.pdf"  # Pfad zu deinem PDF-Dokument
question = "Give me three company risks"
answer, retrieved_chunks = main(pdf_path, question)

# Ausgabe der Antwort
print("Antwort auf die Frage:")
print(answer)

Abgerufene Chunks für die Frage:
Chunk 1:
ation with the SEC. The information posted on our website
is not incorporated by reference into this Annual Report on Form 10-K.
ITEM 1A. RISK FACTORS
You should carefully consider the risks described below together with the other information set forth in this report, which could materially affect our
business, financial condition and future results. The risks described below are not the only risks facing our company. Risks and uncertainties not currently
known to us or that we currently deem to be immaterial also may materially adversely affect our business, financial condition and operating results.
Risks Related to Our Ability to Grow Our Business
We may experience delays in launching and ramping the production of our products and features, or we may be unable to control
our manufacturing costs.
We have previously experienced and may in the future experience launch and production ramp delays for new products and features. For example,
we enc

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]


KeyboardInterrupt

