# **PDF laden**

In [None]:
import os
import time

import torch
from google.colab import drive
from langchain.document_loaders import PyPDFLoader
from transformers import AutoModel

def pdf_loading(file_path='/content/drive/MyDrive/yourfile.pdf', mount_point='/content/drive'):
  """Mount Google Drive and create a PDF loader for a file stored on it.

  Args:
    file_path: Location of the PDF to load. The default is the placeholder
      path from the original notebook; point it at the real document.
    mount_point: Directory at which Google Drive gets mounted.

  Returns:
    A PyPDFLoader constructed for ``file_path``.
  """
  drive.mount(mount_point)
  pdf_loader = PyPDFLoader(file_path)  # loader object; splitting happens in a later cell
  return pdf_loader

pdf_loader = pdf_loading()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **PDF tokenisieren und in Chunks (Textabschnitte) aufteilen**

In [None]:
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tokenizing_chunking(pdf_loader, model_name='thenlper/gte-small', chunk_size=400, chunk_overlap=20):
  """Split the PDF behind ``pdf_loader`` into token-bounded chunks.

  Args:
    pdf_loader: Loader object exposing ``load_and_split``.
    model_name: Hugging Face identifier of the tokenizer used to measure
      chunk length. Should match the embedding model used downstream.
    chunk_size: Upper bound for the length of one chunk, counted in tokens.
    chunk_overlap: Number of tokens shared between neighbouring chunks.

  Returns:
    Whatever ``pdf_loader.load_and_split`` returns for the configured
    splitter (a list of documents in this notebook).
  """
  gte_tokenizer = AutoTokenizer.from_pretrained(model_name)  # load the tokenizer
  # Build a splitter that measures length with the tokenizer above.
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=gte_tokenizer,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
  )
  return pdf_loader.load_and_split(text_splitter)

chunks = tokenizing_chunking(pdf_loader)

In [None]:
print(type(chunks)) # The chunks are held in a list
print(len(chunks)) # Number of chunks produced by the splitter

<class 'list'>
402


In [None]:
# Example: display the first chunk
chunks[0]

Document(metadata={'source': '/content/drive/MyDrive/Meine Dateien/tsl-10k-report.pdf', 'page': 0}, page_content='UNITED\tSTATES\nSECURITIES\tAND\tEXCHANGE\tCOMMISSION\nWashington,\tD.C.\t20549\nFORM\t\n10-K\n(Mark\tOne)\nx\nANNUAL\tREPORT\tPURSUANT\tTO\tSECTION\t13\tOR\t15(d)\tOF\tTHE\tSECURITIES\tEXCHANGE\tACT\tOF\t1934\nFor\tthe\tfiscal\tyear\tended\t\nDecember\t31\n,\t2023\nOR\no\nTRANSITION\tREPORT\tPURSUANT\tTO\tSECTION\t13\tOR\t15(d)\tOF\tTHE\tSECURITIES\tEXCHANGE\tACT\tOF\t1934\nFor\tthe\ttransition\tperiod\tfrom\t_________\tto\t_________\nCommission\tFile\tNumber:\t\n001-34756\nTesla,\tInc.\n(Exact\tname\tof\tregistrant\tas\tspecified\tin\tits\tcharter)\nDelaware\n91-2197729\n(State\tor\tother\tjurisdiction\tof\nincorporation\tor\torganization)\n(I.R.S.\tEmployer\nIdentification\tNo.)\n1\tTesla\tRoad\nAustin\n,\t\nTexas\n78725\n(Address\tof\tprincipal\texecutive\toffices)\n(Zip\tCode)\n(\n512\n)\t\n516-8177\n(Registrant’s\ttelephone\tnumber,\tincluding\tarea\tcode)\nSecurities

# **Embedding Modell laden und Embeddings erstellen**

In [None]:
# Embedding model: tokenizer and model are both loaded under the same model name
model_name = 'thenlper/gte-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
def calculate_embeddings(chunks, tokenizer, model, max_length=512):
  """Embed one text or a batch of texts by mean-pooling the token states.

  Args:
    chunks: A single string or a list of strings to embed.
    tokenizer: Callable tokenizer returning a mapping that contains an
      ``attention_mask`` entry (Hugging Face tokenizers do).
    model: Callable model whose output exposes ``last_hidden_state``
      (assumed to be laid out as batch x tokens x hidden).
    max_length: Inputs longer than this many tokens are truncated so the
      model's sequence limit is not exceeded.

  Returns:
    A tensor with one pooled embedding per input text.
  """
  # Padding lets texts of different length share one batch; truncation keeps
  # over-long inputs from exceeding the sequence limit.
  inputs = tokenizer(
    chunks,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=max_length,
  )
  with torch.no_grad():  # inference only, no gradient tracking needed
    outputs = model(**inputs)

  hidden_states = outputs.last_hidden_state
  # Average only over real tokens: padded positions are zeroed out and
  # excluded from the divisor, so padding does not dilute the embedding.
  mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
  token_counts = mask.sum(dim=1).clamp(min=1)
  return (hidden_states * mask).sum(dim=1) / token_counts

# **Vektordatenbank erstellen und Embeddings für Chunks einfügen**

In [None]:
import pinecone
from langchain.vectorstores import Pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

In [None]:
# Create the vector database index
# Pinecone API key: read from the PINECONE_API_KEY environment variable when
# it is set; the placeholder below is only the fallback.
api_key = os.environ.get('PINECONE_API_KEY', 'Your-API-Key')
index_name = 'tsl'

def create_vector_index(api_key, index_name, dimension=384, metric='cosine', cloud="aws", region="us-east-1"):
    """Create a serverless Pinecone index unless one with that name exists.

    Args:
        api_key: Pinecone API key.
        index_name: Name of the index to create or reuse.
        dimension: Vector size of the index; it has to equal the size of the
            embeddings that will be upserted.
        metric: Similarity metric of the index.
        cloud: Cloud provider passed to ``ServerlessSpec``.
        region: Cloud region passed to ``ServerlessSpec``.

    Returns:
        The Pinecone client object.
    """
    pc = Pinecone(api_key=api_key)  # create the Pinecone client

    # Re-running the notebook must not fail just because the index is
    # already there, so creation is skipped in that case.
    if index_name in pc.list_indexes().names():
        print(f"Index '{index_name}' existiert bereits und wird wiederverwendet.")
        return pc

    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(
            cloud=cloud,
            region=region
        )
    )
    print(f"Index '{index_name}' wurde erfolgreich erstellt.")
    return pc

pc = create_vector_index(api_key, index_name)

Index 'tsl' wurde erfolgreich erstellt.


In [None]:
# Create embeddings for the chunks
def get_embeddings_from_chunks(chunks, tokenizer, model):
  """Embed every chunk's text and return the results as one NumPy array.

  Each chunk's ``page_content`` is passed on its own to the embedding
  function defined earlier in the notebook; the per-chunk tensors are then
  concatenated along the first axis and converted to NumPy.
  """
  per_chunk = [
    calculate_embeddings(document.page_content, tokenizer, model)
    for document in chunks
  ]
  stacked = torch.cat(per_chunk, dim=0)  # join all embeddings into a single tensor
  return stacked.cpu().numpy()

embeddings = get_embeddings_from_chunks(chunks, tokenizer, model) # run the function
print(f"Berechnete Embeddings für {len(embeddings)} Chunks.")

Berechnete Embeddings für 402 Chunks.


In [None]:
# Upsert the embeddings into Pinecone
def pinecone_upsert(pc, index_name, embeddings, namespace="your-namespace", batch_size=100):
    """Write the embeddings into a Pinecone index in fixed-size batches.

    Args:
        pc: Pinecone client exposing ``describe_index`` and ``Index``.
        index_name: Name of the target index.
        embeddings: Iterable of vectors; every item must offer ``tolist()``.
        namespace: Namespace the vectors are written to.
        batch_size: Maximum number of vectors sent per upsert request, so a
            large document does not end up in one oversized request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Poll until the index reports that it is ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

    index = pc.Index(index_name)

    # Build the records for the upsert; the id encodes the position of the
    # chunk so that query results can be mapped back to the chunk list.
    vectors = [
        {"id": f"doc_{i}", "values": embedding.tolist()} for i, embedding in enumerate(embeddings)
    ]

    # Send the records batch by batch; nothing is sent for empty input.
    for start in range(0, len(vectors), batch_size):
        index.upsert(
            vectors=vectors[start:start + batch_size],
            namespace=namespace
        )

    print(f"Erfolgreich {len(vectors)} Vektoren in Pinecone eingefügt.")

# Upload the chunk embeddings, using the function's default namespace
pinecone_upsert(pc, index_name, embeddings)

Erfolgreich 402 Vektoren in Pinecone eingefügt.


# **Spezifische Datenbank-Abfrage**

In [None]:
def query_pinecone_and_retrieve_chunks(query, tokenizer, model, index, chunks, top_k=5, namespace="your-namespace"):
    """Find the chunks most similar to ``query`` and return their cleaned text.

    Args:
        query: Question text to search for.
        tokenizer: Tokenizer handed to the embedding function.
        model: Model handed to the embedding function.
        index: Pinecone index handle exposing ``query``.
        chunks: The chunk list the index was built from; match ids are
            resolved against it by position.
        top_k: Number of most similar records to request.
        namespace: Namespace to search in.

    Returns:
        List of matched chunk texts with all whitespace runs collapsed to
        single spaces, in the order the matches were returned. Each text is
        also printed.
    """
    query_embedding = calculate_embeddings(query, tokenizer, model)

    query_results = index.query(
        vector=query_embedding.cpu().numpy().tolist()[0],  # first (only) row as a plain list
        top_k=top_k,
        include_values=False,  # only the ids are used below; skip the vector payload
        namespace=namespace
    )

    cleaned_chunks = []
    for position, match in enumerate(query_results['matches'], start=1):
        # Ids have the form "doc_<chunk_index>" (see the upsert step).
        chunk_index = int(match['id'].split("_")[1])
        # Collapse tabs/newlines from PDF extraction into single spaces.
        cleaned_chunk = ' '.join(chunks[chunk_index].page_content.split())
        cleaned_chunks.append(cleaned_chunk)
        print(f"Chunk {position}:")
        print(cleaned_chunk)
        print("-" * 50)

    return cleaned_chunks

query = "What was Tesla's total revenues in 2023?"
# `index` is only a local variable inside pinecone_upsert, so a handle to the
# index has to be obtained here before it can be queried.
index = pc.Index(index_name)
cleaned_chunks = query_pinecone_and_retrieve_chunks(query, tokenizer, model, index, chunks, top_k=5, namespace="your-namespace")


Chunk 1:
Tesla, Inc. Consolidated Statements of Operations (in millions, except per share data) Year Ended December 31, 2023 2022 2021 Revenues Automotive sales $ 78,509 $ 67,210 $ 44,125 Automotive regulatory credits 1,790 1,776 1,465 Automotive leasing 2,120 2,476 1,642 Total automotive revenues 82,419 71,462 47,232 Energy generation and storage 6,035 3,909 2,789 Services and other 8,319 6,091 3,802 Total revenues 96,773 81,462 53,823 Cost of revenues Automotive sales 65,121 49,599
--------------------------------------------------
Chunk 2:
Tesla, Inc. Consolidated Statements of Cash Flows (in millions) Year Ended December 31, 2023 2022 2021 Cash Flows from Operating Activities Net income $ 14,974 $ 12,587 $ 5,644 Adjustments to reconcile net income to net cash provided by operating activities: Depreciation, amortization and impairment 4,667 3,747 2,911 Stock-based compensation 1,812 1,560 2,121 Inventory and purchase commitments write-downs 463 177 140 Foreign currency transaction n

# **Textgenerierung mit GPT-3.5**

In [None]:
def create_prompt(query, context_chunks):
    """Assemble the LLM prompt from the question and its context chunks.

    The prompt starts with the question, lists every chunk under a numbered
    "Context Chunk" heading (numbering starts at 1) and ends with the
    instruction to answer based on that context.
    """
    header = f"Question: {query}\n\nContext:\n"
    sections = [
        f"Context Chunk {number}:\n{text}\n\n"
        for number, text in enumerate(context_chunks, start=1)
    ]
    instruction = "Based on the provided context, generate a detailed answer for the question."
    return header + "".join(sections) + instruction

# cleaned_chunks already holds the whitespace-normalised chunk texts returned
# by query_pinecone_and_retrieve_chunks; clean_chunk and matched_chunks exist
# only inside that function and cannot be referenced here.
prompt = create_prompt(query, cleaned_chunks)
print(prompt)

Question: What was Tesla's total revenues in 2023?

Context:
Context Chunk 1:
Tesla, Inc. Consolidated Statements of Operations (in millions, except per share data) Year Ended December 31, 2023 2022 2021 Revenues Automotive sales $ 78,509 $ 67,210 $ 44,125 Automotive regulatory credits 1,790 1,776 1,465 Automotive leasing 2,120 2,476 1,642 Total automotive revenues 82,419 71,462 47,232 Energy generation and storage 6,035 3,909 2,789 Services and other 8,319 6,091 3,802 Total revenues 96,773 81,462 53,823 Cost of revenues Automotive sales 65,121 49,599

Context Chunk 2:
Tesla, Inc. Consolidated Statements of Cash Flows (in millions) Year Ended December 31, 2023 2022 2021 Cash Flows from Operating Activities Net income $ 14,974 $ 12,587 $ 5,644 Adjustments to reconcile net income to net cash provided by operating activities: Depreciation, amortization and impairment 4,667 3,747 2,911 Stock-based compensation 1,812 1,560 2,121 Inventory and purchase commitments write-downs 463 177 140 For

In [None]:
from openai import OpenAI
# The API key is read from the OPENAI_API_KEY environment variable when it is
# set; the placeholder is only the fallback.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', 'Your-API-Key'))

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",  # or "gpt-4"
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ],
    max_tokens=300,  # maximum length of the answer
    temperature=0.7  # controls how creative the answer is
)

print(completion.choices[0].message.content)

Tesla's total revenues in 2023 were $96,773 million. This amount includes revenue from automotive sales, automotive regulatory credits, automotive leasing, energy generation and storage, as well as services and other sources. The breakdown of total revenues is as follows:

- Automotive sales: $78,509 million
- Automotive regulatory credits: $1,790 million
- Automotive leasing: $2,120 million
- Energy generation and storage: $6,035 million
- Services and other: $8,319 million

These figures sum up to the total revenue of $96,773 million for Tesla in 2023.
