First things first, let's import the FAQ data from Zoomcamp. Interestingly, the data is large (~300 pages).

In [1]:
import requests

docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"
# A timeout keeps the cell from hanging indefinitely if the server does not answer.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of documents, tagging
# each document with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)

The documents are basically a list of questions and answers that have not been vectorized yet (i.e., not yet turned into points in a vector "space").

In [2]:
import pandas as pd

# Tabular view of the flattened documents, restricted to the four listed columns.
df = pd.DataFrame(documents, columns=["course", "section", "question", "text"])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


So, a simple way to vectorize the text is to use the count vectorizer and the TF-IDF vectorizer (both from scikit-learn).

Term Frequency-Inverse Document Frequency ->  "A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus."

The CountVectorizer is a simple method to vectorize the text. It counts the number of times a word appears in a document, a "Bag of Words" model.


In [3]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer configured to drop English stop words.
cv = CountVectorizer(stop_words="english")
# Learn the vocabulary from the example sentences and build their count matrix.
X = cv.fit_transform(docs_example)

# The vocabulary terms learned by the vectorizer.
names = cv.get_feature_names_out()
print(names)

['15th' '2024' 'cloud' 'course' 'date' 'github' 'google' 'homeworks' 'jan'
 'listed' 'participation' 'prerequisites' 'python' 'registration'
 'required' 'setup' 'start' 'starts' 'submit']


In [5]:
# Label the count matrix with the terms, then transpose so that rows are terms
# and columns are documents.
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


Now let's run it on the real documents to see the bigger picture. `stop_words` removes common words like "the", "is", "and", etc., and `min_df` removes words that appear in fewer than X documents.
Here we need to select the text column and then convert it to a plain list, rather than passing a dict or a DataFrame.

In [6]:
# Same bag-of-words setup, now with min_df=5 applied to the document texts.
cv = CountVectorizer(stop_words="english", min_df=5)
X = cv.fit_transform(list(df.text))

names = cv.get_feature_names_out()
print("\n\n Word from the documents: ", names)

# Terms as rows, documents as columns; show only the last few terms.
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.tail()



 Word from the documents:  ['01' '02' '03' ... 'youtube' 'zip' 'zoomcamp']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoomcamp,0,1,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
X.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1]], shape=(948, 1333))

In this matrix each row is a document and each column is a word from the vocabulary; each value is the number of times that word appears in that document.
Now, let's try to vectorize a query and see the result.

In [8]:
query = "Do I need to know python to sign up for the January course?"

# Encode the query with the vectorizer that was already fitted on the documents.
q = cv.transform([query])
q.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], shape=(1, 1333))

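`query_dict` maps each word to its count in the query, and `doc_dict` does the same for a single document (the one at index 1). They are just small helpers for inspecting the vectors.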

In [9]:
# Map each vocabulary term to its count in the query.
query_dict = dict(zip(names, q.toarray()[0], strict=False))
query_dict

# Map each vocabulary term to its count in the document stored at row 1 of X.
doc_dict = dict(zip(names, X.toarray()[1], strict=False))
doc_dict

{'01': np.int64(0),
 '02': np.int64(0),
 '03': np.int64(0),
 '04': np.int64(0),
 '05': np.int64(0),
 '06': np.int64(0),
 '09': np.int64(0),
 '10': np.int64(0),
 '100': np.int64(0),
 '11': np.int64(0),
 '12': np.int64(0),
 '127': np.int64(0),
 '13': np.int64(0),
 '14': np.int64(0),
 '15': np.int64(0),
 '16': np.int64(0),
 '17': np.int64(0),
 '19': np.int64(0),
 '1st': np.int64(0),
 '20': np.int64(0),
 '2019': np.int64(0),
 '2020': np.int64(0),
 '2021': np.int64(0),
 '2022': np.int64(0),
 '2023': np.int64(0),
 '2024': np.int64(0),
 '21': np.int64(0),
 '22': np.int64(0),
 '24': np.int64(0),
 '25': np.int64(0),
 '2pacx': np.int64(0),
 '30': np.int64(0),
 '35': np.int64(0),
 '403': np.int64(0),
 '42': np.int64(0),
 '50': np.int64(0),
 '5000': np.int64(0),
 '5431': np.int64(0),
 '5432': np.int64(0),
 '60': np.int64(0),
 '600': np.int64(0),
 '7077': np.int64(0),
 '80': np.int64(0),
 '8080': np.int64(0),
 '8888': np.int64(0),
 '9696': np.int64(0),
 'abhijit': np.int64(0),
 'able': np.int64(0),

We look for words that are present in both the query and the document, multiply their counts, and sum the products (a dot product). The result is the matching score.

**** TODO: review linear algebra.

In [10]:
# Put the query counts and the counts of one document side by side, one row per term.
df_qd = pd.DataFrame([query_dict, doc_dict], index=["query", "doc"]).T

# Dot product of the query with that single document.
(df_qd["query"] * df_qd["doc"]).sum()

# Dot product of the query with every row of X in one matrix product.
X.dot(q.T).toarray()

array([[ 5],
       [ 0],
       [ 0],
       [ 1],
       [ 1],
       [ 0],
       [ 0],
       [ 2],
       [ 0],
       [ 0],
       [ 0],
       [ 3],
       [ 0],
       [ 0],
       [ 0],
       [ 1],
       [ 0],
       [ 0],
       [ 1],
       [ 1],
       [ 0],
       [ 1],
       [ 2],
       [ 1],
       [ 0],
       [ 1],
       [ 0],
       [ 3],
       [ 3],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 2],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 1],
       [ 1],
       [ 2],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 1],
       [ 6],
       [ 2],
       [ 1],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 1],
       [ 0],
       [ 0],
       [ 0],
       [ 1],
       [ 0],
       [ 0],
       [ 1],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 1],

Even better is to use cosine similarity.

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(X, q)

array([[0.28629917],
       [0.        ],
       [0.        ],
       [0.11952286],
       [0.09759001],
       [0.        ],
       [0.        ],
       [0.21081851],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.27386128],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.11952286],
       [0.        ],
       [0.        ],
       [0.08032193],
       [0.1490712 ],
       [0.        ],
       [0.09325048],
       [0.2236068 ],
       [0.1118034 ],
       [0.        ],
       [0.05679618],
       [0.        ],
       [0.27386128],
       [0.20701967],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.09534626],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.01957401],
       [0.06201737],
       [0.07352146],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.047

Now let's try the TfidfVectorizer.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

fields = ["section", "question", "text"]
transformers = {}  # field name -> TfidfVectorizer fitted on that field
matrices = {}  # field name -> TF-IDF matrix returned by fit_transform for that field

# Fit a separate TF-IDF vectorizer on each text field.
for field in fields:
    cv = TfidfVectorizer(stop_words="english", min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

transformers["text"].get_feature_names_out()
matrices["text"]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26463 stored elements and shape (948, 2118)>

In [13]:
matrices["text"].todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.34048649],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.12881469]], shape=(948, 2118))

Trying to search for a query in the documents.


In [14]:
query = "I just signed up. Is it too late to join the course?"

# Encode the query with the "text" vectorizer and compare it with every document text.
q = transformers["text"].transform([query])
# flatten() turns the similarity column into a 1-D array of scores.
score = cosine_similarity(matrices["text"], q).flatten()

In [18]:
import numpy as np

# Set pandas display option to show full text
pd.set_option("display.max_colwidth", None)

# Negating the scores makes argsort put the highest-scoring documents first;
# keep the first 10 positions.
idx = np.argsort(-score)[:10]
for i in idx:
    print(df.iloc[i].text)
    print("\n\n")

GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites



Star the repo! Share it with friends if you find it useful ❣️
Create a PR if you see you can improve the text or the structure of the repository.



All the main videos are stored in the Main “DATA ENGINEERING” playlist (no year specified). The Github repository has also been updated to show each video with a thumbnail, that would bring you directly to the same playlist below.
Below is the MAIN PLAYLIST’. And then you refer to the year specific playlist for additional videos for that year like for office hours videos etc. Also find this playlist pinned to the slack channel.
h
ttps://youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb&si=NspQhtZhZQs1B9F-



There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:
Data-Engineering (Jan - Apr)
MLOps (May - Aug)
Machine Learning (Sep - Jan)
There's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as 

Boosting one of the fields ('question') and filtering for only data-engineering course.


In [16]:
# Per-field weight applied to the similarity score; fields not listed use 1.0.
boost = {"question": 3.0}

score = np.zeros(len(df))

# Sum the boosted cosine similarity of the query against each text field.
for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

filters = {"course": "data-engineering-zoomcamp"}

# Combine all filters into one boolean mask. Multiplying the score by the mask
# would only zero out the rejected rows, leaving them tied with unmatched rows
# that pass the filter, so they could still end up in the top 10.
mask = np.ones(len(df), dtype=bool)
for field, value in filters.items():
    mask &= np.asarray((df[field] == value).values, dtype=bool)

# Rank only the rows that pass every filter (highest score first).
candidates = np.flatnonzero(mask)
idx = candidates[np.argsort(-score[candidates], kind="stable")[:10]]
results = df.iloc[idx]
results.to_dict(orient="records")

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Which playlist on YouTube should I refer to?',
  'text': 'All the main videos are stored in the Main “DATA ENGINEERING” playlist (no year specified). The Github repository has also been updated to show each video with a thumbnail, that would bring you directly to the same playlist below.\nBelow is the MAIN PLAYLIST’. And then you refer to the

Final class with everything together.

In [19]:
class TextSearch:
    """TF-IDF search over one or more text fields of a collection of records.

    A separate ``TfidfVectorizer`` is fitted per text field. A query is scored
    against every field with cosine similarity, and the per-field scores are
    summed, each multiplied by an optional boost.
    """

    def __init__(self, text_fields):
        """Store the names of the record fields that will be searched."""
        self.text_fields = text_fields
        self.matrices = {}  # field name -> TF-IDF matrix of the fitted records
        self.vectorizers = {}  # field name -> fitted TfidfVectorizer

    def fit(self, records, vectorizer_params=None):
        """Index *records*.

        Args:
            records: Data accepted by ``pd.DataFrame`` (e.g. a list of dicts)
                that contains every field named in ``text_fields``.
            vectorizer_params: Optional keyword arguments passed to each
                ``TfidfVectorizer``. ``None`` means no extra arguments.
        """
        # None instead of a {} default: a mutable default is shared across calls.
        vectorizer_params = {} if vectorizer_params is None else vectorizer_params
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return up to *n_results* records matching *query*, best first.

        Args:
            query: The query string.
            n_results: Maximum number of records to return.
            boost: Optional mapping of field name to score multiplier; fields
                not listed use 1.0.
            filters: Optional mapping of column name to required value. Only
                records equal to the value in every listed column are returned.

        Returns:
            A list of record dicts. It may be shorter than *n_results* when
            fewer records pass the filters.
        """
        boost = {} if boost is None else boost
        filters = {} if filters is None else filters

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Build one boolean mask from all filters. Multiplying the score by the
        # mask would only zero out rejected rows, leaving them tied with
        # unmatched rows that pass the filter, so they could leak into results.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= np.asarray((self.df[field] == value).values, dtype=bool)

        # Rank only the rows that pass every filter; a stable sort keeps ties
        # in their original order.
        candidates = np.flatnonzero(keep)
        order = np.argsort(-score[candidates], kind="stable")[:n_results]
        results = self.df.iloc[candidates[order]]
        return results.to_dict(orient="records")

In [20]:
# Index the three text fields; fit() is called without vectorizer_params.
index = TextSearch(text_fields=["section", "question", "text"])
index.fit(documents)

# NOTE(review): "singned" looks like a typo for "signed" (the earlier query
# cell spells it "signed"); confirm whether the misspelling is intentional.
index.search(
    query="I just singned up. Is it too late to join the course?",
    n_results=5,
    boost={"question": 3.0},
    filters={"course": "data-engineering-zoomcamp"},
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

Embeddings for vector search.

- Letters to numbers > dense vectors 
- Capture similarity between words
- Deals with dimensionality
- Quite used in ML


First with SVD (Singular Value Decomposition)

It is a lossy compression algorithm: we can't reconstruct the original data exactly.

In [21]:
from sklearn.decomposition import TruncatedSVD

# Embedding the text with SVD

# Reuse the TF-IDF matrix and vectorizer that were fitted on the "text" field.
X = matrices["text"]
cv = transformers["text"]

# Reduce every document vector to 16 components.
svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.08799711, -0.07530437, -0.10187671,  0.05184514,  0.05361936,
       -0.05785097,  0.01803765,  0.02266061, -0.2123822 , -0.31963146,
        0.1074095 ,  0.09693883, -0.0601929 , -0.05679519,  0.03724432,
       -0.08691432])

In [22]:
# embedding the query
query = "I just singned up. Is it too late to join the course?"

# Vectorize with the same TF-IDF vectorizer, then project with the already-fitted SVD.
Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.04353469, -0.03077248, -0.04460068,  0.01204782,  0.02661174,
       -0.0493612 ,  0.0110709 ,  0.01507618, -0.12524802, -0.1726674 ,
        0.07156269,  0.07337353, -0.04437011, -0.03255806,  0.02745311,
       -0.04865993])

In [23]:
np.dot(X_emb[0], Q_emb[0])

np.float64(0.12250171074309504)

In [24]:
# Getting the score and results
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# idx holds row positions produced by argsort, so select rows positionally
# (iloc) rather than by index label (loc).
list(df.iloc[idx].text)

['If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\n

NMF (Non-negative Matrix Factorization)

SVD creates negative values; NMF is a non-negative matrix factorization algorithm that produces only non-negative values, which are easier to interpret.

In [25]:
from sklearn.decomposition import NMF

# Embedding the text with NMF
nmf = NMF(n_components=16)  # n_components is the number of dimensions of the embedding
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.30539234,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [26]:
# Vectorize the query, then embed it with the already-fitted NMF model.
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.00114928, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17066128,
       0.        , 0.        , 0.        , 0.        , 0.00068032,
       0.        ])

In [27]:
# Rank the documents by cosine similarity between their NMF embedding and the query's.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# idx holds row positions produced by argsort, so select rows positionally
# (iloc) rather than by index label (loc).
list(df.iloc[idx].text)

["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'Please choose th

Now with heavy workers!

All of the transformers (vectorizers) I have seen so far were based on bag of words, so they ignore word order, which is a problem. Now let's try transformer models, which take word order into account.

In [40]:
import torch
from transformers import BertModel, BertTokenizer

# Load the pretrained "bert-base-uncased" tokenizer and model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [41]:
# Two example sentences to embed.
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes",
]
# Tokenize both sentences together, with padding and truncation enabled,
# returning PyTorch tensors.
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

In [42]:
# Compute the embeddings and then compress them
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

# Average over dim 1 so that each input text is reduced to a single vector.
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [43]:
X_emb = sentence_embeddings.numpy()

In [44]:
sentence_embeddings_cpu = sentence_embeddings.cpu()

In [45]:
def make_batches(seq, n):
    """Split *seq* into consecutive slices of at most *n* items each.

    The final slice is shorter when ``len(seq)`` is not a multiple of *n*.
    """
    starts = range(0, len(seq), n)
    return [seq[start : start + n] for start in starts]

In [47]:
from tqdm.auto import tqdm

# Embed every document text, 8 texts per forward pass.
texts = df["text"].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state

        # NOTE(review): the mean runs over every position along dim 1 and the
        # attention mask is not used, so with padding=True any padded
        # positions are presumably averaged in as well -- confirm this is intended.
        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

# Stack the per-batch arrays into one array.
final_embeddings = np.vstack(all_embeddings)

  0%|          | 0/119 [00:00<?, ?it/s]

In [48]:
def compute_embeddings(texts, batch_size=8):
    """Embed *texts* with the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts passed through the model per forward pass.

    Returns:
        A NumPy array built by stacking the per-batch embeddings, where each
        embedding is the mean of the model's last hidden state over dim 1.
    """
    # Honour the caller's batch_size; it used to be ignored in favour of a
    # hard-coded 8.
    text_batches = make_batches(texts, batch_size)

    all_embeddings = []

    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state

            # NOTE(review): the attention mask is not used here, so with
            # padding=True any padded positions are presumably averaged in
            # as well -- confirm this is intended.
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)

    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings


X_text = compute_embeddings(df["text"].tolist())

  0%|          | 0/119 [00:00<?, ?it/s]

In [49]:
X_text

array([[-0.00456304, -0.11667512,  0.6274718 , ..., -0.03659191,
         0.10031676,  0.02927125],
       [-0.1423361 , -0.19853921,  0.28455415, ..., -0.01139052,
        -0.1539977 ,  0.0953508 ],
       [ 0.19672242, -0.08461309,  0.2820051 , ...,  0.1139587 ,
        -0.06448027, -0.01282615],
       ...,
       [-0.28217432, -0.33324352,  0.29784998, ..., -0.35042733,
         0.03266049,  0.09537259],
       [-0.428071  , -0.39468753,  0.30941996, ..., -0.05943284,
        -0.12965173,  0.0788705 ],
       [-0.16892129, -0.25146273,  0.47843292, ..., -0.18535416,
        -0.16108926,  0.27272925]], shape=(948, 768), dtype=float32)