In [1]:
import niquests
import numpy as np
import orjson
import polars as pl
import polars.selectors as cs
import torch
from IPython.display import display
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer

In [2]:
docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"
docs_response = niquests.get(docs_url)
assert docs_response.content is not None
documents_raw = orjson.loads(docs_response.content)

documents = []

for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)


In [3]:
df = pl.DataFrame(documents, schema=["course", "section", "question", "text"])
df.head()


course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What are the prerequi…","""GitHub - DataTalksClub data-en…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I still join the …","""Yes, even if you don't registe…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - I have registered for…","""You don't need it. You're acce…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What can I do before …","""You can start by installing an…"


In [4]:
df.tail()

course,section,question,text
str,str,str,str
"""mlops-zoomcamp""","""Module 6: Best practices""","""Github actions: Permission den…","""Problem description This is th…"
"""mlops-zoomcamp""","""Module 6: Best practices""","""Managing Multiple Docker Conta…","""Problem description When a doc…"
"""mlops-zoomcamp""","""Module 6: Best practices""","""AWS regions need to match dock…","""Problem description If you are…"
"""mlops-zoomcamp""","""Module 6: Best practices""","""Isort Pre-commit""","""Problem description Pre-commit…"
"""mlops-zoomcamp""","""Module 6: Best practices""","""How to destroy infrastructure …","""Problem description Infrastruc…"


In [5]:
cv = CountVectorizer(min_df=5)
cv.fit(df.get_column("text"))

In [6]:
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'],
      shape=(1524,), dtype=object)

In [7]:
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
]

cv = CountVectorizer(stop_words="english")
display(cv.fit(doc_examples))
display(cv.get_feature_names_out())

X = cv.transform(doc_examples)
X.todense()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [8]:
pl.from_numpy(X.toarray(), schema=list(cv.get_feature_names_out())).transpose(
    include_header=True
)

column,column_0,column_1,column_2,column_3,column_4
str,i64,i64,i64,i64,i64
"""15th""",1,0,0,0,0
"""2024""",1,0,0,0,0
"""cloud""",0,0,0,0,1
"""course""",1,0,0,0,1
"""date""",0,0,1,0,0
…,…,…,…,…,…
"""required""",0,0,0,1,0
"""setup""",0,0,0,0,1
"""start""",0,0,1,0,0
"""starts""",1,0,0,0,0


In [9]:
cv = TfidfVectorizer(stop_words="english", min_df=5)
X = cv.fit_transform(df.get_column("text"))

names = cv.get_feature_names_out()

df_docs = pl.from_numpy(X.toarray(), schema=list(names)).transpose(include_header=True)
df_docs.with_columns(cs.numeric().round(2))

column,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,…,column_911,column_912,column_913,column_914,column_915,column_916,column_917,column_918,column_919,column_920,column_921,column_922,column_923,column_924,column_925,column_926,column_927,column_928,column_929,column_930,column_931,column_932,column_933,column_934,column_935,column_936,column_937,column_938,column_939,column_940,column_941,column_942,column_943,column_944,column_945,column_946,column_947
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""01""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""02""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""03""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""04""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""05""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""yes""",0.0,0.0,0.28,0.0,0.0,0.0,0.21,0.2,0.15,0.0,0.0,0.0,0.0,0.26,0.0,0.0,0.25,0.0,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.47,0.12,0.0,0.37,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""yml""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.0
"""youtube""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""zip""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23808 stored elements and shape (948, 1333)>

In [11]:
query = "I just discovered the course, is it too late to join?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 1333))

In [12]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

In [13]:
doc_dict = dict(zip(names, X.toarray()[2]))
doc_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

In [14]:
display(X)
display(q.T)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23808 stored elements and shape (948, 1333)>

<Compressed Sparse Column sparse matrix of dtype 'float64'
	with 3 stored elements and shape (1333, 1)>

In [15]:
X.dot(q.T)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 144 stored elements and shape (948, 1)>

In [16]:
print(df[0, "text"])

The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.


In [17]:
X.dot(q.T).todense()

matrix([[0.48049682],
        [0.        ],
        [0.        ],
        [0.2083882 ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.17557272],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15870689],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.09680922],
        [0.        ],
        [0.        ],
        [0.07529201],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.29986763],
        [0.10520675],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.27447476],
        [0.12828407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.05163407],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.03156309],
        [0.04914818],
        [0.07138962],
        [0.        ],
        [0.04329773],
        [0.        ],
        [0

In [18]:
score = cosine_similarity(X, q).flatten()
score

array([0.48049682, 0.        , 0.        , 0.2083882 , 0.        ,
       0.        , 0.        , 0.17557272, 0.        , 0.        ,
       0.        , 0.15870689, 0.        , 0.        , 0.        ,
       0.09680922, 0.        , 0.        , 0.07529201, 0.        ,
       0.        , 0.        , 0.29986763, 0.10520675, 0.        ,
       0.        , 0.        , 0.27447476, 0.12828407, 0.        ,
       0.        , 0.        , 0.        , 0.05163407, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.03156309,
       0.04914818, 0.07138962, 0.        , 0.04329773, 0.        ,
       0.        , 0.        , 0.        , 0.02804374, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06739038, 0.        , 0.00980845,
       0.        , 0.        , 0.        , 0.        , 0.05820102,
       0.        , 0.        , 0.        , 0.        , 0.     

In [19]:
np.argsort(score)[-5:]

array([ 22, 448, 449, 440,   0])

In [20]:
print(df[449, "text"])

Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.
In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.


In [21]:
df[np.argsort(score)[-5:]]

course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Environment - Do we really hav…","""It's up to you which platform …"
"""machine-learning-zoomcamp""","""General course-related questio…","""I’m new to Slack and can’t fin…","""Here’s how you join a in Slack…"
"""machine-learning-zoomcamp""","""General course-related questio…","""The course has already started…","""Yes, you can. You won’t be abl…"
"""machine-learning-zoomcamp""","""General course-related questio…","""I filled the form, but haven't…","""The process is automated now, …"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"


In [22]:
df.with_row_index().filter(pl.col("index").is_in(np.argsort(score)[-5:]))

index,course,section,question,text
u32,str,str,str,str
0,"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
22,"""data-engineering-zoomcamp""","""General course-related questio…","""Environment - Do we really hav…","""It's up to you which platform …"
440,"""machine-learning-zoomcamp""","""General course-related questio…","""I filled the form, but haven't…","""The process is automated now, …"
448,"""machine-learning-zoomcamp""","""General course-related questio…","""I’m new to Slack and can’t fin…","""Here’s how you join a in Slack…"
449,"""machine-learning-zoomcamp""","""General course-related questio…","""The course has already started…","""Yes, you can. You won’t be abl…"


In [23]:
fields = ["section", "question", "text"]

In [24]:
matrices = {}
vectorizers = {}

for f in fields:
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    X = tfidf_vect.fit_transform(df.get_column(f))
    matrices[f] = X
    vectorizers[f] = tfidf_vect

In [25]:
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 23808 stored elements and shape (948, 1333)>}

In [26]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [27]:
n = len(df)
n

948

In [28]:
score = np.zeros(n)

query = "I just discovered the course, is it too late to join?"

boosts = {
    "question": 3,
    # "text": 0.5,
}

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X, q).flatten()

    boost = boosts.get(f, 1.0)

    score = score + boost * f_score


In [29]:
filters = {
    "course": "data-engineering-zoomcamp",
}

In [30]:
for field, value in filters.items():
    mask = df.select(pl.col(field).eq(value).cast(pl.Int32)).to_series().to_numpy()
    score = score * mask


In [31]:
idx = np.argsort(-score)[:5]
idx

array([ 7,  0, 34,  1,  9])

In [32]:
df[idx]

course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I follow the cour…","""Yes, we will keep all the mate…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""data-engineering-zoomcamp""","""General course-related questio…","""How can we contribute to the c…","""Star the repo! Share it with f…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What are the prerequi…","""GitHub - DataTalksClub data-en…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Which playlist on You…","""All the main videos are stored…"


In [33]:
class TextSearch:
    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizers_params={}):
        self.df = pl.DataFrame(records)

        for f in self.text_fields:
            tfidf_vect = TfidfVectorizer(**vectorizers_params)
            X = tfidf_vect.fit_transform(self.df.get_column(f))
            self.matrices[f] = X
            self.vectorizers[f] = tfidf_vect

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (
                self.df.select(pl.col(field).eq(value).cast(pl.Int32))
                .to_series()
                .to_numpy()
            )
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df[idx]
        return results.to_dicts()

In [34]:
index = TextSearch(
    text_fields=["section", "question", "text"],
)
index.fit(documents)

index.search(
    query="I just signed up. Is it too late to join the course?",
    n_results=5,
    boost={"question": 3.0},
    filters={"course": "data-engineering-zoomcamp"},
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

In [35]:
index.matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3651 stored elements and shape (948, 86)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 8938 stored elements and shape (948, 2051)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 47683 stored elements and shape (948, 6711)>}

In [36]:
X = matrices["text"]
tfidf_vect = vectorizers["text"]

In [37]:
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

In [38]:
X_emb.shape

(948, 16)

In [39]:
X_emb[0]

array([ 0.09653121, -0.08200877, -0.10308123, -0.07856904,  0.06689444,
       -0.06415021,  0.01316869, -0.12176456,  0.23663404,  0.29786166,
        0.01626874, -0.05885793, -0.12924817,  0.06957957,  0.01152334,
        0.02763434])

In [40]:
query = "I just signed up. Is it too late to join the course?"

Q = tfidf_vect.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.05790347, -0.0384329 , -0.0571645 , -0.02726475,  0.03868799,
       -0.06432162,  0.00576811, -0.07898891,  0.16532799,  0.18948992,
        0.0253943 , -0.06282296, -0.0936953 ,  0.04560308,  0.03032612,
        0.02670506])

In [41]:
X_emb[0].dot(Q_emb[0])

np.float64(0.14922954432399466)

In [42]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
df[idx]

course,section,question,text
str,str,str,str
"""machine-learning-zoomcamp""","""Projects (Midterm and Capstone…","""What If I submitted only two p…","""If you have submitted two proj…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""machine-learning-zoomcamp""","""General course-related questio…","""Is it going to be live? When?""","""The course videos are pre-reco…"
"""machine-learning-zoomcamp""","""General course-related questio…","""The course has already started…","""Yes, you can. You won’t be abl…"
"""machine-learning-zoomcamp""","""General course-related questio…","""Can I submit the homework afte…","""No, it’s not possible. The for…"
"""mlops-zoomcamp""","""+-General course questions""","""What if my answer is not exact…","""Please choose the closest one …"
"""data-engineering-zoomcamp""","""General course-related questio…","""Homework - Are late submission…","""No, late submissions are not a…"
"""machine-learning-zoomcamp""","""General course-related questio…","""When does the next iteration s…","""The course is available in the…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Certificate - Can I follow the…","""No, you can only get a certifi…"
"""machine-learning-zoomcamp""","""General course-related questio…","""I filled the form, but haven't…","""The process is automated now, …"


In [43]:
nmf = NMF(n_components=16, random_state=42)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.13180157, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00032407, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [44]:
Q = tfidf_vect.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.08771685, 0.00206466, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00241716,
       0.        ])

In [45]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
df[idx]

course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I still join the …","""Yes, even if you don't registe…"
"""machine-learning-zoomcamp""","""Projects (Midterm and Capstone…","""What If I submitted only two p…","""If you have submitted two proj…"
"""machine-learning-zoomcamp""","""General course-related questio…","""Is it going to be live? When?""","""The course videos are pre-reco…"
"""machine-learning-zoomcamp""","""General course-related questio…","""The course has already started…","""Yes, you can. You won’t be abl…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Certificate - Can I follow the…","""No, you can only get a certifi…"
"""mlops-zoomcamp""","""+-General course questions""","""What if my answer is not exact…","""Please choose the closest one …"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""machine-learning-zoomcamp""","""General course-related questio…","""Can I submit the homework afte…","""No, it’s not possible. The for…"
"""machine-learning-zoomcamp""","""General course-related questio…","""What if I miss a session?""","""Everything is recorded, so you…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I follow the cour…","""Yes, we will keep all the mate…"


In [46]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [47]:
texts = [
    "Yes, we will keep all the materials after the course finishes."
    "You can follow the course at your own pace after it finishes."
]

encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,  2017,  2064,  3582,  1996,  2607,  2012,
          2115,  2219,  6393,  2044,  2009, 12321,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}

In [48]:
with torch.no_grad():
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [49]:
hidden_states.shape

torch.Size([1, 28, 768])

In [50]:
hidden_states[0]

tensor([[ 0.1527, -0.2317,  0.2326,  ..., -0.2538,  0.0960,  0.8343],
        [ 1.0805, -0.0902,  0.0966,  ..., -0.0667,  1.1029,  0.3612],
        [ 0.1055,  0.0228,  0.4672,  ..., -0.1463,  0.6463,  0.2812],
        ...,
        [ 0.7566,  0.0495,  0.0458,  ...,  0.1440, -0.3283, -0.0995],
        [-0.1611, -1.0057,  0.1365,  ...,  0.4239,  0.3926, -0.4111],
        [ 0.6630,  0.2336,  0.1120,  ...,  0.3665, -0.5953, -0.2555]])

In [51]:
hidden_states[0].shape

torch.Size([28, 768])

In [52]:
hidden_states.mean(dim=0).shape

torch.Size([28, 768])

In [53]:
hidden_states.mean(dim=1).shape

torch.Size([1, 768])

In [54]:
hidden_states.mean(dim=2).shape

torch.Size([1, 28])

In [55]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([1, 768])

In [56]:
sentence_embeddings

tensor([[ 2.3474e-01, -2.1953e-01,  3.4205e-01,  1.6793e-01,  3.1836e-01,
         -1.5911e-01, -8.2699e-02,  4.5001e-01, -1.9599e-01, -1.5678e-01,
          3.3149e-03, -8.5119e-02, -1.5414e-01,  3.8576e-02, -1.9298e-01,
          1.8156e-01,  1.5481e-02, -1.1937e-01, -3.9585e-02,  2.8981e-01,
         -3.8360e-03, -4.6815e-02,  6.1047e-02,  6.6230e-01,  1.9999e-01,
          9.0104e-02, -1.2856e-01,  1.8387e-01, -1.3683e-01, -2.2859e-01,
          1.2463e-01, -2.6219e-02, -3.8654e-02, -3.4193e-01,  1.6459e-01,
         -1.2974e-01, -4.3338e-02,  2.8364e-01, -2.8863e-01, -1.7718e-01,
         -6.1283e-01,  1.8049e-01, -1.8411e-01, -1.6350e-02,  1.5406e-01,
         -1.7085e-01,  6.5976e-01, -1.2185e-01,  6.0088e-02, -2.9487e-01,
         -4.0080e-01,  4.6322e-01, -2.0155e-02,  2.1050e-01,  2.9732e-01,
          4.0150e-01, -3.0129e-01, -1.9688e-01, -7.7482e-01, -2.0958e-02,
         -5.2081e-02, -2.7692e-02, -9.4784e-02, -3.7899e-02,  2.5932e-01,
          8.9261e-02,  2.1428e-01,  3.