In [41]:
import requests

docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"
# A timeout keeps the cell from hanging indefinitely if the host stops responding.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list of entries, each tagged with its course.
documents = []

for course in documents_raw:
    course_name = course["course"]

    for doc in course["documents"]:
        # This writes into the dict held by documents_raw as well (no copy is made).
        doc["course"] = course_name
        documents.append(doc)

In [42]:
import pandas as pd

# Tabulate the flattened entries, keeping exactly these four fields in this column order.
df = pd.DataFrame(
    data=documents,
    columns=["course", "section", "question", "text"],
)
df.head(5)

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer; no arguments are passed, so scikit-learn's defaults apply.
cv = CountVectorizer()

In [55]:
# Learn the vocabulary from the answer texts and build the document-term matrix in one step.
X = cv.fit_transform(df["text"])

In [56]:
# Inspect the vocabulary learned by the fitted vectorizer; the output shows that
# numeric and non-English tokens are kept.
cv.get_feature_names_out()

array(['00', '00000000e', '0002', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      dtype=object)

In [57]:
# Keep the vocabulary in `names`; it is used to label the term axis of the matrix.
names = cv.get_feature_names_out()
names

array(['00', '00000000e', '0002', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      dtype=object)

In [58]:
# Term-by-document table: one row per vocabulary term, one column per document position.
# The dense matrix is transposed before wrapping, so `names` becomes the row index directly.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00000000e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
斜杠,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
查找和替换,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
要了解键盘快捷键,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
要启用屏幕阅读器支持,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Five short example sentences used as a toy corpus.
# NOTE(review): this rebinds `documents`, which earlier held the flattened FAQ entries.
# `df` was already built from those, but re-running the DataFrame cell after this one
# would build it from these strings instead.
documents = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
]

In [67]:
# Disabled experiment (kept for reference): word counts with English stop words
# removed, fitted on the `documents` list. If re-enabled it rebinds `cv`, `X`,
# `names` and `df_docs`.
# from sklearn.feature_extraction.text import CountVectorizer

# cv = CountVectorizer(stop_words="english")
# X = cv.fit_transform(documents)

# names = cv.get_feature_names_out()

# df_docs = pd.DataFrame(X.toarray(), columns=names).T
# df_docs

In [66]:
# Disabled experiment (kept for reference): TF-IDF weights with English stop words
# removed, fitted on the `documents` list and shown rounded to two decimals.
# If re-enabled it rebinds `cv`, `X`, `names` and `df_docs`.
# from sklearn.feature_extraction.text import TfidfVectorizer

# cv = TfidfVectorizer(stop_words='english')
# X = cv.fit_transform(documents)

# names = cv.get_feature_names_out()

# df_docs = pd.DataFrame(X.toarray(), columns=names).T
# df_docs.round(2)

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the answer texts: English stop words are dropped and min_df=5 is
# passed as the document-frequency threshold.
cv = TfidfVectorizer(min_df=5, stop_words="english")
X = cv.fit_transform(df["text"])
names = cv.get_feature_names_out()

# Rows are vocabulary terms, columns are document positions.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [73]:
# Show the current vocabulary (presumably the one from the TF-IDF fit with min_df=5;
# this depends on which cell last assigned `names`).
names

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [70]:
query = "Do I need to know python to sign up for the January course?"

# `transform` (not `fit_transform`) reuses the already-fitted `cv`, so the query is
# projected onto the existing vocabulary; the list wrapper makes it a one-document batch.
q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [85]:
# Lay the query vector out as a single column (labelled 0) indexed by vocabulary term.
query_df = pd.DataFrame.sparse.from_spmatrix(q, columns=names).transpose()

# Show only the terms that received a positive weight.
nonzero_terms = query_df[0] > 0
query_df[nonzero_terms]

Unnamed: 0,0
course,0.381482
know,0.560827
need,0.297968
python,0.314414
sign,0.593552


In [76]:
# Map every vocabulary term to its weight in the query vector (row 0 of the dense form).
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}

# Drop the zero entries so only terms present in the query remain.
non_zero_query_dict = {
    term: weight for term, weight in query_dict.items() if weight != 0
}
non_zero_query_dict

{'course': np.float64(0.38148200594064524),
 'know': np.float64(0.5608269127690405),
 'need': np.float64(0.29796783250107517),
 'python': np.float64(0.31441356049301333),
 'sign': np.float64(0.5935519664108326)}

In [80]:
# Same term -> weight view, this time for the document stored at row 1 of X.
doc_dict = {term: weight for term, weight in zip(names, X.toarray()[1])}

# Keep only the terms that actually occur in that document.
non_zero_doc_dict = {
    term: weight for term, weight in doc_dict.items() if weight != 0
}
non_zero_doc_dict

{'data': np.float64(0.3127766226016382),
 'datatalksclub': np.float64(0.5316383823591385),
 'engineering': np.float64(0.5401030373894639),
 'github': np.float64(0.3792268345828722),
 'zoomcamp': np.float64(0.4289605246306481)}

In [99]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every row of X and the query vector; `flatten` turns the
# resulting 2-D array into a 1-D array with one score per document.
score = cosine_similarity(X, q).flatten()
score

array([0.19464486, 0.        , 0.        , 0.06011641, 0.04932915,
       0.        , 0.        , 0.13477565, 0.        , 0.        ,
       0.        , 0.15899187, 0.        , 0.        , 0.        ,
       0.07431408, 0.        , 0.        , 0.05779673, 0.07243428,
       0.        , 0.05174293, 0.16373635, 0.08076031, 0.        ,
       0.09755254, 0.        , 0.21069625, 0.12067781, 0.        ,
       0.        , 0.        , 0.        , 0.06381749, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00910541,
       0.02835681, 0.05480112, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02469964, 0.05129386, 0.06013439,
       0.05252658, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04169018, 0.        , 0.        , 0.        , 0.0075293 ,
       0.        , 0.        , 0.01971463, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [106]:
# numpy is imported here because this cell comes before the cell that imports it;
# without this line a fresh-kernel "Run All" stops with NameError on `np`.
import numpy as np

# Document positions ordered from lowest to highest score (argsort sorts ascending).
np.argsort(score)

array([473, 563, 564, 566, 567, 568, 569, 570, 571, 572, 574, 575, 576,
       578, 579, 580, 581, 582, 583, 584, 562, 561, 560, 559, 530, 532,
       533, 534, 535, 536, 537, 538, 542, 585, 544, 548, 549, 550, 551,
       552, 553, 555, 556, 558, 546, 586, 590, 594, 634, 635, 636, 637,
       638, 640, 641, 643, 644, 631, 645, 647, 649, 650, 651, 652, 653,
       654, 655, 657, 646, 528, 630, 627, 595, 597, 600, 601, 602, 604,
       605, 606, 607, 628, 608, 612, 613, 614, 615, 616, 618, 621, 622,
       626, 611, 527, 526, 525, 422, 423, 426, 427, 428, 429, 430, 432,
       437, 421, 441, 443, 444, 447, 453, 460, 461, 462, 463, 466, 442,
       467, 420, 418, 385, 386, 387, 389, 390, 392, 397, 399, 400, 419,
       402, 405, 407, 408, 409, 410, 412, 414, 416, 417, 404, 658, 468,
       472, 499, 501, 504, 505, 506, 507, 508, 509, 510, 498, 512, 514,
       515, 516, 517, 518, 519, 520, 523, 524, 513, 471, 497, 495, 946,
       474, 475, 476, 477, 478, 479, 480, 481, 496, 482, 486, 48

In [105]:
import numpy as np

# argsort is ascending, so the last five positions are the five highest scores
# (best match last).
idx = score.argsort()[-5:]

# Pull the answer text of those five documents.
df["text"].iloc[idx].values

array(['If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
       'You can do most of the course without a cloud. Almost everything we use (excluding BigQuery) can be run locally. We won’t be able to provide guidelines for some things, but most of the materials are runnable without GCP.\nFor everything in the course, there’s a local alternative. You could even do the whole course locally.',
       'Technically, yes. Advisable? Not really. Reasons:\nSome homework(s) asks for specific python library versions.\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be pe

### Vectorizing all the documents

In [108]:
fields = ["section", "question", "text"]
transformers, matrices = {}, {}

for field in fields:
    # Each field gets its own vectorizer (and therefore its own vocabulary) and matrix.
    # `cv` and `X` are rebound on every pass and end up holding the last field's objects.
    transformers[field] = cv = TfidfVectorizer(min_df=3, stop_words="english")
    matrices[field] = X = cv.fit_transform(df[field])

transformers["text"].get_feature_names_out()

array(['001', '01', '02', ..., 'zones', 'zoom', 'zoomcamp'], dtype=object)

In [112]:
query = "I just discovered the course, is it too late to join?"

# One accumulator slot per document, starting at zero.
n = len(df)
score = np.zeros(n)

for f in fields:
    # Vectorize the query with this field's own vectorizer, then add the per-document
    # similarity for that field to the running total (all fields weighted equally).
    q = transformers[f].transform([query])
    field_similarity = cosine_similarity(matrices[f], q)
    score += field_similarity.ravel()

In [113]:
# Positions of the five highest combined scores; argsort is ascending, so the best
# match is the last element.
idx = score.argsort()[-5:]

df["text"].iloc[idx].values

array(["There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:\nData-Engineering (Jan - Apr)\nMLOps (May - Aug)\nMachine Learning (Sep - Jan)\nThere's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.\nThey follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.",
       'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
       'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
       'All the main videos are stored in the Main “DATA ENGINEERING” pla

We will filter `df` by course to get better results.

In [116]:
filters = {"course": "data-engineering-zoomcamp"}

# Start from "keep every row" so that `mask` is defined even when `filters` is empty.
mask = np.ones(len(df), dtype=int)

for field, value in filters.items():
    # Multiply the 0/1 masks together so a row survives only if it matches every filter.
    # Previously `mask` was overwritten on each pass, so only the last filter applied.
    mask = (df[field] == value).astype(int) * mask

In [121]:
# Multiplying by the 0/1 mask zeroes the score of every row that does not match the filter.
masked_score = score * mask
masked_score

0      1.680139
1      1.495124
2      1.232533
3      0.980091
4      1.495124
         ...   
943    0.000000
944    0.000000
945    0.000000
946    0.000000
947    0.000000
Name: course, Length: 948, dtype: float64

In [123]:
# argsort is ascending, so the last five positions carry the five highest masked
# scores (best match last).
masked_idx = np.argsort(masked_score)[-5:]

# Show the full rows for those positions.
df.iloc[masked_idx]

Unnamed: 0,course,section,question,text
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


In [124]:
query = "I just discovered the course, is it too late to join?"
# Per-field weights; any field not listed here is weighted 1.
boosts = {"question": 3}

n = len(df)
score = np.zeros(n)

for f in fields:
    boost = boosts.get(f, 1)
    q = transformers[f].transform([query])
    f_score = cosine_similarity(matrices[f], q).ravel()

    # Add this field's similarity to the total, scaled by its boost.
    score += boost * f_score

In [125]:
# Display the combined per-document scores (the whole array is printed).
score

array([3.38295811, 3.49512426, 2.70735166, 1.68424985, 3.49512426,
       3.49512426, 1.26102286, 3.03149832, 2.67242848, 3.49512426,
       2.45169338, 1.43004364, 0.49512426, 0.49512426, 0.49512426,
       0.73042693, 0.49512426, 1.68417129, 0.54107765, 0.49512426,
       0.49512426, 0.49512426, 0.72180425, 0.57465357, 0.49512426,
       0.49512426, 0.49512426, 0.68461965, 0.57823165, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.77533273, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.65996855, 0.51826746,
       0.5293658 , 1.981508  , 0.49512426, 0.52680125, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     