In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
import pandas as pd

df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [4]:
df = df[df['course'] == 'data-engineering-zoomcamp']
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
430,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
431,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
432,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
433,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


In [5]:
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(docs_example)

In [7]:
vocab = vectorizer.get_feature_names_out()
vocab

array(['catalog', 'cloud', 'course', 'details', 'end', 'google',
       'homework', 'january', 'listed', 'month', 'prerequisites',
       'python', 'register', 'setup', 'submit'], dtype=object)

In [8]:
X = vectorizer.transform(docs_example)
X.toarray()

array([[0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0]])

In [9]:
df_docs = pd.DataFrame(X.toarray(), columns=vocab)
df_docs

Unnamed: 0,catalog,cloud,course,details,end,google,homework,january,listed,month,prerequisites,python,register,setup,submit
0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0
1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0
2,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1
3,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0
4,0,1,1,0,0,1,0,1,0,0,0,1,0,1,0


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(docs_example)

In [11]:
vocab = vectorizer.get_feature_names_out()
vocab

array(['catalog', 'cloud', 'course', 'details', 'end', 'google',
       'homework', 'january', 'listed', 'month', 'prerequisites',
       'python', 'register', 'setup', 'submit'], dtype=object)

In [12]:
X = vectorizer.transform(docs_example)
X.toarray()

array([[0.        , 0.        , 0.32842678, 0.68924048, 0.        ,
        0.        , 0.        , 0.32842678, 0.        , 0.        ,
        0.        , 0.        , 0.55607488, 0.        , 0.        ],
       [0.56750154, 0.        , 0.27041752, 0.        , 0.        ,
        0.        , 0.        , 0.27041752, 0.56750154, 0.        ,
        0.45785667, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.22578084, 0.        , 0.47382645,
        0.        , 0.47382645, 0.22578084, 0.        , 0.47382645,
        0.        , 0.        , 0.        , 0.        , 0.47382645],
       [0.        , 0.        , 0.35959372, 0.        , 0.        ,
        0.        , 0.        , 0.35959372, 0.        , 0.        ,
        0.6088451 , 0.        , 0.6088451 , 0.        , 0.        ],
       [0.        , 0.47382645, 0.22578084, 0.        , 0.        ,
        0.47382645, 0.        , 0.22578084, 0.        , 0.        ,
        0.        , 0.47382645, 0.        , 

In [13]:
df_docs = pd.DataFrame(X.toarray(), columns=vocab)
df_docs

Unnamed: 0,catalog,cloud,course,details,end,google,homework,january,listed,month,prerequisites,python,register,setup,submit
0,0.0,0.0,0.328427,0.68924,0.0,0.0,0.0,0.328427,0.0,0.0,0.0,0.0,0.556075,0.0,0.0
1,0.567502,0.0,0.270418,0.0,0.0,0.0,0.0,0.270418,0.567502,0.0,0.457857,0.0,0.0,0.0,0.0
2,0.0,0.0,0.225781,0.0,0.473826,0.0,0.473826,0.225781,0.0,0.473826,0.0,0.0,0.0,0.0,0.473826
3,0.0,0.0,0.359594,0.0,0.0,0.0,0.0,0.359594,0.0,0.0,0.608845,0.0,0.608845,0.0,0.0
4,0.0,0.473826,0.225781,0.0,0.0,0.473826,0.0,0.225781,0.0,0.0,0.0,0.473826,0.0,0.473826,0.0


In [14]:
query = "Do I need to know python to sign up for the January course?"

q = vectorizer.transform([query])
q.toarray()

array([[0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.829279  , 0.        , 0.        , 0.        ]])

In [15]:
X.dot(q.T).toarray()

array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X, q)

array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

In [17]:
fields = ['section', 'question', 'text']
vectorizers = {}
matrices = {}

for f in fields:
    vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
    X = vectorizer.fit_transform(df[f])
    vectorizers[f] = vectorizer
    matrices[f] = X

In [18]:
import numpy as np

query = "I just signed up. Is it too late to join the course?"

score = np.zeros(len(df))

weights = {
    'question': 3
}

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]
    f_score = cosine_similarity(X, q).flatten()
    w = weights.get(f, 1.0)
    score = score + w * f_score

In [19]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).values
    score = score * mask

score

array([3.57177849, 3.49747389, 2.71964586, 2.0013156 , 3.49747389,
       3.49747389, 2.43628975, 3.16265041, 2.72258503, 3.49747389,
       2.66728988, 2.07227974, 0.49747389, 0.49747389, 0.49747389,
       0.61047833, 0.49747389, 2.64463111, 0.57208966, 0.49747389,
       0.49747389, 0.49747389, 0.82696731, 0.61177952, 0.49747389,
       0.49747389, 0.49747389, 0.78365402, 0.62563217, 0.49747389,
       0.49747389, 0.49747389, 0.49747389, 1.72801537, 3.49747389,
       1.99612322, 0.49747389, 0.49747389, 0.49747389, 0.53195187,
       0.54773693, 1.9945749 , 0.49747389, 0.54333058, 0.        ,
       0.        , 0.        , 0.        , 0.03011098, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.07223539, 0.        , 0.01178192,
       0.        , 0.        , 0.        , 0.        , 0.0631524 ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [20]:
idx = np.argsort(score)[-5:]

In [21]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


In [22]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
vectorizer = vectorizers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

In [23]:
print(X[0].shape)
print(X_emb[0].shape)

(1, 1177)
(16,)


In [24]:
query = 'I just signed up. Is it too late to join the course?'

q = vectorizer.transform([query])
q_emb = svd.transform(q)
q_emb[0]


array([ 0.04598527, -0.01550891, -0.02645552,  0.0093061 , -0.01224477,
       -0.00482555,  0.04146645,  0.02666205, -0.05235891,  0.15026326,
       -0.08092589,  0.08231103,  0.08393497,  0.06597712, -0.0119913 ,
        0.04952974])

In [25]:
score = cosine_similarity(X_emb, q_emb).flatten()
idx = np.argsort(-score)[:5]
df.iloc[idx]['text']

0     The purpose of this document is to capture fre...
7     Yes, we will keep all the materials after the ...
11    No, you can only get a certificate if you fini...
28    Yes, you can. Just remember to adapt all the i...
32    Yes, you can use any tool you want for your pr...
Name: text, dtype: object

In [26]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.        , 0.        , 0.00226616, 0.        , 0.        ,
       0.        , 0.00124049, 0.        , 0.        , 0.2814084 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [27]:
q_emb = nmf.transform(q)
q_emb[0]

array([8.92049506e-05, 3.90202215e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.89971621e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [28]:
score = cosine_similarity(X_emb, q_emb).flatten()
idx = np.argsort(-score)[:5]
df.iloc[idx]['text']

11     No, you can only get a certificate if you fini...
395    Each submitted project will be evaluated by 3 ...
7      Yes, we will keep all the materials after the ...
2      Yes, even if you don't register, you're still ...
0      The purpose of this document is to capture fre...
Name: text, dtype: object

In [29]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [30]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [32]:
with torch.no_grad():
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [None]:
hidden_states.shape # shape of (documents, tokens, embeddings)

torch.Size([2, 15, 768])

In [None]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape # shape of (documents, embeddings)

torch.Size([2, 768])