In [3]:
import pandas as pd

In [4]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
pd.DataFrame(documents)

Unnamed: 0,text,section,question,course
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp
2,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp
3,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp
...,...,...,...,...
943,Problem description\nThis is the step in the c...,Module 6: Best practices,Github actions: Permission denied error when e...,mlops-zoomcamp
944,Problem description\nWhen a docker-compose fil...,Module 6: Best practices,Managing Multiple Docker Containers with docke...,mlops-zoomcamp
945,Problem description\nIf you are having problem...,Module 6: Best practices,AWS regions need to match docker-compose,mlops-zoomcamp
946,Problem description\nPre-commit command was fa...,Module 6: Best practices,Isort Pre-commit,mlops-zoomcamp


In [7]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


### Limiting Search To Course Only

In [11]:
df[df.course == "data-engineering-zoomcamp"]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
430,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
431,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
432,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
433,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


### Vector Space

- Turn document into vector space
- term-document matrix
    - Rows document matrix
    - Column document matrix

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
cv = CountVectorizer()

In [14]:
cv.fit(df.text)

In [17]:
cv.get_feature_names_out()

array(['00', '00000000e', '0002', ..., '要了解键盘快捷键', '要启用屏幕阅读器支持', '请按ctrl'],
      dtype=object)

In [18]:
# document count
cv.get_feature_names_out().shape

(6711,)

### Bag Of Words

- We do not care of word order
- We only care if a word is present or not
- Word order is lost
- We use sparse matrixes for this(Bag of words)

In [19]:
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [32]:
cv = CountVectorizer(stop_words="english")
cv.fit(doc_examples)

In [33]:
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [34]:
x = cv.transform(doc_examples)

In [35]:
x

<5x19 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [36]:
x.todense()

matrix([[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]])

In [37]:
example_cv = pd.DataFrame(x.todense(), columns=cv.get_feature_names_out())
example_cv.head()

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


In [38]:
example_cv.T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


### TFIDF Term Frequency Inverse Document Frequency

- The less frequent a term is the less important the term is.
- TFIDF is a way to give more importance to more frequent words and less importance to less frequent words
- The more important a word is, the higher its frequency.

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(doc_examples)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs.round(2)

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,0.46,0.46,0.0,0.37,0.0,0.0,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0
1,0.0,0.0,0.0,0.0,0.0,0.58,0.0,0.0,0.0,0.58,0.0,0.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,0.0,0.0,0.58,0.58,0.0,0.0,0.0,0.0
4,0.0,0.0,0.46,0.37,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.0,0.46,0.0,0.0,0.0


In [49]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00000000e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00021,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
斜杠,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
查找和替换,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
要了解键盘快捷键,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
要启用屏幕阅读器支持,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
X

<948x6461 sparse matrix of type '<class 'numpy.int64'>'
	with 31723 stored elements in Compressed Sparse Row format>

In [66]:
query = "I just singned up. Is it too late to join the course?"

q = cv.transform([query])
q.toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [67]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'00': 0,
 '00000000e': 0,
 '0002': 0,
 '00021': 0,
 '001': 0,
 '009s': 0,
 '01': 0,
 '02': 0,
 '020': 0,
 '028879': 0,
 '02d': 0,
 '03': 0,
 '0315': 0,
 '04': 0,
 '04d': 0,
 '05': 0,
 '051': 0,
 '054': 0,
 '06': 0,
 '06_spark_sql': 0,
 '07': 0,
 '07cd': 0,
 '08': 0,
 '09': 0,
 '0ms': 0,
 '0x3c947bc5': 0,
 '0x7efe331cf790': 0,
 '0x7f797010a590': 0,
 '0x7fbaf2666280': 0,
 '0x800701bc': 0,
 '0xa0': 0,
 '0xff': 0,
 '0zw04wdetqo': 0,
 '10': 0,
 '100': 0,
 '1000': 0,
 '100000': 0,
 '100k': 0,
 '100m': 0,
 '100mb': 0,
 '101': 0,
 '1010101': 0,
 '1049089': 0,
 '1053': 0,
 '107': 0,
 '1078': 0,
 '10gb': 0,
 '11': 0,
 '111': 0,
 '111111111': 0,
 '1111111111': 0,
 '1128': 0,
 '114': 0,
 '1186': 0,
 '12': 0,
 '120': 0,
 '121': 0,
 '122': 0,
 '1221': 0,
 '123': 0,
 '1234': 0,
 '124': 0,
 '126': 0,
 '127': 0,
 '128': 0,
 '13': 0,
 '130': 0,
 '131743': 0,
 '132': 0,
 '13409': 0,
 '13580': 0,
 '1395': 0,
 '13s': 0,
 '14': 0,
 '140': 0,
 '14513': 0,
 '148': 0,
 '14af': 0,
 '15': 0,
 '150': 0,
 '151': 

In [68]:
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'00': 0,
 '00000000e': 0,
 '0002': 0,
 '00021': 0,
 '001': 0,
 '009s': 0,
 '01': 0,
 '02': 0,
 '020': 0,
 '028879': 0,
 '02d': 0,
 '03': 0,
 '0315': 0,
 '04': 0,
 '04d': 0,
 '05': 0,
 '051': 0,
 '054': 0,
 '06': 0,
 '06_spark_sql': 0,
 '07': 0,
 '07cd': 0,
 '08': 0,
 '09': 0,
 '0ms': 0,
 '0x3c947bc5': 0,
 '0x7efe331cf790': 0,
 '0x7f797010a590': 0,
 '0x7fbaf2666280': 0,
 '0x800701bc': 0,
 '0xa0': 0,
 '0xff': 0,
 '0zw04wdetqo': 0,
 '10': 0,
 '100': 0,
 '1000': 0,
 '100000': 0,
 '100k': 0,
 '100m': 0,
 '100mb': 0,
 '101': 0,
 '1010101': 0,
 '1049089': 0,
 '1053': 0,
 '107': 0,
 '1078': 0,
 '10gb': 0,
 '11': 0,
 '111': 0,
 '111111111': 0,
 '1111111111': 0,
 '1128': 0,
 '114': 0,
 '1186': 0,
 '12': 0,
 '120': 0,
 '121': 0,
 '122': 0,
 '1221': 0,
 '123': 0,
 '1234': 0,
 '124': 0,
 '126': 0,
 '127': 0,
 '128': 0,
 '13': 0,
 '130': 0,
 '131743': 0,
 '132': 0,
 '13409': 0,
 '13580': 0,
 '1395': 0,
 '13s': 0,
 '14': 0,
 '140': 0,
 '14513': 0,
 '148': 0,
 '14af': 0,
 '15': 0,
 '150': 0,
 '151': 

### Dot Product

- We use this to find the most similar document in relation to the query.

In [69]:
X.dot(q.T).todense()

matrix([[7],
        [0],
        [0],
        [2],
        [0],
        [0],
        [0],
        [2],
        [0],
        [0],
        [0],
        [2],
        [0],
        [0],
        [0],
        [2],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [3],
        [1],
        [0],
        [0],
        [0],
        [3],
        [2],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [2],
        [1],
        [2],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [2],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],

### Cosine Similarity

In [70]:
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
score = cosine_similarity(X, q)
score

array([[0.41537358],
       [0.        ],
       [0.        ],
       [0.21821789],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.19611614],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.18257419],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.2236068 ],
       [0.        ],
       [0.        ],
       [0.07624929],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.375     ],
       [0.11785113],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.28347335],
       [0.14002801],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.05076731],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.09128709],
       [0.04052204],
       [0.06350006],
       [0.07856742],
       [0.        ],
       [0.05773503],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.   

In [72]:
score = score.flatten()
score

array([0.41537358, 0.        , 0.        , 0.21821789, 0.        ,
       0.        , 0.        , 0.19611614, 0.        , 0.        ,
       0.        , 0.18257419, 0.        , 0.        , 0.        ,
       0.2236068 , 0.        , 0.        , 0.07624929, 0.        ,
       0.        , 0.        , 0.375     , 0.11785113, 0.        ,
       0.        , 0.        , 0.28347335, 0.14002801, 0.        ,
       0.        , 0.        , 0.        , 0.05076731, 0.        ,
       0.        , 0.        , 0.        , 0.09128709, 0.04052204,
       0.06350006, 0.07856742, 0.        , 0.05773503, 0.        ,
       0.        , 0.        , 0.        , 0.04079085, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.08703883, 0.        , 0.01155317,
       0.        , 0.        , 0.        , 0.        , 0.07715167,
       0.        , 0.        , 0.        , 0.        , 0.     

In [73]:
import numpy as np

In [76]:
argsort_ranking = np.argsort(score)
argsort_ranking

array([473, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 601, 612,
       614, 615, 616, 617, 618, 620, 621, 622, 624, 625, 613, 600, 597,
       596, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581,
       582, 583, 584, 585, 586, 589, 590, 591, 592, 594, 595, 627, 628,
       629, 631, 664, 665, 666, 667, 668, 670, 671, 673, 674, 675, 676,
       677, 678, 679, 680, 682, 683, 684, 686, 687, 688, 689, 690, 663,
       569, 662, 660, 632, 633, 634, 635, 636, 637, 638, 640, 641, 642,
       643, 644, 645, 646, 647, 649, 650, 651, 652, 653, 655, 658, 659,
       661, 567, 566, 564, 478, 479, 480, 481, 482, 483, 484, 486, 487,
       488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500,
       501, 477, 504, 476, 474, 433, 434, 437, 438, 441, 442, 443, 444,
       446, 447, 453, 459, 460, 461, 462, 463, 466, 467, 468, 469, 471,
       472, 946, 475, 691, 505, 507, 539, 540, 541, 542, 543, 544, 545,
       546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 55

In [77]:
argsort_ranking[-5:]

array([452, 764, 449,  22,   0])

In [79]:
df.iloc[22].text

"It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."

In [82]:
fields = ["section", "question", "text"]

In [98]:
matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[f])

    vectorizers[f] = cv
    matrices[f] = X

In [99]:
matrices

{'section': <948x67 sparse matrix of type '<class 'numpy.float64'>'
 	with 3094 stored elements in Compressed Sparse Row format>,
 'question': <948x562 sparse matrix of type '<class 'numpy.float64'>'
 	with 4333 stored elements in Compressed Sparse Row format>,
 'text': <948x2118 sparse matrix of type '<class 'numpy.float64'>'
 	with 26463 stored elements in Compressed Sparse Row format>}

In [100]:
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [101]:
vectorizers

{'section': TfidfVectorizer(min_df=3, stop_words='english'),
 'question': TfidfVectorizer(min_df=3, stop_words='english'),
 'text': TfidfVectorizer(min_df=3, stop_words='english')}

### Score Accross All Fields

- Create an array with all zeros
- Loop over all the fields, for each field compute similarity
- Sum similarities accross all the fields

In [102]:
n = len(df)
n

948

In [155]:
score = np.zeros(n)

query = "I just singned up. Is it too late to join the course?"

# This simply give more importance to some fields over the others.
# in the example below, the question field has 3 times more importance than other fields.
# this is typically applied in elastic searches.
boosts = {
    "question": 3
}

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]

    # get boost value, if none return 1.0
    boost = boosts.get(f, 1.0)

    # convert to 1D array instead of 2D array
    f_score = cosine_similarity(X, q).flatten()
    score = score + (boost * f_score)

In [156]:
score

array([3.38295811, 3.49512426, 2.70735166, 1.68424985, 3.49512426,
       3.49512426, 1.26102286, 3.03149832, 2.67242848, 3.49512426,
       2.45169338, 1.43004364, 0.49512426, 0.49512426, 0.49512426,
       0.73042693, 0.49512426, 1.68417129, 0.54107765, 0.49512426,
       0.49512426, 0.49512426, 0.72180425, 0.57465357, 0.49512426,
       0.49512426, 0.49512426, 0.68461965, 0.57823165, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.77533273, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.65996855, 0.51826746,
       0.5293658 , 1.981508  , 0.49512426, 0.52680125, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     

In [157]:
idx = np.argsort(score)[-5:]

In [158]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...


In [159]:
filters = {
    "course": "data-engineering-zoomcamp"
}

In [160]:
for field, value in filters.items():
    # create boolean mask: .values() method converts it into numpy array
    mask = (df[field] == value).astype(int).values
    score = score * mask
mask

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [161]:
score

array([3.38295811, 3.49512426, 2.70735166, 1.68424985, 3.49512426,
       3.49512426, 1.26102286, 3.03149832, 2.67242848, 3.49512426,
       2.45169338, 1.43004364, 0.49512426, 0.49512426, 0.49512426,
       0.73042693, 0.49512426, 1.68417129, 0.54107765, 0.49512426,
       0.49512426, 0.49512426, 0.72180425, 0.57465357, 0.49512426,
       0.49512426, 0.49512426, 0.68461965, 0.57823165, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.77533273, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.65996855, 0.51826746,
       0.5293658 , 1.981508  , 0.49512426, 0.52680125, 0.        ,
       0.        , 0.        , 0.        , 0.02097473, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04722243, 0.        , 0.0073737 ,
       0.        , 0.        , 0.        , 0.        , 0.04161211,
       0.        , 0.        , 0.        , 0.        , 0.     

In [162]:
# sort such that most important comes first
idx = np.argsort(-score)[:5]

In [163]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...


### Putting It All Together

In [164]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [165]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

### Embedding Search

- Vector search is useful when the words do not match exactly.
- [Singular value decomposition](https://en.wikipedia.org/wiki/Singular_value_decomposition), reducing the dimension of a matrix

In [166]:
X

<948x2118 sparse matrix of type '<class 'numpy.float64'>'
	with 26463 stored elements in Compressed Sparse Row format>

![Singular value decomposition image sample](https://camo.githubusercontent.com/44c87fdd35471759d83956ef7a33da45041d99c6737010a710dafea5e14157a3/687474703a2f2f686162726173746f726167652e6f72672f66696c65732f3835352f6136352f6336322f38353561363563363234646334313734623532366662356530336239383535352e706e67)

[Latent semantic analysis research paper](https://blog.marketmuse.com/glossary/latent-semantic-analysis-definition/#:~:text=Latent%20Semantic%20Analysis%20is%20a,relationships%20between%20terms%20and%20concepts.)

In [167]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = vectorizers['text']

In [168]:
X

<948x2118 sparse matrix of type '<class 'numpy.float64'>'
	with 26463 stored elements in Compressed Sparse Row format>

### Apply Singular Value Decomposition

In [169]:
svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.087999  , -0.07517913,  0.10138475,  0.05200296,  0.05259926,
       -0.0601142 ,  0.02348246,  0.03009545, -0.19619907,  0.33906322,
       -0.06247502,  0.07144709, -0.09002185, -0.07429137,  0.01727628,
       -0.04770202])

#### Shape has now been reduced down to 16 columsn

In [170]:
X_emb.shape

(948, 16)

In [171]:
X_emb[0]

array([ 0.087999  , -0.07517913,  0.10138475,  0.05200296,  0.05259926,
       -0.0601142 ,  0.02348246,  0.03009545, -0.19619907,  0.33906322,
       -0.06247502,  0.07144709, -0.09002185, -0.07429137,  0.01727628,
       -0.04770202])

### Embedding

The dense represenation as the one above is what we call embedding.

In reducing the dimension, the meaning is much preserved as much as possible. Example, words with similar meanings such as "enroll", "sign up", "register" are all reduced to a smaller vector space hence a relationship of meaning similarity is maintained.

The TFIDF vectorizer creates a sparse matrix, which we then reduce to embeddings.

### For quering

In [172]:
query = 'I just singned up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.04353741, -0.03063155,  0.0446772 ,  0.01263105,  0.0261563 ,
       -0.05098121,  0.01354776,  0.0233487 , -0.11430612,  0.18385088,
       -0.04566398,  0.0595562 , -0.06186053, -0.01924867,  0.02838412,
       -0.03840126])

In [173]:
#  dot product
np.dot(X_emb[0], Q_emb[0])

0.11797466260642572

In [174]:
# get cosine similarity and turn into 1D array
score = cosine_similarity(X_emb, Q_emb).flatten()
# sort the scroes and get first 10
idx = np.argsort(-score)[:10]
# get corresponding data
list(df.loc[idx].text)

['If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]',
 'Yes, we will keep all the materials after the course finishes, so you can follo

### Problem With This Approach

The issue with this approach and way of compressing matrices is that, we get negative values. It's difficult to inteprete negative values. A solution to is can be **Non-Negative Matrix Factorization**

"NMF (Non-Negative Matrix Factorization) is a similar concept, except for non-negative input matrices it produces non-negative results.

We can interpret each of the columns (features) of the embeddings as different topic/concents and to what extent this document is about this concept." [source](https://github.com/alexeygrigorev/build-your-own-search-engine/)

In [176]:
from sklearn.decomposition import NMF

In [177]:
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.30496267,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

### for the query

In [179]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.0011455 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17069965,
       0.        , 0.        , 0.        , 0.00072061, 0.        ,
       0.        ])

In [180]:
np.dot(X_emb, Q_emb)

ValueError: shapes (948,16) and (1,16) not aligned: 16 (dim 1) != 1 (dim 0)

In [181]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ 

In [183]:
df.loc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
437,machine-learning-zoomcamp,General course-related questions,What if I miss a session?,"Everything is recorded, so you won’t miss anyt..."
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."


### Problem With All The Above Applied Methods

The issue with all the methods we have used so far, is that, the embeddings are all created from **Bag Of Word(BoW)** and in BoW we loose information on the order of the words and semantic relationship between words.

### Readings To Do

1. [Comprehensive Guide To Ranking](https://towardsdatascience.com/comprehensive-guide-to-ranking-evaluation-metrics-7d10382c1025)

### BERT

The issue with the previous two methods is that they ignore the order of words, treating each word independently. This is why it's referred to as "Bag-of-Words." BERT and other transformer models address this problem by considering word order. To create embeddings with BERT, we will use the Hugging Face library.

In [184]:
! pip -q install transformers tqdm

### Download Model And Tokenizer

- **Tokenizer:** Turns text into some numeric matrix representation aka **vectors**.
- **Model:** For compressing the text into embeddings

In [185]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### Using The Model

In [186]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [187]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [188]:
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [190]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    # this is the embedding itself
    hidden_states = outputs.last_hidden_state

In [191]:
hidden_states

tensor([[[ 1.0103e-01,  1.8106e-02,  1.3034e-01,  ..., -2.9319e-01,
           1.8632e-01,  6.6145e-01],
         [ 1.0608e+00, -1.2425e-01,  1.3701e-01,  ..., -1.6050e-01,
           1.0429e+00,  3.5325e-01],
         [ 1.8022e-01,  7.7588e-02,  3.9414e-01,  ..., -1.3787e-01,
           5.9744e-01,  1.7035e-01],
         ...,
         [ 4.7383e-01, -1.8445e-02,  2.1863e-01,  ..., -1.2885e-03,
          -8.3294e-02, -2.1699e-01],
         [ 6.5164e-01,  1.2163e-01, -2.4941e-01,  ...,  1.5567e-01,
          -5.6319e-01, -4.3100e-01],
         [ 7.1638e-01,  2.1572e-01, -2.8087e-02,  ...,  2.2812e-01,
          -6.7250e-01, -3.2448e-01]],

        [[ 3.1965e-01, -2.4620e-01,  1.9934e-01,  ..., -2.4255e-01,
          -1.0942e-01,  5.8847e-01],
         [-6.9823e-01, -7.5619e-01,  1.0645e-01,  ..., -1.1348e-01,
           4.5499e-01,  4.0241e-01],
         [ 4.1643e-01, -4.7885e-01,  3.2889e-01,  ..., -5.1462e-01,
           6.5775e-02,  6.9717e-01],
         ...,
         [ 1.0277e-01, -8

In [195]:
hidden_states[0]

tensor([[ 0.1010,  0.0181,  0.1303,  ..., -0.2932,  0.1863,  0.6615],
        [ 1.0608, -0.1242,  0.1370,  ..., -0.1605,  1.0429,  0.3532],
        [ 0.1802,  0.0776,  0.3941,  ..., -0.1379,  0.5974,  0.1704],
        ...,
        [ 0.4738, -0.0184,  0.2186,  ..., -0.0013, -0.0833, -0.2170],
        [ 0.6516,  0.1216, -0.2494,  ...,  0.1557, -0.5632, -0.4310],
        [ 0.7164,  0.2157, -0.0281,  ...,  0.2281, -0.6725, -0.3245]])

In [196]:
hidden_states[0].shape

torch.Size([15, 768])

#### Now we need to compress the embeddings

We do this by simply taking the sum on each row.

In [197]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

#### And convert them to a numpy array

In [198]:
X_emb = sentence_embeddings.numpy()

#### Note that if use a GPU, first you need to move your tensors to CPU

In [199]:
sentence_embeddings_cpu = sentence_embeddings.cpu()

In [203]:
from tqdm import tqdm

In [204]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [205]:
texts = df['text'].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state
        
        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

final_embeddings = np.vstack(all_embeddings)

100%|████████████████████████████████████████| 119/119 [11:16<00:00,  5.69s/it]


### Putting It Together Into A Function

In [206]:
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [None]:
df["embeddings"] = compute_embeddings(df['text'].tolist())
df["embeddings"]

### Futher Reading

1. [Inverted Index In Information Retrieval](https://www.khoury.northeastern.edu/home/vip/teach/IRcourse/2_indexing_ngrams/slides/indexing_1.pdf)
2. [Posting List In Information Retrieval](https://nlp.stanford.edu/IR-book/html/htmledition/an-example-information-retrieval-problem-1.html#:~:text=Each%20item%20in%20the%20list,referred%20to%20as%20the%20postings%20.)
3. [Local Sensitivity Hashing Random Projection](https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing-random-projection/)