In [1]:
import requests 

# Download the FAQ dump. As the loop below relies on, the JSON is a list of
# course entries, each with a 'course' name and a list of 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Tag each entry with its course so the flat list can be filtered later.
        # Note: this adds the key to the dict inside documents_raw as well.
        doc['course'] = course_name
        documents.append(doc)

In [5]:
# Peek at the first flattened record to check its keys.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
import pandas as pd

# Tabulate the flattened FAQ records with an explicit column order.
faq_columns = ['course', 'section', 'question', 'text']
df = pd.DataFrame(documents, columns=faq_columns)
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [6]:
# Keep only the rows whose course is data-engineering-zoomcamp.
# `df` is rebound to the filtered frame.
is_de_zoomcamp = df['course'] == 'data-engineering-zoomcamp'
df = df[is_de_zoomcamp]
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
430,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
431,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
432,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
433,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


In [7]:
# Five short toy sentences with overlapping words ("January", "course",
# "register", "prerequisites"), passed to the vectorizers in the cells below.
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer; stop_words='english' selects scikit-learn's
# built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')
# Learn the vocabulary from the example sentences.
vectorizer.fit(docs_example)

In [None]:
# Vocabulary terms learned by fit(); reused as the DataFrame column labels
# when the encoded matrix is tabulated.
vocab = vectorizer.get_feature_names_out()
vocab

array(['catalog', 'cloud', 'course', 'details', 'end', 'google',
       'homework', 'january', 'listed', 'month', 'prerequisites',
       'python', 'register', 'setup', 'submit'], dtype=object)

In [None]:
# Encode each example sentence as one row of per-term counts.
X = vectorizer.transform(docs_example)
# Convert to a plain array so the full matrix is displayed.
X.toarray()

array([[0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0]])

In [14]:
# Label the count matrix: one row per example sentence, one column per vocabulary term.
df_docs = pd.DataFrame(X.toarray(), columns=vocab)
df_docs

Unnamed: 0,catalog,cloud,course,details,end,google,homework,january,listed,month,prerequisites,python,register,setup,submit
0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0
1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0
2,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1
3,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0
4,0,1,1,0,0,1,0,1,0,0,0,1,0,1,0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rebinds `vectorizer`, replacing the CountVectorizer from the earlier cell
# with a TF-IDF vectorizer that uses the same stop-word setting.
vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and IDF weights from the example sentences.
vectorizer.fit(docs_example)

In [16]:
# Rebinds `vocab` with the terms learned by the TF-IDF vectorizer.
vocab = vectorizer.get_feature_names_out()
vocab

array(['catalog', 'cloud', 'course', 'details', 'end', 'google',
       'homework', 'january', 'listed', 'month', 'prerequisites',
       'python', 'register', 'setup', 'submit'], dtype=object)

In [17]:
# Rebinds `X`: each example sentence is now a row of TF-IDF weights rather than raw counts.
X = vectorizer.transform(docs_example)
# Convert to a plain array so the full matrix is displayed.
X.toarray()

array([[0.        , 0.        , 0.32842678, 0.68924048, 0.        ,
        0.        , 0.        , 0.32842678, 0.        , 0.        ,
        0.        , 0.        , 0.55607488, 0.        , 0.        ],
       [0.56750154, 0.        , 0.27041752, 0.        , 0.        ,
        0.        , 0.        , 0.27041752, 0.56750154, 0.        ,
        0.45785667, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.22578084, 0.        , 0.47382645,
        0.        , 0.47382645, 0.22578084, 0.        , 0.47382645,
        0.        , 0.        , 0.        , 0.        , 0.47382645],
       [0.        , 0.        , 0.35959372, 0.        , 0.        ,
        0.        , 0.        , 0.35959372, 0.        , 0.        ,
        0.6088451 , 0.        , 0.6088451 , 0.        , 0.        ],
       [0.        , 0.47382645, 0.22578084, 0.        , 0.        ,
        0.47382645, 0.        , 0.22578084, 0.        , 0.        ,
        0.        , 0.47382645, 0.        , 0.47382645, 0.        ]])

In [18]:
# Rebinds `df_docs` with the TF-IDF matrix: one row per example sentence,
# one column per vocabulary term.
df_docs = pd.DataFrame(X.toarray(), columns=vocab)
df_docs

Unnamed: 0,catalog,cloud,course,details,end,google,homework,january,listed,month,prerequisites,python,register,setup,submit
0,0.0,0.0,0.328427,0.68924,0.0,0.0,0.0,0.328427,0.0,0.0,0.0,0.0,0.556075,0.0,0.0
1,0.567502,0.0,0.270418,0.0,0.0,0.0,0.0,0.270418,0.567502,0.0,0.457857,0.0,0.0,0.0,0.0
2,0.0,0.0,0.225781,0.0,0.473826,0.0,0.473826,0.225781,0.0,0.473826,0.0,0.0,0.0,0.0,0.473826
3,0.0,0.0,0.359594,0.0,0.0,0.0,0.0,0.359594,0.0,0.0,0.608845,0.0,0.608845,0.0,0.0
4,0.0,0.473826,0.225781,0.0,0.0,0.473826,0.0,0.225781,0.0,0.0,0.0,0.473826,0.0,0.473826,0.0
