In [13]:
# Import Libraries

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import BertModel, BertTokenizer

In [2]:
# Read data: load the course Q&A documents from a CSV file located next to
# the notebook (the path is relative to the working directory).

df = pd.read_csv('documents.csv')
# Show the first rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [3]:
# (rows, columns) of the full dataset, before any filtering is applied
df.shape

(948, 4)

In [4]:
# Restrict the dataset to a single course, then display the new shape to
# see how many rows remain.
course_mask = df['course'] == 'data-engineering-zoomcamp'
df = df.loc[course_mask]
df.shape

(435, 4)

In [5]:
# Fit one independent TF-IDF model per text field so that each field gets
# its own vocabulary and its own document-term matrix.
fields = ['section', 'question', 'text']
vectorizers, matrices = {}, {}

for field in fields:
    # English stop words are dropped; min_df=3 discards terms that occur
    # in fewer than three documents of this field.
    v = TfidfVectorizer(stop_words='english', min_df=3)
    X = v.fit_transform(df[field])
    vectorizers[field], matrices[field] = v, X

In [6]:
# Inspect the fitted TF-IDF matrices, keyed by field name
matrices

{'section': <435x37 sparse matrix of type '<class 'numpy.float64'>'
 	with 1565 stored elements in Compressed Sparse Row format>,
 'question': <435x285 sparse matrix of type '<class 'numpy.float64'>'
 	with 2113 stored elements in Compressed Sparse Row format>,
 'text': <435x1177 sparse matrix of type '<class 'numpy.float64'>'
 	with 12020 stored elements in Compressed Sparse Row format>}

In [7]:
# Vocabulary learned by the vectorizer fitted on the 'text' field
vectorizers['text'].get_feature_names_out()

array(['01', '02', '03', ..., 'zones', 'zoom', 'zoomcamp'], dtype=object)

In [8]:
# Vocabulary learned by the vectorizer fitted on the 'section' field
vectorizers['section'].get_feature_names_out()

array(['analytics', 'column', 'course', 'cpp_type', 'data', 'dbt',
       'dlthub', 'docker', 'does', 'dolocationid', 'double',
       'engineering', 'error', 'external_fhv_tripdata', 'general',
       'int64', 'kafka', 'match', 'message', 'module', 'orchestration',
       'parquet', 'project', 'pyspark', 'questions', 'reading', 'related',
       'risingwave', 'streaming', 'table', 'target', 'terraform',
       'trips_data_all', 'type', 'warehousing', 'workflow', 'workshop'],
      dtype=object)

In [9]:
query = "I just signed up. Is it too late to join the course?"

# Project the query into the 'text' TF-IDF space and compare it with every
# document; the result is one similarity value per row of df.
text_vectorizer = vectorizers['text']
q = text_vectorizer.transform([query])
score = cosine_similarity(matrices['text'], q).ravel()

In [10]:
# Positions of the highest-scoring rows, best match first. The positions
# refer to rows of the filtered frame, hence iloc rather than loc.
top_n = 5
indx = (-score).argsort()[:top_n]
df.iloc[indx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
22,data-engineering-zoomcamp,General course-related questions,Environment - Do we really have to use GitHub ...,It's up to you which platform and environment ...
27,data-engineering-zoomcamp,General course-related questions,Environment - The GCP and other cloud provider...,You can do most of the course without a cloud....
287,data-engineering-zoomcamp,Module 4: analytics engineering with dbt,CREATE TABLE has columns with duplicate name l...,This error could result if you are using some ...
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...


In [11]:
# Boosting the question field: every field contributes its cosine
# similarity to the total, and a match on 'question' counts three times
# as much as a match on any field without an explicit boost.

boost = {'question': 3.0}

# Start from zero for every document, then add each field's contribution.
score = np.zeros(df.shape[0])

for field in fields:
    b = boost[field] if field in boost else 1.0
    q = vectorizers[field].transform([query])
    s = cosine_similarity(matrices[field], q).ravel()
    score += b * s

In [12]:
# Take the five best rows under the boosted score and show them as a list
# of plain dicts, one per document.
n_results = 5
idx = (-score).argsort()[:n_results]
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course