## Question 2

In [22]:
import io

import requests
import docx
import hashlib

from datetime import datetime
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [50]:
def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_faq(file_id):
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'
    
    response = requests.get(url)
    response.raise_for_status()
    
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'
    
    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''
     
    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)
    
        if len(p_text) == 0:
            continue
    
        if style == section_heading_style:
            section_title = p_text
            continue
    
        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''
    
            question_title = p_text
            continue
        
        answer_text_so_far += '\n' + p_text
    
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

In [51]:
faq_documents = {
    # LLM Version 1
    'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E',
    # LLM Version 2
    #'llm-zoomcamp': '1T3MdwUvqCL3jrh3d3VCXQ8xE0UqRzI3bfgpfBq3ZWG0',
}

In [52]:
documents = []

for course, file_id in faq_documents.items():
    print(course)
    course_documents = read_faq(file_id)
    documents.append({'course': course, 'documents': course_documents})

llm-zoomcamp


In [53]:
len(documents)

1

## Question 3

In [54]:
data = documents[0]

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

def chunking():
    documents = []
    
    for doc in data['documents']:
        doc['course'] = data['course']
        # previously we used just "id" for document ID
        doc['document_id'] = generate_document_id(doc)
        documents.append(doc)
    
    print(len(documents))
    
    return documents

documents = chunking()

86


## Question 4

In [55]:
es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name_prefix = "documents"
current_time = datetime.now().strftime("%Y%m%d_%M%S")
index_name = f"{index_name_prefix}_{current_time}"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents_20240817_3340'})

In [56]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 86/86 [00:02<00:00, 31.16it/s]


In [59]:
doc


{'text': 'Answer',
 'section': 'Workshops: X',
 'question': 'Question',
 'course': 'llm-zoomcamp',
 'document_id': 'd8c4c7bb'}

## Question 5 and 6

In [31]:
def elastic_search(query, course='llm-zoomcamp'):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # "filter": {
                #     "term": {
                #         "course": course
                #     }
                # }
            }
        }
    }
    response = es_client.search(index='documents_20240817_2343', body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [32]:
results = elastic_search(
    query="When is the next cohort?"
)
results

[{'text': 'Summer 2026.',
  'section': 'General course-related questions',
  'question': 'When is the next cohort?',
  'course': 'llm-zoomcamp',
  'document_id': 'b6fa77f3'},
 {'text': 'Cosine similarity is a measure used to calculate the similarity between two non-zero vectors, often used in text analysis to determine how similar two documents are based on their content. This metric computes the cosine of the angle between two vectors, which are typically word counts or TF-IDF values of the documents. The cosine similarity value ranges from -1 to 1, where 1 indicates that the vectors are identical, 0 indicates that the vectors are orthogonal (no similarity), and -1 represents completely opposite vectors.',
  'section': 'Module 3: X',
  'question': 'What is the cosine similarity?',
  'course': 'llm-zoomcamp',
  'document_id': 'ee355823'},
 {'text': 'The error indicates that you have not changed all instances of “employee_handbook” to “homework” in your pipeline settings',
  'section': 