In [145]:
#!curl https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [107]:
from bs4 import BeautifulSoup
import glob
import pandas as pd
import numpy as np
import re
import time
pd.set_option('display.max_colwidth', None)




## Step 1: Prepare documents (Extracting and Transforming)

In [106]:
# Load HTML files
def wrangle(paths):
    dfs=[]
    # forward fill after removed text from date column
    def clean_date(date):
                # Regular expression pattern for valid dates (e.g., '13 March 2019')
                pattern = r'^\d{1,2}\s\w+\s\d{4}$'  # Matches 'DD Month YYYY'
                if re.match(pattern, date):
                    return date
                else:
                    return None  # Replace invalid dates with None
    
    for html in paths:
        
        with open( html, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
    
            # Extract messages
            messages = []
            for message in soup.find_all('div', class_='message'):
                date = message.find('div', class_='body details').text.strip() if message.find('div', class_='body details') else None
                time = message.find('div', class_='date').text.strip() if message.find('div', class_='date') else None
                user = message.find('div', class_='from_name').text.strip() if message.find('div', class_='from_name') else None
                content = message.find('div', class_='text').text.strip() if message.find('div', class_='text') else None
                messages.append({"Date":date, "Time":time, "User": user,"Content": content})
            
            # Convert to DataFrame
            df = pd.DataFrame(messages)
            # fill in the na values by forward fill for dates
            df['Date']=df['Date'].ffill()
            # transform the date column by replacing any message about 'someone invited someone' in the group or removed someone with none
            df['Date']=df['Date'].apply(clean_date)
            df['Date']=df['Date'].ffill()
            # drop rows that have three nans. It is the date heading
            df=df.dropna(thresh=2)
            # forward fill user where nan
            df['User']=df['User'].ffill()
            # drop empty content it is usually pics
            df=df.dropna(subset='Content')
            dfs.append(df)
            print(df.head(1))
            
    return dfs

In [107]:
# file pattern
file_pattern=r'C:\Users\user\Documents\LLM Zoomcamp\Project\data\raw_data\messages*.html'

In [118]:
# Get a list of all matching files
files = glob.glob(file_pattern)  # we matched 438 files
len(files)
# for f in files:
#     print(f)

438

In [147]:
import time

t0 = time.time()
# Import html files using wrangle function, transform and combine them
#dfs=wrangle(files)
t1 = time.time()
total = t1-t0
# concat dataframes
#df=pd.concat(dfs, ignore_index=True)

df=df.dropna()
# Save to CSV
# df.to_csv('../data/clean_data/combo_conversations_latest.csv', index=False)
# df.isnull().sum()

In [148]:
print(total/60)

0.0


In [120]:
# import csv file
df = pd.read_csv('../data/clean_data/combo_conversations_latest.csv', parse_dates=['Date'])
#df.info()
df['Date']

0        2019-02-14
1        2019-02-14
2        2019-02-14
3        2019-02-14
4        2019-02-14
            ...    
414123   2021-04-28
414124   2021-04-28
414125   2021-04-28
414126   2021-04-28
414127   2021-04-28
Name: Date, Length: 414128, dtype: datetime64[ns]

In [122]:
df.head()

Unnamed: 0,Date,Time,User,Content
0,2019-02-14,08:02,Silmavi,Hello!
1,2019-02-14,08:05,Deleted Account,"sooo, detox finished when? üòÑ"
2,2019-02-14,08:05,Silmavi,Here we can talk about Aajonus?
3,2019-02-14,08:07,Deleted Account,Detox! Hell yeah Aajonus
4,2019-02-14,08:08,Silmavi,üòÇüòÇüòÇüòÇ


In [123]:
df.head(10)

Unnamed: 0,Date,Time,User,Content
0,2019-02-14,08:02,Silmavi,Hello!
1,2019-02-14,08:05,Deleted Account,"sooo, detox finished when? üòÑ"
2,2019-02-14,08:05,Silmavi,Here we can talk about Aajonus?
3,2019-02-14,08:07,Deleted Account,Detox! Hell yeah Aajonus
4,2019-02-14,08:08,Silmavi,üòÇüòÇüòÇüòÇ
5,2019-02-14,08:44,Deleted Account,"currently recovering from cheat beer and meals, face looks red burned and itchy, fasting for 48h+ and then slowly eating liver and yoghurt. its crazy how well im currently at reading my body needs"
6,2019-02-14,08:55,Deleted Account,what is aajonos saying about fasting?
7,2019-02-14,08:56,Deleted Account,Not to lol
8,2019-02-14,09:00,Deleted Account,5h and cell become cannibalistic?
9,2019-02-14,09:01,Deleted Account,I‚Äôve always found that funny


In [124]:
# Set the 'Date' column as the index of the DataFrame
df.set_index('Date', inplace=True)
# Sort the DataFrame by the index (Date)
df.sort_index(inplace=True)
df.tail()

Unnamed: 0_level_0,Time,User,Content
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-09-14,03:15,Mziskaci,are wild animals safe to eat raw?
2024-09-14,01:16,Adrian Marquelos,I saw this on eat raw meat channel. I wouldn‚Äôt pay heaps of money to them because I know there is very knowledgeable people in here like you and sara anyway . Is the abuser ur talking about the lina woman?
2024-09-14,00:31,Colbyüêç,You got vaxxed even after listening to aajonus for a couple years?
2024-09-14,08:00,Gabriel Olivares,"Is it possible for a detox to last 4 months or more, with symptoms such as fatigue, mental confusion, general malaise and daily aches and pains?"
2024-09-14,03:50,Marcel,"What to do with 10L of raw milk that are already a bit over 3 weeks old and quite fermented?(kept in the fridge, but fridge has fluctuating temperature)Anybody tried making cheese from old fermented milk?"


In [125]:
chunked_data = []
current_chunk = []
t0 = time.time()
for i in range(len(df)):
    if i == 0 or df.iloc[i]['User'] == df.iloc[i-1]['User']:
        current_chunk.append(df.iloc[i]['Content'])
    else:
        chunked_data.append(" ".join(current_chunk))
        current_chunk = [df.iloc[i]['Content']]

# Append the last chunk
if current_chunk:
    chunked_data.append(" ".join(current_chunk))
print(type(chunked_data))
# Create a new DataFrame with chunked data
chunked_df = pd.DataFrame({'Chunked_Content': chunked_data})

# Save the chunked data
chunked_df.to_csv('../data/clean_data/thread_chunked_conversations.csv', index=False)

t1 = time.time()
total = t1-t0
print(total/60)


<class 'list'>
1.9788364926973978


In [133]:
chunked_df=pd.read_csv('../data/clean_data/thread_chunked_conversations.csv')
chunked_df

Unnamed: 0,Chunked_Content
0,Hello! when you eat it alone? Good idea to eat it once wuth honey and once alone My kids eat it with the meal.They aren't at home btw meals I will do it during week end
1,"Yesterday I had nausea after several hours passed from second meal.Today I had zero hunger until 20h+ passed after previous that meal Today I made only one typical meal enhanced with 4 egg yolks So they ate clay? Act Charcoal will do the job? 4,5 h passed I feel neutral, zero hunger and body warmed itself to confort temperature. Before meal I felt colder than normal. Thx Ring"
2,So when you have enough fat you feel nausea if you eat it?
3,"So guys, water=bad?"
4,I have dry skin even if i drink twice lubrication formula
...,...
241858,are wild animals safe to eat raw?
241859,I saw this on eat raw meat channel. I wouldn‚Äôt pay heaps of money to them because I know there is very knowledgeable people in here like you and sara anyway . Is the abuser ur talking about the lina woman?
241860,You got vaxxed even after listening to aajonus for a couple years?
241861,"Is it possible for a detox to last 4 months or more, with symptoms such as fatigue, mental confusion, general malaise and daily aches and pains?"


In [129]:
# create new variables

chunked_data = []
current_chunk = []
question_found = False

for i in range(len(df)):
    message = df.iloc[i]['Content']
    
    # Check if the message is a question
    if message.endswith('?'):
        # If there's already a chunk being built (a question and its answers), finalize it
        if current_chunk:
            chunked_data.append(" ".join(current_chunk))
        # Start a new chunk with the current question
        current_chunk = [message]
        question_found = True
    else:
        # If the message isn't a question, but a question has been found, add it as an answer
        if question_found:
            current_chunk.append(message)
        # If no question has been found yet, skip this message or handle it as needed
        # (e.g., you could append it to the previous chunk if desired)

# Append the last chunk after the loop, if it exists
if current_chunk:
    chunked_data.append(" ".join(current_chunk))

# Create a new DataFrame with the chunked data
chunked_df_QandA = pd.DataFrame({'Chunked_Content': chunked_data})

# Save the chunked data to a CSV file
chunked_df_QandA.to_csv('../data/clean_data/question_answer_chunked_conversations.csv', index=False)


In [142]:
chunked_df_QandA['Chunked_Content'].split()


AttributeError: 'Series' object has no attribute 'split'

## Create Documents

In [91]:
# create documents
documents =chunked_df.to_dict(orient='records')

In [92]:
documents[0:20]

[{'Chunked_Content': "Hello! when you eat it alone? Good idea to eat it once wuth honey and once alone My kids eat it with the meal.They aren't at home btw meals I will do it during week end"},
 {'Chunked_Content': 'Yesterday I had nausea after several hours passed from second meal.Today I had zero hunger until 20h+ passed after previous that meal Today I made only one typical meal enhanced with 4 egg yolks So they ate clay? Act Charcoal will do the job? 4,5 h passed I feel neutral, zero hunger and body warmed itself to confort temperature. Before meal I felt colder than normal. Thx Ring'},
 {'Chunked_Content': 'So when you have enough fat you feel nausea if you eat it?'},
 {'Chunked_Content': 'So guys, water=bad?'},
 {'Chunked_Content': 'I have dry skin even if i drink twice lubrication formula'},
 {'Chunked_Content': "@ubeats1 could you please advice any source/book/literature about detoxification? Besides Aajonuse's books? Funny, just yesterday I committed myself to eating way more 

In [31]:
len(documents)

226272

In [32]:
import minsearch
import json

In [None]:
index = minsearch.Index(
    text_fields=["Chunked_Content"],
    keyword_fields=["User"]
)

In [37]:
index = minsearch.Index(
    text_fields=["Chunked_Content"],
    keyword_fields=[]
)

In [38]:
index

<minsearch.Index at 0x24e536842c0>

In [39]:
index.fit(documents)

<minsearch.Index at 0x24e536842c0>

In [40]:
def search(query):
    boost = {'Chunked_Content': 3.0}

    results = index.search(
        query=query,
        boost_dict=boost,
        num_results=5
    )

    return results

In [53]:
q = 'How should I eat beetroot?'

In [54]:
search(q)

[{'Chunked_Content': 'Kim your thoughts on beetroot-juice ?'},
 {'Chunked_Content': 'add beetroot juice to daily juice'},
 {'Chunked_Content': 'should I eat it'},
 {'Chunked_Content': 'You should eat it'},
 {'Chunked_Content': 'How much meat a day should i eat?'}]

## Step 2: Create Embeddings using Pretrained Models

In [55]:
# This is a new library compared to the previous modules. 
# Please perform "pip install sentence_transformers==2.7.0"
from sentence_transformers import SentenceTransformer

# if you get an error do the following:
# 1. Uninstall numpy 
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# run the above cell, it should work
model = SentenceTransformer("all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [56]:
# test it
len(model.encode("This is a simple sentence"))  # 768
model.encode("This is a simple sentence")

array([ 4.44872770e-03, -7.61314631e-02, -3.77453602e-04,  7.52524380e-03,
       -3.80979627e-02,  3.80131193e-02, -9.73005779e-03, -5.05396398e-03,
       -9.37977899e-03,  1.23888236e-02,  4.91276681e-02,  1.52209969e-02,
        3.80008481e-02, -6.41802773e-02,  9.42129176e-03, -5.19748889e-02,
        9.08066854e-02,  1.71115436e-02,  1.62125509e-02,  2.98866015e-02,
        1.50537817e-03,  8.35078210e-03,  3.78842130e-02, -1.01192892e-02,
        6.46104896e-03,  3.97503209e-05, -1.45217460e-02, -1.88468937e-02,
       -3.74039337e-02, -1.51664275e-03, -1.02680037e-02, -3.68062854e-02,
        2.36677546e-02, -6.46023080e-02,  1.96967039e-06, -5.01106260e-03,
       -2.80828355e-03, -1.92073472e-02, -8.65120292e-02,  2.83464640e-02,
       -5.38667329e-02,  3.63706015e-02, -2.26467997e-02,  2.87367646e-02,
       -1.32342046e-02,  1.08689599e-01,  3.70518677e-02,  3.38802300e-02,
       -5.30679561e-02,  3.61782275e-02, -1.35723129e-03, -3.63482870e-02,
       -2.78346427e-02, -

In [58]:
# import tqdm
from tqdm.auto import tqdm
#created the dense vector using the pre-trained model
operations = []
t0 = time.time()
for doc in tqdm(documents):
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["Chunked_Content"]).tolist()
    operations.append(doc)
t1 = time.time()
total=t1-t0
print(f'{total/60} minutes')

  0%|          | 0/226272 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
operations[0]

In [None]:
import pickle

# save dictionary to person_data.pkl file
with open('vectorized_data.pkl', 'wb') as fp:
    pickle.dump(operations, fp)
    print('dictionary saved successfully to byte file')

In [None]:
import json

print("Started writing dictionary to a file")
with open("vectorized_data.json", "w") as fp:
    json.dump(operations, fp)  # encode dict into JSON
print("Done writing dict into .txt file")

In [None]:
# Read dictionary pkl file
with open('person_data.pkl', 'rb') as fp:
    person = pickle.load(fp)
    print('Person dictionary')
    print(person)

In [None]:
# Open the file for reading
with open("person.txt", "r") as fp:
    # Load the dictionary from the file
    person_dict = json.load(fp)

# Print the contents of the dictionary
print(person_dict)

In [66]:
# Indexing with elastic search
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()




ObjectApiResponse({'name': '6b8eba599138', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'dXWGvNjVS3KrouBEUVKMNw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [67]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Date": {"type": "text"},
            "User": {"type": "keyword"},
            "Content": {"type": "text"} ,
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [68]:
index_name = "diet-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'diet-questions'})

## Step 5: Add documents into index

In [70]:
for doc in tqdm(operations):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6800/6800 [08:21<00:00, 13.55it/s]


## Step 6: Create end user query

In [87]:
search_term = "is eating meat healthy?"
vector_search_term = model.encode(search_term)

In [88]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}

In [89]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'diet-questions',
  '_id': 'Tk1Q_ZEB-BYxbY86ujWK',
  '_score': 0.8274733,
  '_source': {}},
 {'_index': 'diet-questions',
  '_id': 'M01L_ZEB-BYxbY86JSI8',
  '_score': 0.82315207,
  '_source': {}},
 {'_index': 'diet-questions',
  '_id': 'Xk1N_ZEB-BYxbY86zyuC',
  '_score': 0.8134525,
  '_source': {}},
 {'_index': 'diet-questions',
  '_id': '9U1S_ZEB-BYxbY86lDsM',
  '_score': 0.81185174,
  '_source': {}},
 {'_index': 'diet-questions',
  '_id': '401P_ZEB-BYxbY86tjHW',
  '_score': 0.8058151,
  '_source': {}}]

## Step 7: Perform Keyword search with Semantic Search (Hybrid/Advanced Search)

In [90]:
# Note: I made a minor modification to the query shown in the notebook here
# (compare to the one shown in the video)
# Included "knn" in the search query (to perform a semantic search) along with the filter  
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [91]:
response = es_client.search(
    index=index_name,
    query={
        "match": {"User": "Silmavi"},
    },
    knn=knn_query,
    size=5
)

In [92]:
response["hits"]["hits"]

[{'_index': 'diet-questions',
  '_id': 'y01L_ZEB-BYxbY86CCGL',
  '_score': 3.7038417,
  '_source': {'Date': '2019-02-24T00:00:00',
   'User': 'Silmavi',
   'Content': 'reallyüòÅü§®ü§®ü§®',
   'text_vector': [-0.010448154993355274,
    0.04629620537161827,
    0.007511643692851067,
    0.022471485659480095,
    0.032183099538087845,
    0.032937824726104736,
    -0.05606511980295181,
    0.04334476590156555,
    0.008313382975757122,
    0.0069734156131744385,
    0.03444882482290268,
    0.020371366292238235,
    -0.024321354925632477,
    0.02770829387009144,
    -0.021856702864170074,
    -0.015297445468604565,
    -0.024510452523827553,
    -0.0041328854858875275,
    0.028379207476973534,
    0.003877934068441391,
    0.035581305623054504,
    0.0342014916241169,
    -0.008496076799929142,
    -0.06302068382501602,
    -0.02258117124438286,
    -0.02990863472223282,
    -0.01822284609079361,
    -0.04249770939350128,
    0.06586158275604248,
    -0.03832273930311203,
    0.0016