In [195]:
print(pd.__version__)

2.2.2


In [1]:
from bs4 import BeautifulSoup
import glob
import pandas as pd
import numpy as np
import re



## Step 1: Prepare documents

In [375]:
# Load HTML files
def wrangle(paths):
    dfs=[]
    # forward fill after removed text from date column
    def clean_date(date):
                # Regular expression pattern for valid dates (e.g., '13 March 2019')
                pattern = r'^\d{1,2}\s\w+\s\d{4}$'  # Matches 'DD Month YYYY'
                if re.match(pattern, date):
                    return date
                else:
                    return None  # Replace invalid dates with None
    
    for html in paths:
        
        with open( html, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
    
            # Extract messages
            messages = []
            for message in soup.find_all('div', class_='message'):
                date = message.find('div', class_='body details').text.strip() if message.find('div', class_='body details') else None
                time = message.find('div', class_='date').text.strip() if message.find('div', class_='date') else None
                user = message.find('div', class_='from_name').text.strip() if message.find('div', class_='from_name') else None
                content = message.find('div', class_='text').text.strip() if message.find('div', class_='text') else None
                messages.append({"Date":date, "Time":time, "User": user,"Content": content})
            
            # Convert to DataFrame
            df = pd.DataFrame(messages)
            # fill in the na values by forward fill for dates
            df['Date']=df['Date'].ffill()
            # transform the date column by replacing any message about 'someone invited someone' in the group or removed someone with none
            df['Date']=df['Date'].apply(clean_date)
            df['Date']=df['Date'].ffill()
            # drop rows that have three nans. It is the date heading
            df=df.dropna(thresh=2)
            # forward fill user where nan
            df['User']=df['User'].ffill()
            # drop empty content it is usually pics
            df=df.dropna(subset='Content')
            dfs.append(df)
            
    return dfs

In [376]:
# file pattern
file_pattern=r'C:\Users\user\Documents\LLM Zoomcamp\Project\data\raw_data\messages*.html'

In [377]:
# Get a list of all matching files
files = glob.glob(file_pattern)  # we matched 438 files
len(files)

438

In [383]:
import time

t0 = time.time()
# Import html files using wrangle function, transform and combine them
#dfs=wrangle(files)
t1 = time.time()
total = t1-t0
# concat dataframes
#df=pd.concat(dfs, ignore_index=True)
# Save to CSV
#df.to_csv('../data/clean_data/combo_telegram_conversations.csv', index=False)

In [4]:
# import csv file
df = pd.read_csv('../data/clean_data/combo_telegram_conversations.csv')

In [19]:
# removed rows with nulls
df.info()
df.head()
df=df.dropna()
df.isnull().sum()
# df.to_csv('../data/clean_data/combo_telegram_conversations.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 414128 entries, 0 to 414136
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Date     414128 non-null  object
 1   Time     414128 non-null  object
 2   User     414128 non-null  object
 3   Content  414128 non-null  object
dtypes: object(4)
memory usage: 15.8+ MB


Date       0
Time       0
User       0
Content    0
dtype: int64

In [31]:
df['Date']=pd.to_datetime(df['Date'])

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 414128 entries, 0 to 414136
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   Date     414128 non-null  datetime64[ns]
 1   Time     414128 non-null  object        
 2   User     414128 non-null  object        
 3   Content  414128 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 15.8+ MB


In [41]:
import pandas as pd

# Load your dataset
df = pd.read_csv('../data/clean_data/combo_telegram_conversations.csv', parse_dates=['Date'])


# Sample 100 rows from each month
sampled_df = df.groupby(df['Date'].dt.to_period("M")).apply(lambda x: x.sample(n=100, random_state=1)).reset_index(drop=True)

# Save the sampled data
sampled_df.to_csv('../data/clean_data/temporally_sampled_dataset.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6800 entries, 0 to 6799
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     6800 non-null   datetime64[ns]
 1   Time     6800 non-null   object        
 2   User     6800 non-null   object        
 3   Content  6800 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 212.6+ KB


In [55]:
df=pd.read_csv('../data/clean_data/temporally_sampled_dataset.csv', parse_dates=['Date'])
df=df.drop(columns='Time')

## preprocessing

In [56]:
import re
import string

def remove_emojis(text):
    """
    Strip emojis from text by matching all non-characters in the Unicode 'So' (Symbol, Other) category.
    """
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # emoticons
                               "\U0001F300-\U0001F5FF"  # symbols & pictographs
                               "\U0001F680-\U0001F6FF"  # transport & map symbols
                               "\U0001F700-\U0001F77F"  # alchemical symbols
                               "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               "\U0001FA00-\U0001FA6F"  # Chess Symbols
                               "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               "\U00002702-\U000027B0"  # Dingbats
                               "\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def preprocess_text(text, remove_emojis_flag=True):
    """
    Preprocess text by lowercasing, removing punctuation, and optionally removing emojis.
    """
    # Optionally remove emojis
    if remove_emojis_flag:
        text = remove_emojis(text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Example conversation data
conversation_texts = [
    "Hello 😊! How are you doing today?",
    "I'm doing well, thanks! Just finished reading an article on AI. 🤖",
    "That's great! 😃 I've been working on my  1Python skills lately."
]

# Preprocess each conversation entry
preprocessed_texts = [preprocess_text(text) for text in conversation_texts]
print(preprocessed_texts)


['hello how are you doing today', 'im doing well thanks just finished reading an article on ai', 'thats great ive been working on my 1python skills lately']


In [57]:
# preprocess dataframe
df['Content']=df['Content'].apply(lambda text:preprocess_text(text, remove_emojis_flag=False))
# create documents
documents =df.to_dict(orient='records')

In [58]:
documents[0:5]

[{'Date': Timestamp('2019-02-24 00:00:00'),
  'User': 'Silmavi',
  'Content': 'really😁🤨🤨🤨'},
 {'Date': Timestamp('2019-02-25 00:00:00'),
  'User': 'Yan',
  'Content': 'vamoooos'},
 {'Date': Timestamp('2019-02-17 00:00:00'),
  'User': 'Yan',
  'Content': 'that its not the holy grail'},
 {'Date': Timestamp('2019-02-21 00:00:00'),
  'User': 'Deleted Account',
  'Content': 'i want to ensure good physical developement when i still can'},
 {'Date': Timestamp('2019-02-17 00:00:00'),
  'User': 'Deleted Account',
  'Content': 'that’s what it’s made for growth'}]

In [59]:
len(documents)

6800

In [60]:
documents[0]

{'Date': Timestamp('2019-02-24 00:00:00'),
 'User': 'Silmavi',
 'Content': 'really😁🤨🤨🤨'}

## Step 2: Create Embeddings using Pretrained Models

In [61]:
# This is a new library compared to the previous modules. 
# Please perform "pip install sentence_transformers==2.7.0"
from sentence_transformers import SentenceTransformer

# if you get an error do the following:
# 1. Uninstall numpy 
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# run the above cell, it should work
model = SentenceTransformer("all-mpnet-base-v2")



In [62]:
# test it
len(model.encode("This is a simple sentence"))  # 768
model.encode("This is a simple sentence")

array([ 4.44872770e-03, -7.61314631e-02, -3.77453602e-04,  7.52524380e-03,
       -3.80979627e-02,  3.80131193e-02, -9.73005779e-03, -5.05396398e-03,
       -9.37977899e-03,  1.23888236e-02,  4.91276681e-02,  1.52209969e-02,
        3.80008481e-02, -6.41802773e-02,  9.42129176e-03, -5.19748889e-02,
        9.08066854e-02,  1.71115436e-02,  1.62125509e-02,  2.98866015e-02,
        1.50537817e-03,  8.35078210e-03,  3.78842130e-02, -1.01192892e-02,
        6.46104896e-03,  3.97503209e-05, -1.45217460e-02, -1.88468937e-02,
       -3.74039337e-02, -1.51664275e-03, -1.02680037e-02, -3.68062854e-02,
        2.36677546e-02, -6.46023080e-02,  1.96967039e-06, -5.01106260e-03,
       -2.80828355e-03, -1.92073472e-02, -8.65120292e-02,  2.83464640e-02,
       -5.38667329e-02,  3.63706015e-02, -2.26467997e-02,  2.87367646e-02,
       -1.32342046e-02,  1.08689599e-01,  3.70518677e-02,  3.38802300e-02,
       -5.30679561e-02,  3.61782275e-02, -1.35723129e-03, -3.63482870e-02,
       -2.78346427e-02, -

In [63]:
# import tqdm
from tqdm.auto import tqdm
#created the dense vector using the pre-trained model
operations = []
t0 = time.time()
for doc in tqdm(documents):
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["Content"]).tolist()
    operations.append(doc)
t1 = time.time()
total=t1-t0
print(f'{total/60} minutes')

In [65]:
operations[0]

{'Date': Timestamp('2019-02-24 00:00:00'),
 'User': 'Silmavi',
 'Content': 'really😁🤨🤨🤨',
 'text_vector': [-0.010448154993355274,
  0.04629620537161827,
  0.007511643692851067,
  0.022471485659480095,
  0.032183099538087845,
  0.032937824726104736,
  -0.05606511980295181,
  0.04334476590156555,
  0.008313382975757122,
  0.0069734156131744385,
  0.03444882482290268,
  0.020371366292238235,
  -0.024321354925632477,
  0.02770829387009144,
  -0.021856702864170074,
  -0.015297445468604565,
  -0.024510452523827553,
  -0.0041328854858875275,
  0.028379207476973534,
  0.003877934068441391,
  0.035581305623054504,
  0.0342014916241169,
  -0.008496076799929142,
  -0.06302068382501602,
  -0.02258117124438286,
  -0.02990863472223282,
  -0.01822284609079361,
  -0.04249770939350128,
  0.06586158275604248,
  -0.03832273930311203,
  0.0016958988271653652,
  -0.03205510973930359,
  -0.05483444035053253,
  0.008350278250873089,
  2.018640770984348e-06,
  -0.025898952037096024,
  -0.044389866292476654,
  

In [66]:
# Indexing with elastic search
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()




ObjectApiResponse({'name': '6b8eba599138', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'dXWGvNjVS3KrouBEUVKMNw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [67]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Date": {"type": "text"},
            "User": {"type": "keyword"},
            "Content": {"type": "text"} ,
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [68]:
index_name = "diet-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'diet-questions'})

In [None]:
for doc in tqdm(operations):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

 71%|██████████████████████████████████████████████████████████████████████████▎                              | 4812/6800 [05:59<02:33, 12.95it/s]