# INSTALLATIONS

In [1]:
!pip install bertopic
!pip install keybert
!pip install nltk
!pip install ctransformers

Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━

# IMPORTS

In [2]:
import pandas as pd
import json
from datetime import time
from datetime import datetime
from bertopic import BERTopic
from keybert import KeyBERT
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# DATA LOADING

In [3]:
# Load the dataset from the JSON file.
file_path = 'data.json'
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert the loaded data to a DataFrame.
df = pd.DataFrame(data)

# DATA UNDERSTANDING

In [4]:
# Preview the first rows to check the columns and value formats.
df.head()

Unnamed: 0,user_id,log_time,description
0,U010,2020-01-15 08:30:00,A plate of scrambled eggs and toast.
1,U008,2020-02-02 12:15:00,A bowl of vegetable stir-fry with quinoa.
2,U005,2020-03-10 18:45:00,A serving of grilled chicken with steamed broc...
3,U003,2020-04-05 13:00:00,A bowl of lentil soup with a side salad.
4,U009,2020-05-20 20:00:00,A plate of spaghetti with marinara sauce and m...


In [5]:
# Initial exploration: overall shape, column dtypes, null counts per column,
# and the number of rows that exactly duplicate an earlier row.
dataset_characteristics = {}
dataset_characteristics["Data Shape"] = df.shape
dataset_characteristics["Data Types"] = df.dtypes
dataset_characteristics["Missing Values"] = df.isnull().sum()
dataset_characteristics["Duplicate Rows"] = df.duplicated().sum()

print(dataset_characteristics)

{'Data Shape': (724, 3), 'Data Types': user_id        object
log_time       object
description    object
dtype: object, 'Missing Values': user_id        0
log_time       0
description    0
dtype: int64, 'Duplicate Rows': 0}


In [6]:
# Count, per column, how many values repeat an earlier value in that column.
duplicate_counts = {
    f"duplicate_{column}_count": df[column].duplicated().sum()
    for column in ('user_id', 'log_time', 'description')
}

print(duplicate_counts)

{'duplicate_user_id_count': 714, 'duplicate_log_time_count': 10, 'duplicate_description_count': 409}


In [7]:
# Count the distinct values in each column.
# (key fragment, column name) pairs; the user_id key is pluralised in the result.
_unique_key_columns = (
    ("user_ids", "user_id"),
    ("log_time", "log_time"),
    ("description", "description"),
)
unique_count = {
    f"unique_{key_fragment}_count": df[column].nunique()
    for key_fragment, column in _unique_key_columns
}

unique_count

{'unique_user_ids_count': 10,
 'unique_log_time_count': 714,
 'unique_description_count': 315}

In [8]:
# Distinct user ids, in order of first appearance.
unique_user_ids = df['user_id'].drop_duplicates().to_numpy()
print(unique_user_ids)

['U010' 'U008' 'U005' 'U003' 'U009' 'U002' 'U006' 'U004' 'U007' 'U001']


In [9]:
# Smallest and largest 'log_time' values in the dataset.
log_times = df['log_time']
log_time_min, log_time_max = log_times.min(), log_times.max()

print(log_time_min, log_time_max, sep='\n')

2020-01-01 18:20:00
2020-12-31 10:45:00


# DATA CLEANING AND PREPARATION

In [10]:
# Parse the log_time column into pandas datetimes, then print the frame summary
# to confirm the new dtype.
df['log_time'] = df['log_time'].pipe(pd.to_datetime)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   user_id      724 non-null    object        
 1   log_time     724 non-null    datetime64[ns]
 2   description  724 non-null    object        
dtypes: datetime64[ns](1), object(2)
memory usage: 17.1+ KB


Cleaning description from stopwords

In [11]:
# Fetch the NLTK stopword corpus and keep the English list as a set
# for fast membership checks during cleaning.
nltk.download('stopwords')

english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

print(stop_words)

{'now', 'themselves', 'don', 'doesn', 'these', 'other', 'few', 'any', 'couldn', 'if', "shan't", 'each', 'did', 'too', 'i', 'haven', 'by', 'on', 'when', "didn't", 'shouldn', 'but', 'have', 'shan', 'where', 'down', 'not', 'they', 're', 'after', 'about', 'theirs', 'am', 'its', 'being', 'those', 'very', 'of', 'hadn', 'yourselves', 'into', 'weren', 'won', 's', 'been', 'ourselves', 'ain', 'all', 'can', 'both', 'me', "wasn't", 'his', 'our', 'which', 'she', 'own', "couldn't", 'some', 't', 'mightn', 'yourself', 'an', 'he', 'and', 'further', 'just', 'only', 'isn', 'most', 'myself', 'up', 'this', 'do', 'from', 'ours', 'against', "shouldn't", 'itself', "aren't", "isn't", 'him', "hadn't", 'my', 'then', 'whom', 'how', "don't", 'such', "needn't", 'hers', "she's", 'between', "weren't", 'hasn', 'we', "wouldn't", 'should', 'does', 'again', 'her', 'm', 'why', 'as', 'nor', 'in', 'you', 'o', 'out', 'than', 'it', 'to', 'yours', 'herself', 'are', 'your', 'who', 'for', 'with', 'needn', 'because', "you're", 't

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


More cleaning steps: converting to lowercase and removing punctuation and special characters.

In [12]:
# Build the cleaned description: lowercase, strip punctuation/special
# characters, then drop English stopwords.
df['description_cleaned'] = (
    df['description']
    .str.lower()
    # regex=True is required: from pandas 2.0 the default is a literal match,
    # which would leave all punctuation in place. The raw string avoids
    # invalid-escape warnings for \w and \s.
    .str.replace(r'[^\w\s]', '', regex=True)
    # Re-join the remaining words into a sentence without stopwords.
    .apply(lambda text: ' '.join(word for word in text.split() if word not in stop_words))
)

# FEATURE EXTRACTION

In [13]:
# Extracting the month
df['month'] = df['log_time'].dt.month

# Extracting the hour
df['hour'] = df['log_time'].dt.hour

# Extracting day of the week
df['day_of_week'] = df['log_time'].dt.day_name()

df.head()

Unnamed: 0,user_id,log_time,description,description_cleaned,month,hour,day_of_week
0,U010,2020-01-15 08:30:00,A plate of scrambled eggs and toast.,plate scrambled eggs toast,1,8,Wednesday
1,U008,2020-02-02 12:15:00,A bowl of vegetable stir-fry with quinoa.,bowl vegetable stirfry quinoa,2,12,Sunday
2,U005,2020-03-10 18:45:00,A serving of grilled chicken with steamed broc...,serving grilled chicken steamed broccoli,3,18,Tuesday
3,U003,2020-04-05 13:00:00,A bowl of lentil soup with a side salad.,bowl lentil soup side salad,4,13,Sunday
4,U009,2020-05-20 20:00:00,A plate of spaghetti with marinara sauce and m...,plate spaghetti marinara sauce meatballs,5,20,Wednesday


Categorizing the meals into Breakfast, Lunch, Dinner and Late Night Meal


*   Breakfast -> 5 to 10.59.59
*   Lunch -> 11 to 16.59.59
*   Dinner -> 17 to 23.59.59
*   Late Night Meal -> 00 to 04.59.59






In [14]:
def categorize_meal(log_time):
    """Classify a timestamp into a meal category by its hour of day.

    Parameters
    ----------
    log_time : datetime-like
        Any object exposing an integer ``hour`` attribute, such as
        ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    str
        "Breakfast" for 05:00-10:59, "Lunch" for 11:00-16:59,
        "Dinner" for 17:00-23:59 and "Late Night Meal" for 00:00-04:59.
    """
    # Compare on the hour only: every boundary is a whole hour, and this keeps
    # times such as 23:59:59.5 in "Dinner" instead of falling through to the
    # late-night bucket.
    hour = log_time.hour

    if 5 <= hour < 11:
        return "Breakfast"
    if 11 <= hour < 17:
        return "Lunch"
    if hour >= 17:
        return "Dinner"
    return "Late Night Meal"

# Label every log entry with its meal category.
df['meal_type'] = df['log_time'].map(categorize_meal)

df.head()

Unnamed: 0,user_id,log_time,description,description_cleaned,month,hour,day_of_week,meal_type
0,U010,2020-01-15 08:30:00,A plate of scrambled eggs and toast.,plate scrambled eggs toast,1,8,Wednesday,Breakfast
1,U008,2020-02-02 12:15:00,A bowl of vegetable stir-fry with quinoa.,bowl vegetable stirfry quinoa,2,12,Sunday,Lunch
2,U005,2020-03-10 18:45:00,A serving of grilled chicken with steamed broc...,serving grilled chicken steamed broccoli,3,18,Tuesday,Dinner
3,U003,2020-04-05 13:00:00,A bowl of lentil soup with a side salad.,bowl lentil soup side salad,4,13,Sunday,Lunch
4,U009,2020-05-20 20:00:00,A plate of spaghetti with marinara sauce and m...,plate spaghetti marinara sauce meatballs,5,20,Wednesday,Dinner


# Keyword Extraction with KeyBERT

KeyBERT is a Python Package. It is designed to take advantage of pre-trained language models (like BERT) for the task of keyword extraction.

In [15]:
# Build the KeyBERT extractor on the 'bert-base-nli-mean-tokens' model.
# 'paraphrase-MiniLM-L6-v2' can also be used, as it is fine-tuned for paraphrased texts.
keyBert_model = KeyBERT('bert-base-nli-mean-tokens')

.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Call the KeyBERT model to extract keywords from the cleaned description and save them to the `keywords` column of the dataframe.

In [16]:
def extract_keywords(text):
    """Return the KeyBERT keywords for ``text`` as one comma-separated string."""
    scored_keywords = keyBert_model.extract_keywords(text)

    # Keep only the first element of every returned entry (the keyword itself).
    keyword_texts = []
    for entry in scored_keywords:
        keyword_texts.append(entry[0])

    return ', '.join(keyword_texts)

# Extract keywords from every cleaned description and store them per row.
df['keywords'] = df['description_cleaned'].map(extract_keywords)

df.head()

Unnamed: 0,user_id,log_time,description,description_cleaned,month,hour,day_of_week,meal_type,keywords
0,U010,2020-01-15 08:30:00,A plate of scrambled eggs and toast.,plate scrambled eggs toast,1,8,Wednesday,Breakfast,"eggs, toast, plate, scrambled"
1,U008,2020-02-02 12:15:00,A bowl of vegetable stir-fry with quinoa.,bowl vegetable stirfry quinoa,2,12,Sunday,Lunch,"vegetable, bowl, stirfry, quinoa"
2,U005,2020-03-10 18:45:00,A serving of grilled chicken with steamed broc...,serving grilled chicken steamed broccoli,3,18,Tuesday,Dinner,"chicken, broccoli, grilled, steamed, serving"
3,U003,2020-04-05 13:00:00,A bowl of lentil soup with a side salad.,bowl lentil soup side salad,4,13,Sunday,Lunch,"salad, soup, lentil, bowl"
4,U009,2020-05-20 20:00:00,A plate of spaghetti with marinara sauce and m...,plate spaghetti marinara sauce meatballs,5,20,Wednesday,Dinner,"meatballs, spaghetti, sauce, plate, marinara"


# Topic Modeling with BERTopic

BERTopic is a topic modeling technique that also takes advantage of BERT. (Ref. LLM)

How BERTopic Works:


1.   Vectorization: BERTopic starts by converting text data into vector representations using a language model (like BERT). Each document is transformed into a high-dimensional vector that captures its semantic meaning.
2.   Dimensionality Reduction: These high-dimensional vectors are then reduced to a lower-dimensional space using techniques like UMAP (Uniform Manifold Approximation and Projection). This step helps in clustering similar documents more effectively.
3.   Clustering: The reduced vectors are clustered using a clustering algorithm such as HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise). This step groups documents into clusters based on their similarity.
4.   Topic Creation: For each cluster, BERTopic identifies the most representative words as the topic's keywords. These keywords help interpret what each topic is about.





In [17]:
# Fit BERTopic (configured with nr_topics=10) on the original descriptions.
# NOTE(review): no random seed is set here, while a later cell maps hard-coded
# topic ids to labels — confirm the topic ids are stable across runs.
topic_model = BERTopic(nr_topics=10)
topics, probabilities = topic_model.fit_transform(df['description'])
df['topic'] = topics # Store the integer topic id assigned to each row in the new 'topic' column

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [18]:
# Retrieve and display the summary table of the topics identified by the BERTopic model.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,60,-1_with_of_and_serving,"[with, of, and, serving, shrimp, assorted, bow...","[A serving of shrimp scampi with linguine., A ..."
1,0,164,0_with_of_salad_and,"[with, of, salad, and, sandwich, fruits, bowl,...",[A bowl of mixed fruits set on a breakfast tra...
2,1,119,1_set_plate_fork_and,"[set, plate, fork, and, on, room, cutlery, gla...",[A plate and fork set on a living room coffee ...
3,2,102,2_tea_cup_herbal_enjoyed,"[tea, cup, herbal, enjoyed, during, of, chamom...",[A cup of herbal tea enjoyed during a relaxing...
4,3,88,3_kitchen_spoon_laid_bowl,"[kitchen, spoon, laid, bowl, out, and, cozy, o...",[A bowl and spoon laid out on a cozy kitchen t...
5,4,56,4_at_plate_desserts_family,"[at, plate, desserts, family, gathering, sandw...",[A plate of desserts served at a family gather...
6,5,45,5_coffee_cafe_black_enjoyed,"[coffee, cafe, black, enjoyed, city, cup, balc...",[A cup of black coffee enjoyed on a city balco...
7,6,44,6_soup_rainy_day_bowl,"[soup, rainy, day, bowl, dining, of, side, pla...",[A bowl of soup placed on a rainy day dining t...
8,7,25,7_hot_cocoa_evening_winter,"[hot, cocoa, evening, winter, cup, enjoyed, of...",[A cup of hot cocoa enjoyed on a winter evenin...
9,8,21,8_nuts_mixed_coffee_table,"[nuts, mixed, coffee, table, set, bowl, on, of...","[A bowl of mixed nuts set on a coffee table., ..."


Understanding output


*   Topic: The unique identifier (integer) for each topic found by the model. This identifier is used to reference specific topics.
*   Count: The number of documents in your dataset that have been assigned to each topic.
*   Representation: This refers to a set of words or phrases that best represent or characterize a topic.
*   Representative_Docs: This refers to a selection of documents that are most representative of a particular topic.





In [19]:
# Mapping of topic id -> list of (word, score) pairs.
# Note: this rebinds `topics`, which previously held the per-row topic ids
# returned by fit_transform.
topics = topic_model.get_topics()
topics

{-1: [('with', 0.1590277589878165),
  ('of', 0.0922804008594126),
  ('and', 0.07332454576297073),
  ('serving', 0.0716710310597112),
  ('shrimp', 0.06272192505150143),
  ('assorted', 0.05842671319381456),
  ('bowl', 0.053694322737540316),
  ('vegetable', 0.04687881521410215),
  ('plate', 0.044639167325921575),
  ('eggs', 0.043856876615940364)],
 0: [('with', 0.12127481214258415),
  ('of', 0.07889948220289246),
  ('salad', 0.06614705442451946),
  ('and', 0.06549939661040073),
  ('sandwich', 0.0535153913513918),
  ('fruits', 0.0509670393822779),
  ('bowl', 0.0495869440122757),
  ('mixed', 0.04904007449195622),
  ('tray', 0.0478474128333175),
  ('breakfast', 0.04649659112667066)],
 1: [('set', 0.17093925238279445),
  ('plate', 0.16986833822027347),
  ('fork', 0.16364998988060553),
  ('and', 0.13299268657271623),
  ('on', 0.1262910802437426),
  ('room', 0.10960506626945252),
  ('cutlery', 0.09016981617532493),
  ('glass', 0.08612724226630282),
  ('restaurant', 0.0824316225990997),
  ('tabl

# Topic Label Generation Using a Large Language Model (Llama-2)

In [20]:
# Keep the topic summary table; it is the input for the LLM labelling loop below.
get_topic = topic_model.get_topic_info()

Initializing Llama-2 model

In this case we use ctransformers to run the Large Language Model on CPU instead of GPU (because of limited access to GPU resources).


*   ctransformers -> Helps load an LLM from Hugging Face on CPU.
*   [TheBloke/Llama-2-7B-Chat-GGUF](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) -> Pretrained model from Hugging Face



In [21]:
from ctransformers import AutoModelForCausalLM, AutoTokenizer

In [22]:
# Load the Llama-2 7B chat model (GGUF build) from Hugging Face through ctransformers.
model = AutoModelForCausalLM.from_pretrained('TheBloke/Llama-2-7B-Chat-GGUF')

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

llama-2-7b-chat.Q2_K.gguf:   0%|          | 0.00/2.83G [00:00<?, ?B/s]

We are taking advantage of LLM to label each topic from already extracted and clustered topics.


*   We pass the representative documents (a set of descriptions) and the keywords of each topic, which help the LLM with labelling.
*   Efficient PROMPT ENGINEERING is advised to get the output in the required format.





In [23]:
# Topic summary table; a 'label' column is added to it at the end of this cell.
df_topics = get_topic

# Llama-2 chat prompt template. The <representative_docs> and <representation>
# placeholders are substituted for every topic.
prompt = """
    <s>[INST] <<SYS>>
    You are a helpful, respectful and honest assistant for labeling topics.
    <</SYS>>
    I have a topic that contains the following documents:
    <representative_docs>

    The topic is described by the following keywords: <representation>.

    Based on the information about the topic above, please create a short label describing the type of food of this topic. Make sure to only return the label and nothing more.
    [/INST]
    """

# One generated label per topic, in the row order of df_topics.
labels = []

for _, row in df_topics.iterrows():
    print(row['Representative_Docs'])
    print(row['Representation'])

    # Flatten the list of documents and the list of keywords into plain strings.
    representative_docs_str = ', '.join(row['Representative_Docs'])
    representation_str = ', '.join(row['Representation'])

    # Named `model_input` rather than `input` so the built-in input() is not
    # shadowed for the rest of the kernel session.
    model_input = (
        prompt
        .replace('<representative_docs>', representative_docs_str)
        .replace('<representation>', representation_str)
    )

    print(model_input)
    print(representative_docs_str)
    print(representation_str)

    # The model call returns the generated text, which is used as the label.
    label = model(model_input)
    print(label)
    labels.append(label)

# Add the labels to the DataFrame
df_topics['label'] = labels


['A serving of shrimp scampi with linguine.', 'A bowl of shrimp and broccoli stir-fry with rice.', 'A serving of shrimp and vegetable stir-fry with brown rice.']
['with', 'of', 'and', 'serving', 'shrimp', 'assorted', 'bowl', 'vegetable', 'plate', 'eggs']

    <s>[INST] <<SYS>>
    You are a helpful, respectful and honest assistant for labeling topics.
    <</SYS>>
    I have a topic that contains the following documents:
    A serving of shrimp scampi with linguine., A bowl of shrimp and broccoli stir-fry with rice., A serving of shrimp and vegetable stir-fry with brown rice.

    The topic is described by the following keywords: with, of, and, serving, shrimp, assorted, bowl, vegetable, plate, eggs.

    Based on the information about the topic above, please create a short label describing the type of food of this topic. Make sure to only return the label and nothing more.
    [/INST]
    
A serving of shrimp scampi with linguine., A bowl of shrimp and broccoli stir-fry with rice., A 

A label is assigned to each topic.

In [24]:
# Show the topic table together with the LLM-generated 'label' column.
df_topics

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,label
0,-1,60,-1_with_of_and_serving,"[with, of, and, serving, shrimp, assorted, bow...","[A serving of shrimp scampi with linguine., A ...",Food Label: Shrimp Stir-Fry
1,0,164,0_with_of_salad_and,"[with, of, salad, and, sandwich, fruits, bowl,...",[A bowl of mixed fruits set on a breakfast tra...,Fruit Salad
2,1,119,1_set_plate_fork_and,"[set, plate, fork, and, on, room, cutlery, gla...",[A plate and fork set on a living room coffee ...,Table setting with dinnerware and cutlery.
3,2,102,2_tea_cup_herbal_enjoyed,"[tea, cup, herbal, enjoyed, during, of, chamom...",[A cup of herbal tea enjoyed during a relaxing...,Tea
4,3,88,3_kitchen_spoon_laid_bowl,"[kitchen, spoon, laid, bowl, out, and, cozy, o...",[A bowl and spoon laid out on a cozy kitchen t...,Kitchen Table Foods
5,4,56,4_at_plate_desserts_family,"[at, plate, desserts, family, gathering, sandw...",[A plate of desserts served at a family gather...,Food Label: Desserts
6,5,45,5_coffee_cafe_black_enjoyed,"[coffee, cafe, black, enjoyed, city, cup, balc...",[A cup of black coffee enjoyed on a city balco...,Coffee
7,6,44,6_soup_rainy_day_bowl,"[soup, rainy, day, bowl, dining, of, side, pla...",[A bowl of soup placed on a rainy day dining t...,Food: Soup
8,7,25,7_hot_cocoa_evening_winter,"[hot, cocoa, evening, winter, cup, enjoyed, of...",[A cup of hot cocoa enjoyed on a winter evenin...,"Warming Hot Cocoa, Winter Evening Cups"
9,8,21,8_nuts_mixed_coffee_table,"[nuts, mixed, coffee, table, set, bowl, on, of...","[A bowl of mixed nuts set on a coffee table., ...",Snack food: Mixed nuts in a bowl


# Assigning Predefined Topic Labels

With the help of the generated labels, and after manually going through each topic, the labels below are assigned to each topic and to each corresponding row in the dataframe.

In [25]:
import pandas as pd

# Manually chosen labels, listed as label -> topic ids carrying that label.
_label_to_topic_ids = {
    "Asian Cuisine": (-1,),
    "Light Meals": (0,),
    "Beverages": (2, 5, 7),
    "Desserts": (4,),
    "Soup": (6,),
    "Snacks": (8,),
    "Non Food": (1, 3),
}

# Invert into the topic id -> label lookup used for mapping.
topic_labels = {
    topic_id: label
    for label, topic_ids in _label_to_topic_ids.items()
    for topic_id in topic_ids
}

# Attach the label of each row's topic as a new 'Label' column.
df['Label'] = df['topic'].map(topic_labels)

# Persist the labelled frame to a new Excel file.
df.to_excel('labels_topic_keywords_meal_data.xlsx', index=False)


# Dropping duplicate description and Saving Unique Descriptions

In [26]:
# Target file for the de-duplicated rows.
output_file_path = 'unique_description_meal_data.xlsx'

# Keep one row per distinct value of the 'description' column and write it out.
unique_df = df.drop_duplicates(subset='description')
unique_df.to_excel(output_file_path, index=False)


In [27]:
# Read the de-duplicated rows back from the Excel file written in the previous cell.
unique_description_meal_data = pd.read_excel('unique_description_meal_data.xlsx')

# Print column names, non-null counts and dtypes of the reloaded frame.
unique_description_meal_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 315 entries, 0 to 314
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   user_id              315 non-null    object        
 1   log_time             315 non-null    datetime64[ns]
 2   description          315 non-null    object        
 3   description_cleaned  315 non-null    object        
 4   month                315 non-null    int64         
 5   hour                 315 non-null    int64         
 6   day_of_week          315 non-null    object        
 7   meal_type            315 non-null    object        
 8   keywords             315 non-null    object        
 9   topic                315 non-null    int64         
 10  Label                315 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(7)
memory usage: 27.2+ KB


# Food Item Extraction from Descriptions Using LLM

We will be dropping duplicates from our dataset and feeding only unique descriptions to the Large Language Model (LLM) for the purpose of extracting food items. This approach will help in reducing redundancy in the data and minimize the computational resources required. By ensuring that only distinct descriptions are processed, we can enhance the efficiency of our model, both in terms of performance and the utilization of computational power.

We are choosing to pass the original 'description' instead of the 'cleaned description' to the Large Language Model (LLM) for extracting food items. This decision is based on the understanding that,
  1. The full context provided by the original description is crucial for the model to accurately discern the nature of the food items.
  2. Additionally, we are not passing just the keywords, as they may lead to incomplete or contextually inaccurate extractions. For instance, in the case of 'brown rice,' feeding only keywords might result in the model overlooking 'brown' as an integral part of the food item.
  3. By providing the full description, the model is more likely to correctly identify 'brown rice' as a distinct food item, capturing the complete context and specificity of the description.

In [29]:
# Llama-2 chat prompt template for food-item extraction. The <description>
# placeholder is replaced with one meal description per model call, and the
# model is asked to answer in the form "Food items: item1, item2, ....".
prompt2 = """
[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
You are a knowledgeable and precise assistant tasked with extracting food items from a sentence.

Given a sentence from which you have to extract food items or fruits or anything that can be eaten : <description>
Please discern and list only the actual food items.
Exclude any descriptive words, adjectives, non-food terms or cutlery.
Also ignore words like 'grilled', 'stir-fry', 'baked' and words describing preparation.
Your response should be strictly in a single line, comma-separated format without bullet points or any additional explanations.
The final result should be like "Food items: item1, item2, ...."
[/INST]

"""

def _clean_food_item(raw_item):
    """Reduce one raw list entry to a bare item name ('' when nothing is left)."""
    text = raw_item.strip()
    if not text:
        return ''
    # Only the first line belongs to the item; later lines are extra model commentary.
    text = text.splitlines()[0]
    # Drop wrapping quotes and a sentence-ending period.
    return text.strip().strip('"\'').rstrip('.').strip()


def extract_food_items(response):
    """Parse the model's answer into a list of food item names.

    Two answer layouts are supported:

    * a bullet list, where every item follows a ``*``;
    * a comma-separated list following the ``Food items:`` marker, either on
      the marker's line or on the first non-empty line after it.

    Parameters
    ----------
    response : str
        Raw text returned by the model.

    Returns
    -------
    list of str
        Cleaned, non-empty item names in their original order; an empty list
        when no items can be found.
    """
    if not response:
        return []

    if '*' in response:
        # Everything before the first bullet is preamble (e.g. "Food items:").
        raw_items = response.split('*')[1:]
    else:
        marker = "Food items:"
        marker_idx = response.find(marker)
        if marker_idx == -1:
            return []
        remainder = response[marker_idx + len(marker):]
        # Use the first non-empty line only, so trailing explanations do not
        # end up inside the last item.
        list_line = next((line for line in remainder.splitlines() if line.strip()), '')
        # Split on bare commas so "a,b" and "a, b" are handled alike.
        raw_items = list_line.split(',')

    cleaned_items = (_clean_food_item(raw_item) for raw_item in raw_items)
    return [item for item in cleaned_items if item]

# Initialize an empty column for food items in the DataFrame
unique_description_meal_data['food_items'] = ''

# Ask the LLM for the food items of every unique description.
for index, row in unique_description_meal_data.iterrows():
    # Fill the description into the prompt template
    model_input = prompt2.replace('<description>', row['description'])
    print(f"{index}: {row['description']}")

    # Get the response from the model
    response = model(model_input)

    # Use a `.text` attribute when the response object has one; otherwise
    # fall back to its string form.
    response_text = response.text if hasattr(response, 'text') else str(response)

    # Debugging: Print the response text
    print(response_text)

    food_items = extract_food_items(response_text)
    unique_description_meal_data.at[index, 'food_items'] = ', '.join(food_items)

# After processing all rows, save the updated DataFrame to a new Excel file.
# The name must match the file read back by the next cell, so it must not
# contain a space before ".xlsx".
updated_excel_file = 'updated_keywords_meal_data_with_food_item.xlsx'
unique_description_meal_data.to_excel(updated_excel_file, index=False)


Assign the food items to the main dataframe based on the description, since the food items exist only in the dataframe that holds the unique descriptions.
We also incorporate try/except blocks to catch any exception that occurs.

In [30]:
# Load the per-unique-description food items and the full labelled dataset.
try:
    updated_keywords_df = pd.read_excel('updated_keywords_meal_data_with_food_item.xlsx')
    meal_data_df = pd.read_excel('labels_topic_keywords_meal_data.xlsx')
except Exception as e:
    print(f"Error loading data: {e}")
    # Re-raise: nothing below can run without both frames, and continuing
    # would only fail later with a less helpful NameError.
    raise

# Convert 'description' column to lowercase for case-insensitive matching.
# NOTE(review): this overwrites the column, so the lowercased text is also
# what gets saved below — confirm that is intended.
updated_keywords_df['description'] = updated_keywords_df['description'].str.lower()
meal_data_df['description'] = meal_data_df['description'].str.lower()

# Empty cells are read back from Excel as NaN; normalise them to '' so the
# lookup yields strings only.
food_items_by_row = updated_keywords_df['food_items'].fillna('')

# Creating a dictionary from the first dataset for lookup
description_to_food_items = dict(zip(updated_keywords_df['description'], food_items_by_row))

# Function to get food items based on description
def get_food_items(description):
    """Return the food items recorded for ``description`` ('' when unknown)."""
    return description_to_food_items.get(description, '')

# Apply the function to the second dataset
try:
    meal_data_df['food_items'] = meal_data_df['description'].apply(get_food_items)
except Exception as e:
    print(f"Error during processing: {e}")
    # Re-raise so a frame without the food_items column is never saved.
    raise

# Save the updated DataFrame
output_file_path = 'labels_topic_keywords_meal_data_with_food_items.xlsx'
try:
    meal_data_df.to_excel(output_file_path, index=False)
except Exception as e:
    print(f"Error saving data: {e}")
    # Re-raise so later cells do not read a missing or stale file.
    raise

output_file_path

'labels_topic_keywords_meal_data_with_food_items.xlsx'

In [31]:
# Display the full dataset with the merged 'food_items' column.
meal_data_df

Unnamed: 0,user_id,log_time,description,description_cleaned,month,hour,day_of_week,meal_type,keywords,topic,Label,food_items
0,U010,2020-01-15 08:30:00,a plate of scrambled eggs and toast.,plate scrambled eggs toast,1,8,Wednesday,Breakfast,"eggs, toast, plate, scrambled",-1,Asian Cuisine,"eggs, toast"
1,U008,2020-02-02 12:15:00,a bowl of vegetable stir-fry with quinoa.,bowl vegetable stirfry quinoa,2,12,Sunday,Lunch,"vegetable, bowl, stirfry, quinoa",-1,Asian Cuisine,"quinoa, vegetables"
2,U005,2020-03-10 18:45:00,a serving of grilled chicken with steamed broc...,serving grilled chicken steamed broccoli,3,18,Tuesday,Dinner,"chicken, broccoli, grilled, steamed, serving",-1,Asian Cuisine,"chicken, broccoli"
3,U003,2020-04-05 13:00:00,a bowl of lentil soup with a side salad.,bowl lentil soup side salad,4,13,Sunday,Lunch,"salad, soup, lentil, bowl",6,Soup,"lentils, soup, salad"
4,U009,2020-05-20 20:00:00,a plate of spaghetti with marinara sauce and m...,plate spaghetti marinara sauce meatballs,5,20,Wednesday,Dinner,"meatballs, spaghetti, sauce, plate, marinara",0,Light Meals,"spaghetti, marinara sauce, meatballs"
...,...,...,...,...,...,...,...,...,...,...,...,...
719,U002,2020-03-05 17:05:00,a bowl of pho with rare beef slices and fresh ...,bowl pho rare beef slices fresh herbs,3,17,Thursday,Dinner,"beef, herbs, fresh, slices, bowl",-1,Asian Cuisine,"rare beef slices, fresh herbs"
720,U003,2020-03-06 13:00:00,a slice of lemon meringue pie with a buttery c...,slice lemon meringue pie buttery crust,3,13,Friday,Lunch,"buttery, pie, meringue, lemon, crust",0,Light Meals,"lemon, meringue, crust"
721,U009,2020-03-07 08:55:00,"a bagel with avocado, tomato, and a sprinkle o...",bagel avocado tomato sprinkle everything bagel...,3,8,Saturday,Breakfast,"tomato, bagel, avocado, sprinkle, seasoning",0,Light Meals,"Bagel, avocado, tomato"
722,U005,2020-03-08 18:50:00,a plate of shrimp scampi with linguine pasta.,plate shrimp scampi linguine pasta,3,18,Sunday,Dinner,"pasta, shrimp, plate, scampi, linguine",-1,Asian Cuisine,"Shrimp, pasta, linguine"


After visually inspecting the labels alongside the food items, we noticed that the LLM assigns food items even where the label is Non Food.
1. So we decided to feed the rows where the label is Non Food to another Hugging Face pretrained model, [carolanderson/roberta-base-food-ner](https://huggingface.co/carolanderson/roberta-base-food-ner?text=a+serving+of+baked+ziti+with+marinara+and+melted+mozzarella.), which is trained to extract food items from sentences.
2. However, this particular model does not perform well at extracting tea from sentences.
3. That is why we use this model to clean only the Non Food rows.
4. Apart from tea, it performs similarly to or better than the LLM, and it is also more efficient.

In [32]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Hugging Face checkpoint used to re-extract food mentions for 'Non Food' rows.
model_name = 'carolanderson/roberta-base-food-ner'

# add_prefix_space=True is passed through to the tokenizer loader.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Token-classification ("ner") pipeline built from the model and tokenizer above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Load the labelled dataset written by the earlier steps of the notebook.
df = pd.read_excel('labels_topic_keywords_meal_data_with_food_items.xlsx')

non_food_label = 'Non Food'

# Only rows labelled 'Non Food' are re-processed; every other row keeps its
# existing 'food_items' value.
non_food_mask = df['Label'] == non_food_label

for index, description in df.loc[non_food_mask, 'description'].items():
    # A missing (NaN) or blank description cannot be fed to the pipeline;
    # record "no food items" instead of crashing part-way through the loop.
    if not isinstance(description, str) or not description.strip():
        df.at[index, 'food_items'] = ''
        continue

    # With an aggregation strategy the pipeline returns grouped entities that
    # expose 'entity_group' and 'word'; keep only the ones tagged FOOD.
    food_items_for_row = [
        entity['word']
        for entity in nlp(description, aggregation_strategy="first")
        if entity['entity_group'] == 'FOOD'
    ]

    # Overwrite the previous food_items value with the NER result for this row.
    df.at[index, 'food_items'] = ', '.join(food_items_for_row)

# Save the results back to the same Excel file (the input is overwritten).
df.to_excel('labels_topic_keywords_meal_data_with_food_items.xlsx', index=False)


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/896 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [33]:
# Display the frame after re-extracting food_items for the 'Non Food' rows.
df

Unnamed: 0,user_id,log_time,description,description_cleaned,month,hour,day_of_week,meal_type,keywords,topic,Label,food_items
0,U010,2020-01-15 08:30:00,a plate of scrambled eggs and toast.,plate scrambled eggs toast,1,8,Wednesday,Breakfast,"eggs, toast, plate, scrambled",-1,Asian Cuisine,"eggs, toast"
1,U008,2020-02-02 12:15:00,a bowl of vegetable stir-fry with quinoa.,bowl vegetable stirfry quinoa,2,12,Sunday,Lunch,"vegetable, bowl, stirfry, quinoa",-1,Asian Cuisine,"quinoa, vegetables"
2,U005,2020-03-10 18:45:00,a serving of grilled chicken with steamed broc...,serving grilled chicken steamed broccoli,3,18,Tuesday,Dinner,"chicken, broccoli, grilled, steamed, serving",-1,Asian Cuisine,"chicken, broccoli"
3,U003,2020-04-05 13:00:00,a bowl of lentil soup with a side salad.,bowl lentil soup side salad,4,13,Sunday,Lunch,"salad, soup, lentil, bowl",6,Soup,"lentils, soup, salad"
4,U009,2020-05-20 20:00:00,a plate of spaghetti with marinara sauce and m...,plate spaghetti marinara sauce meatballs,5,20,Wednesday,Dinner,"meatballs, spaghetti, sauce, plate, marinara",0,Light Meals,"spaghetti, marinara sauce, meatballs"
...,...,...,...,...,...,...,...,...,...,...,...,...
719,U002,2020-03-05 17:05:00,a bowl of pho with rare beef slices and fresh ...,bowl pho rare beef slices fresh herbs,3,17,Thursday,Dinner,"beef, herbs, fresh, slices, bowl",-1,Asian Cuisine,"rare beef slices, fresh herbs"
720,U003,2020-03-06 13:00:00,a slice of lemon meringue pie with a buttery c...,slice lemon meringue pie buttery crust,3,13,Friday,Lunch,"buttery, pie, meringue, lemon, crust",0,Light Meals,"lemon, meringue, crust"
721,U009,2020-03-07 08:55:00,"a bagel with avocado, tomato, and a sprinkle o...",bagel avocado tomato sprinkle everything bagel...,3,8,Saturday,Breakfast,"tomato, bagel, avocado, sprinkle, seasoning",0,Light Meals,"Bagel, avocado, tomato"
722,U005,2020-03-08 18:50:00,a plate of shrimp scampi with linguine pasta.,plate shrimp scampi linguine pasta,3,18,Sunday,Dinner,"pasta, shrimp, plate, scampi, linguine",-1,Asian Cuisine,"Shrimp, pasta, linguine"


# Loading data into DWH

The DWH can be connected to visualization tools such as Tableau to analyze users' meal-logging behavior.

In [35]:
!pip install mysql-connector-python

Collecting mysql-connector-python
  Downloading mysql_connector_python-8.2.0-cp310-cp310-manylinux_2_17_x86_64.whl (31.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.6/31.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf<=4.21.12,>=4.21.1 (from mysql-connector-python)
  Downloading protobuf-4.21.12-cp37-abi3-manylinux2014_x86_64.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.8/409.8 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf, mysql-connector-python
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you ha

In [36]:
import mysql.connector

In [37]:
# Database connection function
def get_db_connection():
    """Open and return a new MySQL connection for the warehouse database.

    Connection settings are read from the environment so that credentials do
    not have to live in the notebook:

    - ``MYSQL_HOST``     (default ``"localhost"``)
    - ``MYSQL_USER``     (default ``"root"``)
    - ``MYSQL_PASSWORD`` (default ``"root"``)
    - ``MYSQL_DATABASE`` (default ``"oviva"``)

    The defaults are the values that were previously hardcoded, so behaviour
    is unchanged when none of the variables are set.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
        The caller is responsible for closing it.
    """
    import os  # local import keeps this cell self-contained

    return mysql.connector.connect(
        host=os.environ.get("MYSQL_HOST", "localhost"),
        user=os.environ.get("MYSQL_USER", "root"),
        password=os.environ.get("MYSQL_PASSWORD", "root"),
        database=os.environ.get("MYSQL_DATABASE", "oviva"),
    )

In [41]:
# Template for the parameterised INSERT statement sent to the warehouse.
# NOTE(review): 'Column Names ' is a placeholder, not a real column list. Before
# running against a real database, replace it with the target table's columns
# and provide one %s placeholder per column.
# TODO confirm the target table name: `oviva` is also the database name passed
# to mysql.connector.connect() in get_db_connection().
insert_query = """
        INSERT INTO oviva ('Column Names ')
        VALUES (%s)
    """

In [40]:
# Bind both handles up front so the cleanup in `finally` never hits an
# undefined name when connecting (or creating the cursor) fails.
db_connection = None
cursor = None
try:
  db_connection = get_db_connection()
  cursor = db_connection.cursor()
  print("Executing insert query...")
  # Query parameters must be a sequence: the trailing comma makes this a
  # one-element tuple rather than a parenthesised string.
  cursor.execute(insert_query, ('define column names',))
  db_connection.commit()
  print("Query executed successfully.")
except mysql.connector.Error as err:
  # Connector errors are reported instead of raised so the notebook keeps running.
  print(f"Error occurred: {err}")
finally:
  # Release the cursor before the connection that owns it.
  if cursor is not None:
    cursor.close()
  if db_connection is not None:
    db_connection.close()

Next Steps


1.   Hyperparameter tuning for topic modelling (e.g. for UMAP and HDBSCAN).
2.   Feed custom parameters into the LLM for better accuracy, or revise the prompt.



To handle the analysis of 100,000 users logging meals over a year, here's what we can do:

1. Improve Data Loading: Make sure our system can load lots of data quickly and efficiently. We might need to break the data into smaller chunks.

2. Speed Up Topic Modeling and Keyword Extraction: Make these processes faster and more efficient, for example by using classifier algorithms to assign each meal log to one of our predefined labels. We can also use big-data tools such as Apache Spark, which are designed to handle large datasets.

3. Better Data Preparation: Prepare our data in a way that handles all sorts of input from users, even if some of it is messy or different from what we expect. Do the data preparation and cleaning before moving forward with modelling or analysis.

4. Use Strong Database Systems: We can choose databases that can store a lot of information and let us access it quickly. This is important because our data will keep growing and we will need to access it often. Use cloud-based data lakes for quick extraction.