This command installs the BERTopic library, which is a topic modeling technique that leverages BERT embeddings and c-TF-IDF to create easily interpretable topics.


In [None]:
# Install the BERTopic library using pip
pip install bertopic

Description:
This command installs the Faker library using pip, Python's package installer.

Faker: A Python library that generates fake data for various purposes such as testing, development, and data population.
Use cases: Creating mock databases, testing data pipelines, generating realistic-looking test data.

In [2]:
pip install faker 

Collecting faker
  Downloading Faker-26.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-26.0.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faker
Successfully installed faker-26.0.0
Note: you may need to restart the kernel to use updated packages.


Generate Synthetic Data

In [3]:
import random
import pandas as pd
from faker import Faker

fake = Faker()

# Constants
num_records = 40000
num_events = 50
num_pages = 20
num_paths = 50

# Defining realistic events and paths for a pet e-commerce website
events = [
    'page_view', 'add_to_cart', 'remove_from_cart', 'purchase', 'search', 
    'login', 'logout', 'sign_up', 'wishlist_add', 'wishlist_remove', 
    'product_click', 'category_view', 'checkout_start', 'checkout_complete', 
    'payment_fail', 'review_submit', 'rating_submit', 'contact_us', 
    'newsletter_signup', 'account_update', 'password_reset', 'order_cancel', 
    'return_initiate', 'subscription_start', 'subscription_cancel', 
    'chat_support', 'product_compare', 'coupon_apply', 'coupon_remove', 
    'review_helpful', 'review_not_helpful', 'store_locator', 'gift_card_purchase', 
    'gift_card_redeem', 'track_order', 'faq_view', 'terms_view', 
    'privacy_policy_view', 'shipping_info_view', 'size_guide_view'
]

paths = [
    '/home', '/shop', '/shop/dogs', '/shop/cats', '/shop/birds', 
    '/shop/fish', '/shop/small-pets', '/shop/reptiles', '/shop/sale', 
    '/shop/new-arrivals', '/product/dog-food', '/product/cat-toy', 
    '/product/bird-cage', '/product/fish-tank', '/product/rabbit-hutch', 
    '/product/lizard-terrarium', '/cart', '/checkout', '/account', 
    '/account/orders', '/account/wishlist', '/account/subscription', 
    '/contact', '/faq', '/store-locator', '/gift-cards', '/terms', 
    '/privacy', '/shipping-info', '/size-guide'
]

# Generating a variety of query parameters
query_parameters = [
    'search_term=dog+food', 'search_term=cat+toy', 'search_term=bird+cage', 
    'filter=price_high_to_low', 'filter=price_low_to_high', 'filter=new_arrivals', 
    'category=dogs', 'category=cats', 'category=birds', 'category=fish', 
    'sort=popularity', 'sort=rating', 'sort=newest', 'product_id=12345', 
    'product_id=67890', 'product_id=54321', 'product_id=09876', 'coupon_code=SUMMER21', 
    'coupon_code=WINTER21', 'referral=1234abcd', 'referral=5678efgh', 'gift_card_code=GC12345', 
    'gift_card_code=GC67890', 'shipping_method=standard', 'shipping_method=express',
    'color=red', 'color=blue', 'size=small', 'size=medium', 'size=large', 
    'brand=brandA', 'brand=brandB', 'discount=true', 'discount=false', 
    'availability=in_stock', 'availability=out_of_stock', 'rating=5', 'rating=4', 
    'reviewed=true', 'reviewed=false', 'payment_method=credit_card', 'payment_method=paypal'
]

# Generating the data with the new query parameters
data = {
    'event': [random.choice(events) for _ in range(num_records)],
    'context_page_title': [fake.sentence(nb_words=4) for _ in range(num_records)],
    'timestamp': [fake.date_time_this_year() for _ in range(num_records)],
    'user_id': [fake.uuid4() for _ in range(num_records)],
    'website_path': [random.choice(paths) for _ in range(num_records)],
    'website_query_parameters': ['&'.join(random.sample(query_parameters, k=random.randint(1, 5))) for _ in range(num_records)],
    'session_id': [fake.uuid4() for _ in range(num_records)]
}




In [6]:
pd.DataFrame(data).to_csv('session-data-info.csv', index = None)

In [4]:
# Merge relevant columns into a single 'merged_information' column
# Replace NaN values with empty strings before concatenation
df['merged_information'] = (
    df['event'].fillna('') +
    df['context_page_title'].fillna('') +
    df['website_path'].fillna('') +
    df['website_query_parameters'].fillna('')
)

# Create a new DataFrame with only session_id and merged_information
session_information = df[['session_id', 'merged_information']]

# Group the data by session_id and concatenate all merged_information for each session
session_information = (
    session_information.groupby('session_id')['merged_information']
    .apply(",".join)
    .reset_index()
)

# The resulting session_information DataFrame now contains:
# - session_id: unique identifier for each session
# - merged_information: a comma-separated string of all events and information for that session

In [6]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer 

# Initialize the CountVectorizer
vectorizer_model = CountVectorizer(min_df=3, ngram_range=(1,3)) 

# Initialize the BERTopic model
model = BERTopic(
   vectorizer_model=vectorizer_model, 
   language='english', 
   embedding_model='all-MiniLM-L12-v2', 
   verbose=True, 
)

# Fit the model and transform the data
topics, probs = model.fit_transform(session_information["merged_information"])

# Visualize the topics
model.visualize_topics()

2024-07-21 09:57:56.860927: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 09:57:56.861165: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 09:57:57.039743: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [27]:
from sklearn.cluster import KMeans 
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer 
from bertopic import BERTopic

# Initialize the CountVectorizer
vectorizer_model = CountVectorizer(min_df=3, ngram_range=(1,3))

# Initialize KMeans clustering model
cluster_model = KMeans(n_clusters=13)

# Initialize SentenceTransformer model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for the merged_information
embeddings = sentence_model.encode(session_information["merged_information"])

# Initialize BERTopic model
model = BERTopic(
   vectorizer_model=vectorizer_model, 
   language='english', 
   verbose=True, 
   hdbscan_model=cluster_model
)

# Fit the model and transform the data
topics, probs = model.fit_transform(session_information["merged_information"], embeddings=embeddings)

# Visualize the topics
model.visualize_topics()

2024-07-21 10:46:59,204 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-21 10:47:30,704 - BERTopic - Dimensionality - Completed ✓
2024-07-21 10:47:30,707 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-21 10:47:31,228 - BERTopic - Cluster - Completed ✓
2024-07-21 10:47:31,242 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-21 10:47:32,433 - BERTopic - Representation - Completed ✓


In [28]:
model.visualize_heatmap()

In [12]:
from umap import UMAP

# Perform dimensionality reduction on the embeddings
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Visualize the documents using the reduced embeddings
model.visualize_documents(session_information["merged_information"], reduced_embeddings=reduced_embeddings)

In [16]:
model.visualize_documents(session_information["merged_information"], embeddings=embeddings)

In [29]:
topic_information = model.get_topic_info()

In [22]:
import os
import google.generativeai as genai
import json 
import time 

# Set the Google Gemini API key as an environment variable
os.environ['GOOGLE_GEMINI_API_KEY'] = 'YOUR_API_KEY' 

# Configure the genai library with the API key
genai.configure(api_key=os.environ['GOOGLE_GEMINI_API_KEY'])

In [59]:
topic_information

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,topic_label,description
0,0,7139,0_fact_prevent_spend_program,"[fact, prevent, spend, program, without, sort,...",[track_orderRepublican Mrs at risk small./faqs...,Shopping & Sorting,This topic focuses on online shopping experien...
1,1,6555,1_dog_cell_election_ago,"[dog, cell, election, ago, oil, industry, leas...",[gift_card_redeemStand oil agent able church./...,E-commerce Sorting,This topic focuses on sorting and filtering op...
2,2,4707,2_speech_pay_usually_imagine,"[speech, pay, usually, imagine, by, little, th...",[track_orderPlace speech whole book point./ter...,Shopping & Sorting,This topic focuses on online shopping and prod...
3,3,4305,3_student_long_glass_camera,"[student, long, glass, camera, statement, find...",[review_not_helpfulSince let true today skin./...,Website Navigation,This topic focuses on website features related...
4,4,3561,4_glass_whatever_sea_page,"[glass, whatever, sea, page, fish, summer, esp...",[contact_usLess future action scientist home./...,E-commerce Sorting,This topic focuses on the sorting and filterin...
5,5,2403,5_wish_american_word_court,"[wish, american, word, court, speak, bar, rath...",[wishlist_addDifference American wife througho...,Website Navigation,This topic focuses on website navigation featu...
6,6,2316,6_home_occur_develop_politics,"[home, occur, develop, politics, partner, many...",[return_initiateInto would read coach among./s...,E-commerce Sorting,This topic focuses on sorting and filtering op...
7,7,1949,7_see_sense_high_visit,"[see, sense, high, visit, ready, party, street...",[coupon_removePage hear back stuff find./accou...,Online Shopping Features,This topic focuses on various features commonl...
8,8,1869,8_yourself_stop_own_degree,"[yourself, stop, own, degree, process, amount,...",[order_cancelPlant three hold arm oil./shopcat...,Website Navigation,This topic focuses on website navigation featu...
9,9,1684,9_environment_part_others_leave,"[environment, part, others, leave, during, smi...",[product_clickSeem during part recent./product...,Website Navigation,This topic focuses on website features related...


In [47]:
MODEL_ID = "gemini-1.5-flash"  
genai_model = genai.GenerativeModel(model_name=MODEL_ID, generation_config={"temperature":0.3}) 

prompt = """
Given a set of representative documents: {documents}
And a list of keywords describing the topic: {keywords}
Please generate:
1. A concise topic label (2-3 words)
2. A brief description (1-2 sentences)
Return the results in JSON format as follows:
{{
   "topic_label": "Your concise label here",
   "description": "Your brief description here"
}}
"""

In [41]:
# Get the topic representation and representative documents
topic_representation = topic_information.iloc[0]['Representation']
topic_representative_docs = topic_information.iloc[0]['Representative_Docs'] 

# Generate content using the Gemini model
response = genai_model.generate_content(
   prompt.format(
       documents=topic_representative_docs, 
       keywords=topic_representation
   ), 
   generation_config={'response_mime_type':'application/json'}
)

# Parse the JSON response
parsed_response = json.loads(response.text)

{'topic_label': 'Policy & Sorting',
 'description': 'This topic covers viewing and understanding policies, potentially related to data or terms. It also involves sorting and filtering products or information based on various criteria like price, popularity, and shipping options.'}

In [43]:
# Function to generate topic descriptions using the Gemini AI model
def generate_topic_descriptions(documents, keywords):
    # Call the Gemini AI model to generate content
    response = genai_model.generate_content(
        # Format the prompt template with the provided documents and keywords
        prompt.format(documents=documents, keywords=keywords),
        # Specify that the response should be in JSON format
        generation_config={'response_mime_type':'application/json'}
    )
    # Parse the JSON response and return it as a Python dictionary
    return json.loads(response.text)

# Initialize an empty list to store the generated topic descriptions
topic_descriptions = []

# Iterate over each row in the topic_information DataFrame
# tqdm is used to display a progress bar
for row in tqdm(topic_information.itertuples()):
    # Extract the keywords (topic representation) for the current topic
    keywords = row.Representation
    
    # Extract the representative documents for the current topic
    documents = row.Representative_Docs
    
    # Generate a description for the current topic using the AI model
    # and append the result to the topic_descriptions list
    topic_descriptions.append(generate_topic_descriptions(documents, keywords))
    
    # Pause for 2 seconds to implement basic rate limiting
    # This helps avoid overwhelming the API with too many requests in a short time
    time.sleep(2)

In [51]:
topic_information['topic_label'] = [item['topic_label'] for item in topic_descriptions]
topic_information['description'] = [item['description'] for item in topic_descriptions]

In [60]:
topic_descriptions

[{'topic_label': 'Shopping & Sorting',
  'description': 'This topic focuses on online shopping experiences, particularly the use of sorting and filtering options to refine product searches.'},
 {'topic_label': 'E-commerce Sorting',
  'description': 'This topic focuses on sorting and filtering options available on e-commerce websites, including parameters like price, popularity, and newest arrivals.'},
 {'topic_label': 'Shopping & Sorting',
  'description': 'This topic focuses on online shopping and product sorting, analyzing user interactions with features like sorting options, filters, and coupon codes.'},
 {'topic_label': 'Website Navigation',
  'description': 'This topic focuses on website features related to sorting, filtering, and other navigation elements, often used for online shopping or information retrieval.'},
 {'topic_label': 'E-commerce Sorting',
  'description': 'This topic focuses on the sorting and filtering options available on e-commerce websites, allowing users to re

In [55]:
document_information = model.get_document_info(session_information["merged_information"])

In [58]:
document_information = document_information.merge(topic_information[['Topic','topic_label','description']], on='Topic')

In [61]:
document_information.head()

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Representative_document,topic_label,description
0,loginQuality true throw./shop/new-arrivalscolo...,0,0_fact_prevent_spend_program,"[fact, prevent, spend, program, without, sort,...",[track_orderRepublican Mrs at risk small./faqs...,fact - prevent - spend - program - without - s...,False,Shopping & Sorting,This topic focuses on online shopping experien...
1,return_initiateParticularly city story./shop/c...,2,2_speech_pay_usually_imagine,"[speech, pay, usually, imagine, by, little, th...",[track_orderPlace speech whole book point./ter...,speech - pay - usually - imagine - by - little...,False,Shopping & Sorting,This topic focuses on online shopping and prod...
2,checkout_startCover whom lay./accountsort=popu...,0,0_fact_prevent_spend_program,"[fact, prevent, spend, program, without, sort,...",[track_orderRepublican Mrs at risk small./faqs...,fact - prevent - spend - program - without - s...,False,Shopping & Sorting,This topic focuses on online shopping experien...
3,store_locatorProperty continue interest cost./...,1,1_dog_cell_election_ago,"[dog, cell, election, ago, oil, industry, leas...",[gift_card_redeemStand oil agent able church./...,dog - cell - election - ago - oil - industry -...,False,E-commerce Sorting,This topic focuses on sorting and filtering op...
4,page_viewArrive area./account/subscriptiondisc...,0,0_fact_prevent_spend_program,"[fact, prevent, spend, program, without, sort,...",[track_orderRepublican Mrs at risk small./faqs...,fact - prevent - spend - program - without - s...,False,Shopping & Sorting,This topic focuses on online shopping experien...
