# 1. Library Installation

In [None]:
!pip install -q bertopic

# 2. Reading Dataset

In [None]:
from bertopic import BERTopic
import pandas as pd
import time
import os

base_dir = '../dataset/phase 3/'
file_path = os.path.join(base_dir, 'topic_modelling_dataset.csv')

df = pd.read_csv(file_path)

# 3. Topic Modelling

In [None]:
import time
from bertopic import BERTopic

def run_topic_modeling(df, column, min_topic_size, min_nr_topics=None):
    """
    Perform topic modeling on a text column of a pandas DataFrame using BERTopic.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the textual data. It is NOT modified;
      the function works on (and returns) a copy, so it is safe to pass a filtered slice
      such as `result[result.cluster == 1]`.
    - column (str): The name of the column in `df` that contains the text to be analyzed.
      Values are cast to `str` before fitting.
    - min_topic_size (int): Forwarded to BERTopic as `min_topic_size`, whether or not
      `min_nr_topics` is given.
    - min_nr_topics (int, optional): When truthy, forwarded to BERTopic as `nr_topics`
      (see the BERTopic documentation for the exact semantics of that parameter).
      When None, `nr_topics` is left at BERTopic's default.

    Returns:
    - tuple: A tuple containing the following elements:
        - topic_info (pandas.DataFrame): The result of `topic_model.get_topic_info()`.
        - df (pandas.DataFrame): A copy of the input with `column` cast to str and a
          'cluster' column holding the topic assigned to each row.

    The function also prints the time taken by `fit_transform`.
    """
    embedding_model = "all-MiniLM-L6-v2"

    # Build the constructor arguments once so both configurations share the same settings
    model_kwargs = {
        "min_topic_size": min_topic_size,
        "embedding_model": embedding_model,
        "verbose": True,
    }
    if min_nr_topics:
        model_kwargs["nr_topics"] = min_nr_topics
    topic_model = BERTopic(**model_kwargs)

    # Work on a private copy: callers pass boolean-filtered slices, and assigning columns
    # on those raises SettingWithCopyWarning and leaves the write-through ambiguous
    df = df.copy()
    df[column] = df[column].astype(str)

    # Hand BERTopic a plain list of strings (its documented input type) so the
    # non-contiguous index of a filtered frame plays no role
    documents = df[column].tolist()

    # perf_counter is monotonic, so the measured interval cannot go negative on clock changes
    start_time = time.perf_counter()
    topics, _ = topic_model.fit_transform(documents)
    time_taken = time.perf_counter() - start_time
    print(f"Time taken for training: {time_taken} seconds")

    topic_info = topic_model.get_topic_info()
    df['cluster'] = topics

    return topic_info, df


In [None]:
topic_info, result = run_topic_modeling(df, 'content_short', 100)

In [None]:
topic_info

In [None]:
result

In [None]:
save_dir = '../dataset/phase 4/'
# to_excel does not create missing folders, so make sure the output directory exists
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info.xlsx')
file_path_2 = os.path.join(save_dir, 'raw_topic_modelling_result.xlsx')

# Persist the topic summary and the per-document assignments without the index column
topic_info.to_excel(file_path_1, index=False)
result.to_excel(file_path_2, index=False)

## 3.1. Combine Similar Topics

In [None]:
import pandas as pd

base_dir = '../dataset/phase 4/'
file_path = os.path.join(base_dir, 'raw_topic_modelling_result.xlsx')

result = pd.read_excel(file_path)

In [None]:
# Manual merge table: merged cluster id -> list of raw topic ids folded into it
clusters = {
    1: [0, 3, 7, 30, 41, 42, 43, 49, 50],
    2: [27, 28, 32],
    3: [1, 2, 13, 14, 22, 35, 48],
    4: [4, 6, 9, 12, 19, 26, 37, 53],
    5: [11],
    6: [15, 36],
    7: [24],
    8: [16],
    9: [18],
    10: [20, 44],
    11: [33, 46],
    12: [23],
    13: [34],
    14: [40],
    15: [38],
    16: [8],
    17: [39],
    18: [10],
    19: [5],
    20: [17],
    21: [21, 25, 52],
    22: [51],
}

# Invert the table so every raw topic id points at its merged cluster id
topic_to_cluster = dict(
    (topic_id, cluster_id)
    for cluster_id, topic_ids in clusters.items()
    for topic_id in topic_ids
)

# Relabel in place; any topic id missing from the table falls back to cluster 0
result['cluster'] = result['cluster'].map(
    lambda topic_id: topic_to_cluster.get(topic_id, 0)
)
result

# 4. Refining Cluster

## 4.1. Cluster 1

In [None]:
subset_df = result[result.cluster == 1]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_1 = result_subset_df[result_subset_df.cluster.isin([4, 7, 8, 10, 12, 14, 15, 16, 18, 28])].reset_index(drop = True)
df_1['cluster'] = 1
df_1

In [None]:
save_dir = '../dataset/phase 4/partition/'
# to_excel does not create missing folders, so make sure the partition directory exists
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_1.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_1.xlsx')

# Persist the sub-topic summary and the rows kept for cluster 1, without the index column
topic_info.to_excel(file_path_1, index=False)
df_1.to_excel(file_path_2, index=False)

## 4.2. Cluster 2

In [None]:
subset_df = result[result.cluster == 2]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representation:
  print(rep)

In [None]:
df_2 = result_subset_df[result_subset_df.cluster.isin([-1, 0])].reset_index(drop = True)
df_2['cluster'] = 2
df_2

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_2.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_2.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_2.to_excel(file_path_2, index=False)

## 4.3. Cluster 3

In [None]:
subset_df = result[result.cluster == 3]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representation:
  print(rep,'\n')

In [None]:
df_3 = result_subset_df.reset_index(drop = True)
df_3['cluster'] = 3
df_3

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_3.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_3.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_3.to_excel(file_path_2, index=False)

## 4.4. Cluster 4

In [None]:
subset_df = result[result.cluster == 4]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representation:
  print(rep,'\n')

In [None]:
df_4 = result_subset_df.reset_index(drop = True)
df_4['cluster'] = 4
df_4

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_4.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_4.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_4.to_excel(file_path_2, index=False)

## 4.5. Cluster 5

In [None]:
subset_df = result[result.cluster == 5]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
  print(rep,'\n')

In [None]:
df_5 = result_subset_df.reset_index(drop = True)
df_5['cluster'] = 5
df_5

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_5.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_5.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_5.to_excel(file_path_2, index=False)

## 4.6. Cluster 6

In [None]:
subset_df = result[result.cluster == 6]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
  print(rep,'\n')

In [None]:
df_6 = result_subset_df.reset_index(drop = True)
df_6['cluster'] = 6
df_6

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_6.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_6.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_6.to_excel(file_path_2, index=False)

## 4.7. Cluster 7

In [None]:
subset_df = result[result.cluster == 7]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
  print(rep,'\n')

In [None]:
df_7 = result_subset_df.reset_index(drop = True)
df_7['cluster'] = 7
df_7

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_7.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_7.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_7.to_excel(file_path_2, index=False)

## 4.8. Cluster 8

In [None]:
subset_df = result[result.cluster == 8]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
  print(rep,'\n')

In [None]:
df_8 = result_subset_df[result_subset_df.cluster.isin([-1, 1])].reset_index(drop = True)
df_8['cluster'] = 8
df_8

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_8.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_8.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_8.to_excel(file_path_2, index=False)

## 4.9. Cluster 9

In [None]:
subset_df = result[result.cluster == 9]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_9 = result_subset_df.reset_index(drop = True)
df_9['cluster'] = 9
df_9

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_9.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_9.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_9.to_excel(file_path_2, index=False)

## 4.10. Cluster 10

In [None]:
subset_df = result[result.cluster == 10]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
    print(rep,'\n')

In [None]:
df_10 = result_subset_df.reset_index(drop = True)
df_10['cluster'] = 10
df_10

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_10.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_10.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_10.to_excel(file_path_2, index=False)

## 4.11. Cluster 11

In [None]:
subset_df = result[result.cluster == 11]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
  print(rep,'\n')

In [None]:
df_11 = result_subset_df.reset_index(drop = True)
df_11['cluster'] = 11
df_11

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_11.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_11.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_11.to_excel(file_path_2, index=False)

## 4.12. Cluster 12

In [None]:
subset_df = result[result.cluster == 12]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_12 = result_subset_df.reset_index(drop = True)
df_12['cluster'] = 12
df_12

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_12.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_12.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_12.to_excel(file_path_2, index=False)

## 4.13. Cluster 13

In [None]:
subset_df = result[result.cluster == 13]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_13 = result_subset_df.reset_index(drop = True)
df_13['cluster'] = 13
df_13

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_13.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_13.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_13.to_excel(file_path_2, index=False)

## 4.14. Cluster 14

In [None]:
subset_df = result[result.cluster == 14]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_14 = result_subset_df.reset_index(drop = True)
df_14['cluster'] = 14
df_14

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_14.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_14.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_14.to_excel(file_path_2, index=False)

## 4.15. Cluster 15

In [None]:
subset_df = result[result.cluster == 15]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_15 = result_subset_df.reset_index(drop = True)
df_15['cluster'] = 15
df_15

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_15.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_15.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_15.to_excel(file_path_2, index=False)

## 4.16. Cluster 16

In [None]:
subset_df = result[result.cluster == 16]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representation:
  print(rep,'\n')

In [None]:
df_16 = result_subset_df.reset_index(drop = True)
df_16['cluster'] = 16
df_16

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_16.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_16.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_16.to_excel(file_path_2, index=False)

## 4.17. Cluster 17

In [None]:
subset_df = result[result.cluster == 17]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_17 = result_subset_df.reset_index(drop = True)
df_17['cluster'] = 17
df_17

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_17.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_17.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_17.to_excel(file_path_2, index=False)

## 4.18. Cluster 18

In [None]:
subset_df = result[result.cluster == 18]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_18 = result_subset_df.reset_index(drop = True)
df_18['cluster'] = 18
df_18

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_18.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_18.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_18.to_excel(file_path_2, index=False)

## 4.19. Cluster 19

In [None]:
subset_df = result[result.cluster == 19]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
subset_df = result_subset_df[result_subset_df.cluster.isin([0])].reset_index(drop = True)
topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_19 = result_subset_df[result_subset_df.cluster.isin([1])].reset_index(drop = True)
df_19['cluster'] = 19
df_19

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_19.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_19.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_19.to_excel(file_path_2, index=False)

## 4.20. Cluster 20

In [None]:
subset_df = result[result.cluster == 20]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
    print(rep,'\n')

In [None]:
df_20 = result_subset_df.reset_index(drop = True)
df_20['cluster'] = 20
df_20

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_20.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_20.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_20.to_excel(file_path_2, index=False)

## 4.21. Cluster 21

In [None]:
subset_df = result[result.cluster == 21]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
for rep in topic_info.Representative_Docs:
    print(rep,'\n')

In [None]:
df_21 = result_subset_df.reset_index(drop = True)
df_21['cluster'] = 21
df_21

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_21.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_21.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_21.to_excel(file_path_2, index=False)

## 4.22. Cluster 22

In [None]:
subset_df = result[result.cluster == 22]

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

In [None]:
topic_info

In [None]:
df_22 = result_subset_df.reset_index(drop = True)
df_22['cluster'] = 22
df_22

In [None]:
save_dir = '../dataset/phase 4/partition/'
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_22.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_22.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_22.to_excel(file_path_2, index=False)

# 5. Combine All Refined Cluster Dataset

In [None]:
import pandas as pd

# Directory holding the per-cluster partition workbooks
base_path = '../dataset/phase 4/partition/'

# One workbook per refined cluster; range(1, 23) covers cluster ids 1 through 22
file_paths = [f'{base_path}result_cluster_{i}.xlsx' for i in range(1, 23)]

# Read every partition. A comprehension keeps the loop variables local, so the
# notebook-level `df` and `file_path` are not rebound to the last partition.
dfs = [pd.read_excel(path) for path in file_paths]

# Stack all partitions into one frame with a fresh, contiguous 0..n-1 index
df_combined = pd.concat(dfs, ignore_index=True)


In [None]:
df_combined

# 6. Sentence Embedding

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import time

# Load the "all-MiniLM-L6-v2" model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Texts to embed, taken from df_combined['content'] in row order. The cast to str
# mirrors what run_topic_modeling does before fitting; presumably cells read back
# from Excel can be NaN or numeric, which the encoder would not accept — TODO confirm.
reviews = df_combined['content'].astype(str).tolist()

# Define batch size
batch_size = 500

# Placeholder for accumulated embeddings (one array per batch)
all_embeddings = []

# Process data in batches so progress and per-batch timing are visible
for i in range(0, len(reviews), batch_size):
    batch = reviews[i:i+batch_size]
    start_time = time.perf_counter()
    batch_embeddings = model.encode(batch, show_progress_bar=False)
    end_time = time.perf_counter()

    # Append the embeddings of the current batch
    all_embeddings.append(batch_embeddings)

    print(f"Batch {i//batch_size + 1} processed in {end_time - start_time:.2f} seconds.")

# np.vstack fails with an unhelpful message on an empty list, so report the real cause
if not all_embeddings:
    raise ValueError("df_combined['content'] is empty; there is nothing to embed")

# Concatenate all batch embeddings into a single array (row i <-> df_combined row i)
all_embeddings = np.vstack(all_embeddings)

# Save the embeddings to a .npy file (despite its name, `save_dir` holds a file path)
save_dir = '../dataset/phase 4/review_embeddings.npy'
np.save(save_dir, all_embeddings)


In [None]:
import numpy as np

base_dir = '../dataset/phase 4/review_embeddings.npy'
all_embeddings = np.load(base_dir)
all_embeddings[0:5]

# 7. Distance Calculation

In [None]:
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

# Embeddings are matched to df_combined rows purely by position, so a length
# mismatch means the two are out of sync and every distance would be wrong
if len(all_embeddings) != len(df_combined):
    raise ValueError(
        f"all_embeddings has {len(all_embeddings)} rows but df_combined has "
        f"{len(df_combined)}; recompute the embeddings for the current df_combined"
    )

# Start from NaN so rows whose cluster is never visited are easy to spot
df_combined['distance_to_its_centroid'] = np.nan

# Positional labels: a boolean mask over this array selects the same rows in both
# df_combined and all_embeddings, whatever df_combined's index labels are
cluster_labels = df_combined['cluster'].to_numpy()

for cluster_num in range(1, 23):
    # Rows of df_combined (and of all_embeddings) that belong to this cluster
    member_mask = cluster_labels == cluster_num

    # A cluster with no rows has no centroid; leave nothing to update
    if not member_mask.any():
        continue

    embeddings_cluster = all_embeddings[member_mask]

    # The centroid of a single cluster is the mean of its members; this is what
    # KMeans(n_clusters=1) converges to, up to floating-point rounding
    centroid = embeddings_cluster.mean(axis=0)

    # Euclidean distance from each document's embedding to the cluster centroid
    distances = cdist(embeddings_cluster, [centroid], 'euclidean').flatten()

    # Write the distances back to exactly the rows they were computed from
    df_combined.loc[member_mask, 'distance_to_its_centroid'] = distances


In [None]:
# Order rows by cluster, then by increasing distance to the cluster centroid.
# NOTE: after this sort + reset_index, row i of df_combined no longer lines up with
# all_embeddings[i] (the embeddings were computed in the pre-sort row order), so the
# distance cell must not be re-run on this reordered frame without re-embedding first.
df_combined = df_combined.sort_values(by=['cluster', 'distance_to_its_centroid'], ascending=[True, True]).reset_index(drop = True)

In [None]:
df_combined

In [None]:
for review in df_combined[df_combined.cluster == 1].head(50)['content']:
    print('\n', review)

In [None]:
save_dir = '../dataset/phase 4/'
file_path = os.path.join(save_dir, 'topic_modelling_result.xlsx')

df_combined.to_excel(file_path, index=False)

# 8. Counting Total Reviews Based on Cluster

In [None]:
import pandas as pd
import time
import os

base_dir = '../dataset/phase 4/'
file_path = os.path.join(base_dir, 'topic_modelling_result.xlsx')

df = pd.read_excel(file_path)

In [None]:
# Count rows per cluster. size() counts every row, whereas count() on the 'app'
# column would skip rows where 'app' is missing and under-report the totals.
# The count column keeps the 'app' header so the pasted table layout is unchanged.
df.groupby('cluster').size().reset_index(name='app').to_clipboard(index=False)