# 1. Library Installation

In [None]:
!pip install -q bertopic

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for hdbscan (pyproject.toml) ... [?25l[?25hdone
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone


# 2. Reading Dataset

In [1]:
from bertopic import BERTopic
import pandas as pd
import time
import os

# Input location, relative to the notebook's working directory
base_dir = '../dataset/phase 3/'
file_path = os.path.join(base_dir, 'topic_modelling_dataset.csv')

# Load the dataset that the topic-modelling cells below operate on
df = pd.read_csv(file_path)

# 3. Topic Modelling

In [None]:
import time
from bertopic import BERTopic

def run_topic_modeling(df, column, min_topic_size, min_nr_topics=None):
    """
    Perform topic modeling on a specified column in a pandas DataFrame using BERTopic.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the textual data. It is left
      untouched; all changes are made on a copy, so it is safe to pass a filtered
      selection of another DataFrame.
    - column (str): The name of the column in `df` that contains the text to be analyzed.
      Its values are converted with `astype(str)` before being handed to the model.
    - min_topic_size (int): The minimum size of the topics. Always forwarded to BERTopic.
    - min_nr_topics (int, optional): Forwarded to BERTopic as `nr_topics` when truthy.
      If None, the number of topics will be determined based on the data and `min_topic_size`.

    Returns:
    - tuple: A tuple containing the following elements:
        - topic_info (pandas.DataFrame): The result of `get_topic_info()` on the fitted model.
        - df (pandas.DataFrame): A copy of the input DataFrame with `column` cast to str and a
          'cluster' column indicating the topic assignment for each document.

    The function also prints the time taken for the topic modeling process.
    """
    embedding_model = "all-MiniLM-L6-v2"

    # min_topic_size is applied in both configurations; nr_topics is only added on request
    model_kwargs = {
        "min_topic_size": min_topic_size,
        "embedding_model": embedding_model,
        "verbose": True,
    }
    if min_nr_topics:
        model_kwargs["nr_topics"] = min_nr_topics
    topic_model = BERTopic(**model_kwargs)

    # Work on a copy so the caller's frame (often a slice of a larger one) is never modified
    labelled_df = df.copy()

    start_time = time.perf_counter()  # monotonic clock, suited to measuring durations
    labelled_df[column] = labelled_df[column].astype(str)
    # Hand over a plain list of strings so the model does not see the frame's index
    topics, _ = topic_model.fit_transform(labelled_df[column].tolist())
    time_taken = time.perf_counter() - start_time
    print(f"Time taken for training: {time_taken} seconds")

    topic_info = topic_model.get_topic_info()
    # `topics` holds one topic id per document, in row order
    labelled_df['cluster'] = topics

    return topic_info, labelled_df


In [None]:
topic_info, result = run_topic_modeling(df, 'content_short', 100)

2024-02-15 22:02:25,148 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/881 [00:00<?, ?it/s]

2024-02-15 22:04:55,075 - BERTopic - Embedding - Completed ✓
2024-02-15 22:04:55,081 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 22:05:33,315 - BERTopic - Dimensionality - Completed ✓
2024-02-15 22:05:33,318 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 22:05:40,356 - BERTopic - Cluster - Completed ✓
2024-02-15 22:05:40,371 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 22:05:40,705 - BERTopic - Representation - Completed ✓


Time taken for training: 195.91313433647156 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9128,-1_it_very_no_not,"[it, very, no, not, to, you, and, accurate, th...","[So great and easy to use, Easy and a good.to ..."
1,0,2571,0_app_this_great_apps,"[app, this, great, apps, application, love, go...","[app, app, app]"
2,1,1647,1_earthquake_earthquakes_quakes_quake,"[earthquake, earthquakes, quakes, quake, of, t...","[I like to know where the earthquake are, Eart..."
3,2,1107,2_weather_hurricane_storms_hurricanes,"[weather, hurricane, storms, hurricanes, storm...","[weather app, weather app, weather app]"
4,3,941,3_information_easy_info_use,"[information, easy, info, use, simple, informa...","[Easy to use, accurate, great information, It’..."
5,4,851,4_alerts_alert_not_getting,"[alerts, alert, not, getting, get, to, an, ale...","[I like the alerts, It not alert, 0 alerts]"
6,5,682,5_muy_nicht_la_language,"[muy, nicht, la, language, de, buena, english,...","[Muy buena aplicación, Muy buena aplicación, G..."
7,6,600,6_notifications_notification_notify_notified,"[notifications, notification, notify, notified...","[love the notifications, get no notifications,..."
8,7,568,7_better_good_best_so,"[better, good, best, so, it, improve, far, do,...","[It is better, good as can be one of the best,..."
9,8,567,8_map_maps_satellite_the,"[map, maps, satellite, the, zoom, view, google...","[map, map, map]"


In [None]:
result

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Working as a Public Health Nurse I get to resp...,5,I may have lost and/or procured gadgets to aid...,-1
1,Disaster Alert,Nice to have before traveling to unknown terri...,5,Nice to have before traveling to unknown terri...,-1
2,Disaster Alert,I like! I'm trying to find anything about tsun...,5,tsunami,-1
3,Disaster Alert,good to have but what options are expected in ...,5,good to have,-1
4,Disaster Alert,Shows hazards all right but refuses to send no...,2,Useless to me without notifications,6
...,...,...,...,...,...
28156,Earthquake,It works quite well even anticipates some othe...,5,works quite well,42
28157,Earthquake,This application is very good.,5,This application is very good,0
28158,Earthquake,Data from earthquakes in Chile in the last 24 ...,2,the application is not updating the telluric a...,0
28159,Earthquake,This is as good as earthquake apps can go. Thi...,5,detailed info on many earthquakes that even ot...,13


In [None]:
save_dir = '../dataset/phase 4/'
# Create the output folder on first run; to_excel does not create missing directories
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info.xlsx')
file_path_2 = os.path.join(save_dir, 'raw_topic_modelling_result.xlsx')

# Persist the topic overview and the per-document topic assignments
topic_info.to_excel(file_path_1, index=False)
result.to_excel(file_path_2, index=False)

## 3.1. Combine Similar Topics

In [None]:
import pandas as pd

# Read the per-document topic assignments back from the phase-4 folder
base_dir = '../dataset/phase 4/'
file_path = os.path.join(base_dir, 'raw_topic_modelling_result.xlsx')

result = pd.read_excel(file_path)

In [None]:
# Manual grouping of topic ids into broader clusters:
# key = merged cluster id, value = list of topic ids folded into it
clusters = {
    1: [0, 3, 7, 30, 41, 42, 43, 49, 50],
    2: [27, 28, 32],
    3: [1, 2, 13, 14, 22, 35, 48],
    4: [4, 6, 9, 12, 19, 26, 37, 53],
    5: [11],
    6: [15, 36],
    7: [24],
    8: [16],
    9: [18],
    10: [20, 44],
    11: [33, 46],
    12: [23],
    13: [34],
    14: [40],
    15: [38],
    16: [8],
    17: [39],
    18: [10],
    19: [5],
    20: [17],
    21: [21, 25, 52],
    22: [51]
}

# Flatten the grouping into a topic id -> cluster id lookup table
topic_to_cluster = {
    topic: cluster
    for cluster, topics in clusters.items()
    for topic in topics
}

# Replace each topic id by its cluster id; ids absent from the table (e.g. -1) become cluster 0
result['cluster'] = (
    result['cluster']
    .map(topic_to_cluster)
    .fillna(0)
    .astype('int64')
)
result

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Working as a Public Health Nurse I get to resp...,5,I may have lost and/or procured gadgets to aid...,0
1,Disaster Alert,Nice to have before traveling to unknown terri...,5,Nice to have before traveling to unknown terri...,0
2,Disaster Alert,I like! I'm trying to find anything about tsun...,5,tsunami,0
3,Disaster Alert,good to have but what options are expected in ...,5,good to have,0
4,Disaster Alert,Shows hazards all right but refuses to send no...,2,Useless to me without notifications,4
...,...,...,...,...,...
28156,Earthquake,It works quite well even anticipates some othe...,5,works quite well,1
28157,Earthquake,This application is very good.,5,This application is very good,1
28158,Earthquake,Data from earthquakes in Chile in the last 24 ...,2,the application is not updating the telluric a...,1
28159,Earthquake,This is as good as earthquake apps can go. Thi...,5,detailed info on many earthquakes that even ot...,3


# 4. Refining Clusters

## 4.1. Cluster 1

In [None]:
# Explicit copy so the subset is an independent frame rather than a selection of `result`
subset_df = result[result.cluster == 1].copy()

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

2024-02-15 22:53:31,170 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/467 [00:00<?, ?it/s]

2024-02-15 22:58:30,959 - BERTopic - Embedding - Completed ✓
2024-02-15 22:58:30,962 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 22:58:46,122 - BERTopic - Dimensionality - Completed ✓
2024-02-15 22:58:46,124 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 22:58:46,983 - BERTopic - Cluster - Completed ✓
2024-02-15 22:58:46,994 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 22:58:47,773 - BERTopic - Representation - Completed ✓


Time taken for training: 316.9733695983887 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4843,-1_it_to_the_app,"[it, to, the, app, and, is, this, you, very, for]","[it is a very good app, i use it almost every ..."
1,0,2389,0_earthquake_earthquakes_the_in,"[earthquake, earthquakes, the, in, and, to, it...","[Good app to the earthquake, I always keep the..."
2,1,1353,1_alerts_the_notifications_alert,"[alerts, the, notifications, alert, to, and, a...",[Update: This update does nothing to address h...
3,2,813,2_open_it_crashing_app,"[open, it, crashing, app, working, crashes, up...",[The latest update to this app has an issue. T...
4,3,802,3_app_this_great_love,"[app, this, great, love, very, good, easy, use...","[This App is Very Good , This is a great i lov..."
5,4,590,4_fire_fires_the_is,"[fire, fires, the, is, to, app, and, of, on, t...",[It sends a notification about a fire but the ...
6,5,574,5_hurricane_weather_the_and,"[hurricane, weather, the, and, app, storms, hu...",[I've tried a lot of hurricane apps and this o...
7,6,360,6_in_app_rico_puerto,"[in, app, rico, puerto, live, this, world, gre...",[I like this app I know what's going on in the...
8,7,306,7_informative_app_information_info,"[informative, app, information, info, very, gr...","[This app is great and informative., This is a..."
9,8,285,8_easy_use_informative_to,"[easy, use, informative, to, and, very, simple...","[Very informative and easy to use!, Easy to us..."


In [None]:
# Keep only the rows whose refined topic id is in the hand-picked list, renumber them from 0
# and label every kept row as cluster 1
df_1 = (
    result_subset_df[result_subset_df['cluster'].isin([4, 7, 8, 10, 12, 14, 15, 16, 18, 28])]
    .reset_index(drop=True)
    .assign(cluster=1)
)
df_1

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Working as a Public Health Nurse I get to resp...,5,I may have lost and/or procured gadgets to aid...,1
1,Disaster Alert,It's a good app but not so useful. I was expec...,3,not so useful,1
2,Disaster Alert,I love natural disasters they're so fascinatin...,5,autism,1
3,Disaster Alert,Thanks to the creator of this app..it was very...,5,..,1
4,Disaster Alert,Very accurate and very useful. More power God ...,5,Very accurate and very useful,1
...,...,...,...,...,...
2189,Earthquake,"Wonderful tool, ideal complement of astrocarto...",5,"Wonderful tool, ideal complement of astrocarto...",1
2190,Earthquake,I like to supply all the necessary information,5,I like to supply all the necessary information,1
2191,Earthquake,"It's good, but it takes a while to astualise.",3,"It's good, but it takes a while to astualise",1
2192,Earthquake,Everything very explicit very good,5,Everything very explicit very good,1


In [None]:
save_dir = '../dataset/phase 4/partition/'
# Create the output folder on first run; to_excel does not create missing directories
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_1.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_1.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_1.to_excel(file_path_2, index=False)

## 4.2. Cluster 2

In [None]:
# Explicit copy so the subset is an independent frame rather than a selection of `result`
subset_df = result[result.cluster == 2].copy()

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

2024-02-15 23:07:36,711 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

2024-02-15 23:07:56,484 - BERTopic - Embedding - Completed ✓
2024-02-15 23:07:56,488 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:07:59,848 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:07:59,850 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:07:59,884 - BERTopic - Cluster - Completed ✓
2024-02-15 23:07:59,892 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:07:59,930 - BERTopic - Representation - Completed ✓


Time taken for training: 23.246668100357056 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,81,-1_the_useless_app_it,"[the, useless, app, it, is, of, and, to, in, t...",[Everytime a new update is done it always effe...
1,0,376,0_the_it_to_and,"[the, it, to, and, app, this, in, useless, for...",[I loved this app until I tried to transfer it...
2,1,102,1_app_this_to_good,"[app, this, to, good, it, very, is, great, exc...",[This is the best app.. you need to install th...


In [None]:
# Print the complete keyword list of every topic, one list per line
for rep in topic_info['Representation']:
    print(rep)

['the', 'useless', 'app', 'it', 'is', 'of', 'and', 'to', 'in', 'this']
['the', 'it', 'to', 'and', 'app', 'this', 'in', 'useless', 'for', 'of']
['app', 'this', 'to', 'good', 'it', 'very', 'is', 'great', 'excellent', 'so']


In [None]:
# Keep only the rows assigned to refined topics -1 and 0, renumber them from 0
# and label every kept row as cluster 2
df_2 = (
    result_subset_df[result_subset_df['cluster'].isin([-1, 0])]
    .reset_index(drop=True)
    .assign(cluster=2)
)
df_2

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Thanks to the creator of this app..it was very...,5,..,2
1,Disaster Alert,This thing will not uninstall itself from my p...,1,This thing will not uninstall itself from my p...,2
2,Disaster Alert,By far one of the most informative information...,5,!!..,2
3,Disaster Alert,Uninstalled. Instists I already have account y...,1,Uninstalled,2
4,Disaster Alert,This app is a Disaster itself! Uninstalled!,1,Uninstalled,2
...,...,...,...,...,...
452,Alertswiss,Today on 13.3. three hours after the last pres...,1,useless,2
453,Earthquake,I loved this app until I tried to transfer it ...,5,I had to rebuy the pro version,2
454,Earthquake,This app shows tectonic plates boundaries whic...,5,…..,2
455,Earthquake,I’m able to analyze and see within minutes of ...,5,...,2


In [None]:
save_dir = '../dataset/phase 4/partition/'
# Create the output folder on first run; to_excel does not create missing directories
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_2.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_2.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_2.to_excel(file_path_2, index=False)

## 4.3. Cluster 3

In [None]:
# Explicit copy so the subset is an independent frame rather than a selection of `result`
subset_df = result[result.cluster == 3].copy()

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

2024-02-15 23:09:05,008 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/126 [00:00<?, ?it/s]

2024-02-15 23:10:16,551 - BERTopic - Embedding - Completed ✓
2024-02-15 23:10:16,557 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:10:43,975 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:10:43,985 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:10:44,295 - BERTopic - Cluster - Completed ✓
2024-02-15 23:10:44,307 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:10:44,605 - BERTopic - Representation - Completed ✓


Time taken for training: 99.71659851074219 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,2551,0_the_to_it_app,"[the, to, it, app, earthquake, and, earthquake...",[My spindle and our 2 small Daughters survived...
1,1,1080,1_the_weather_app_to,"[the, weather, app, to, and, hurricane, it, fo...",[This does what I need it to do. It's about st...
2,2,374,2_fire_the_to_fires,"[fire, the, to, fires, and, app, it, of, is, in]",[This is a great app that really helps me unde...


In [None]:
# Print the complete keyword list of every topic, each followed by a blank gap
for rep in topic_info['Representation']:
    print(rep, '\n')

['the', 'to', 'it', 'app', 'earthquake', 'and', 'earthquakes', 'in', 'of', 'this'] 

['the', 'weather', 'app', 'to', 'and', 'hurricane', 'it', 'for', 'this', 'of'] 

['fire', 'the', 'to', 'fires', 'and', 'app', 'it', 'of', 'is', 'in'] 



In [None]:
# All rows of this subset are kept: renumber them from 0 and overwrite the
# refined topic ids with the cluster label 3
df_3 = result_subset_df.reset_index(drop=True).assign(cluster=3)
df_3

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,I enjoy the app informing me about potential c...,4,I enjoy the app informing me about potential c...,3
1,Disaster Alert,Disaster is a very strong and very descriptive...,1,I should have been a meteorologist,3
2,Disaster Alert,I've only had this app on my phone a couple of...,5,I've been aware of tornados and storms before ...,3
3,Disaster Alert,Was great for the most part but didn't detect ...,3,didn't detect a winter storm in my local area ...,3
4,Disaster Alert,View is awesome. Product is difficult to work ...,4,fires show up even when you turn it off. The s...,3
...,...,...,...,...,...
4000,Earthquake,Certain when you know the earthquake and its m...,5,Certain when you know the earthquake and its m...,3
4001,Earthquake,Very good app. It keeps us informed in time of...,5,Very good app. It keeps us informed in time of...,3
4002,Earthquake,It is very good and fast!! It just goes withou...,4,an earthquake is going to happen before it rea...,3
4003,Earthquake,Very good application to use it to be aware of...,5,Very good application to use it to be aware of...,3


In [None]:
save_dir = '../dataset/phase 4/partition/'
# Create the output folder on first run; to_excel does not create missing directories
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_3.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_3.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_3.to_excel(file_path_2, index=False)

## 4.4. Cluster 4

In [None]:
# Explicit copy so the subset is an independent frame rather than a selection of `result`
subset_df = result[result.cluster == 4].copy()

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

2024-02-15 23:11:56,099 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/98 [00:00<?, ?it/s]

2024-02-15 23:12:58,199 - BERTopic - Embedding - Completed ✓
2024-02-15 23:12:58,201 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:13:15,302 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:13:15,304 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:13:15,450 - BERTopic - Cluster - Completed ✓
2024-02-15 23:13:15,458 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:13:15,598 - BERTopic - Representation - Completed ✓


Time taken for training: 79.58853435516357 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,510,-1_the_to_it_and,"[the, to, it, and, notifications, alerts, of, ...",[The means by which you can modify the alerts ...
1,0,688,0_notifications_the_to_it,"[notifications, the, to, it, alerts, app, and,...",[Previously used this app on a Samsung Galaxy ...
2,1,671,1_date_up_to_updates,"[date, up, to, updates, and, it, the, update, ...",[This is up to date for the accurate informati...
3,2,538,2_earthquake_the_it_to,"[earthquake, the, it, to, and, app, earthquake...",[What's the use of the app if you send the ale...
4,3,434,3_the_to_for_and,"[the, to, for, and, alerts, app, of, is, in, it]",[I received a Tsunami warning for my area over...
5,4,142,4_fire_the_to_fires,"[fire, the, to, fires, app, is, and, it, in, w...",[Good app but I am getting up to 50+ notificat...
6,5,82,5_app_alerts_the_alert,"[app, alerts, the, alert, this, to, it, and, g...",[The app seems pretty good although I've not h...
7,6,64,6_warnings_warning_the_app,"[warnings, warning, the, app, this, of, you, b...",[Some have complained about the warnings comin...


In [None]:
# Print the complete keyword list of every topic, each followed by a blank gap
for rep in topic_info['Representation']:
    print(rep, '\n')

['the', 'to', 'it', 'and', 'notifications', 'alerts', 'of', 'app', 'for', 'is'] 

['notifications', 'the', 'to', 'it', 'alerts', 'app', 'and', 'not', 'but', 'alert'] 

['date', 'up', 'to', 'updates', 'and', 'it', 'the', 'update', 'app', 'very'] 

['earthquake', 'the', 'it', 'to', 'and', 'app', 'earthquakes', 'in', 'of', 'this'] 

['the', 'to', 'for', 'and', 'alerts', 'app', 'of', 'is', 'in', 'it'] 

['fire', 'the', 'to', 'fires', 'app', 'is', 'and', 'it', 'in', 'watch'] 

['app', 'alerts', 'the', 'alert', 'this', 'to', 'it', 'and', 'good', 'great'] 




In [None]:
# All rows of this subset are kept: renumber them from 0 and overwrite the
# refined topic ids with the cluster label 4
df_4 = result_subset_df.reset_index(drop=True).assign(cluster=4)
df_4

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Shows hazards all right but refuses to send no...,2,Useless to me without notifications,4
1,Disaster Alert,Good app. Notifications are clear. Easy to use.,4,Notifications are clear,4
2,Disaster Alert,It's good but it doesn't update fast,4,it doesn't update fast,4
3,Disaster Alert,I love this app. Needs a defcon meter alert so...,5,Needs a defcon meter alert,4
4,Disaster Alert,Great app could be better on alerts and would ...,5,alerts and would be cool if you can set alerts,4
...,...,...,...,...,...
3124,Earthquake,If alerts arrive but not immediately,5,alerts arrive but not immediately,4
3125,Earthquake,Your notifications are 20 to 30 minutes after ...,5,can't be alert,4
3126,Earthquake,Every time I check I have to re-enter the sear...,1,I don’t want notifications for Indonesia,4
3127,Earthquake,Maximum in up-to-date information from all ove...,5,up-to-date information from all over the world.,4


In [None]:
save_dir = '../dataset/phase 4/partition/'
# Create the output folder on first run; to_excel does not create missing directories
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_4.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_4.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_4.to_excel(file_path_2, index=False)

## 4.5. Cluster 5

In [None]:
# Explicit copy so the subset is an independent frame rather than a selection of `result`
subset_df = result[result.cluster == 5].copy()

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

2024-02-15 23:14:26,227 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

2024-02-15 23:14:42,120 - BERTopic - Embedding - Completed ✓
2024-02-15 23:14:42,124 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:14:45,602 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:14:45,608 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:14:45,646 - BERTopic - Cluster - Completed ✓
2024-02-15 23:14:45,656 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:14:45,720 - BERTopic - Representation - Completed ✓


Time taken for training: 19.536670446395874 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,136,-1_the_it_to_and,"[the, it, to, and, sound, in, but, of, doesn, is]",[I love this app but when alarm goes off it’s ...
1,0,190,0_the_to_it_app,"[the, to, it, app, and, earthquake, of, alarm,...",[I downloaded the app that I had been using be...
2,1,136,1_sound_the_to_it,"[sound, the, to, it, app, notification, but, y...",[Great app but the notification sound is loud ...


In [None]:
# Print the representative documents of every topic in full, each followed by a blank gap
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

['I love this app but when alarm goes off it’s not very loud for when you might be sleeping. My volume is u all the way and it’s just a short bleep. Not sure how this would help older hard of hearing people because sound is not alerting like an amber alert is. Maybe change the sound of the alert or make it louder as it sounds like I got mail or a text...  Thank You', "For the comments about setting the volume of the alarm the volume used by this app is the same that is used for media such as listening to music which is different than the ringtone volume. Going to Settings>Sound>Volumes and setting the Music video games & other media volume to zero will mute the alarm.  But that issue aside I'm finding similar issues that others are having such as very sluggish interface and high CPU usage. Watchdog reports CodeRED's CPU usage around 10-35% even while the app is running in the background. [UPDATE: This is the performance I'm getting at home on wifi with GPS turned off.]  Uninstalling bu

In [None]:
# All rows of this subset are kept: renumber them from 0 and overwrite the
# refined topic ids with the cluster label 5
df_5 = result_subset_df.reset_index(drop=True).assign(cluster=5)
df_5

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Not what I expected based on the description. ...,3,aren't loud enough to alert anyone to anything,5
1,Disaster Alert,•inter.stor. *ONLY* for: widgets; boot/auto la...,3,Main App GUI: .JPG .WAV media libs,5
2,Disaster Alert,Just got it. Let you know if it deserves 5 sta...,4,Doesn't sound for any alerts,5
3,Earthquake Alert!,Works great. Very happy. On S22 Ultra cannot s...,5,cannot set sound notification,5
4,Earthquake Alert!,Wow this is a great app to keep track of earth...,5,almost hitting alarm sounded,5
...,...,...,...,...,...
457,Earthquake,It would be good if you had the option to rece...,5,sound alerts,5
458,Earthquake,Good APP has helped us a lot Hundred not being...,5,not being able to modify the warning sound,5
459,Earthquake,"It could get better, as the alarm doesn't ring...",3,the alarm doesn't ring,5
460,Earthquake,Just need to have the alarm.,5,Just need to have the alarm,5


In [None]:
save_dir = '../dataset/phase 4/partition/'
# Create the output folder on first run; to_excel does not create missing directories
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_5.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_5.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_5.to_excel(file_path_2, index=False)

## 4.6. Cluster 6

In [None]:
# Explicit copy so the subset is an independent frame rather than a selection of `result`
subset_df = result[result.cluster == 6].copy()

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

2024-02-15 23:15:56,791 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

2024-02-15 23:16:06,684 - BERTopic - Embedding - Completed ✓
2024-02-15 23:16:06,688 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:16:09,907 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:16:09,909 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:16:09,932 - BERTopic - Cluster - Completed ✓
2024-02-15 23:16:09,938 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:16:09,967 - BERTopic - Representation - Completed ✓


Time taken for training: 13.203384637832642 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,78,-1_to_the_is_know,"[to, the, is, know, going, this, what, app, it...","[Good to know what's going on, Good to know wh..."
1,0,184,0_the_what_to_world,"[the, what, to, world, earth, in, on, happenin...",[Awesome to see what is going on in the world ...
2,1,162,1_app_the_and_this,"[app, the, and, this, to, it, is, of, in, on]",[We've had a number of larger earthquakes in C...
3,2,75,2_informed_me_keeps_it,"[informed, me, keeps, it, you, for, thank, the...","[Thank you for keeping me informed, Keeps me i..."


In [None]:
# Print the representative documents of every topic in full, each followed by a blank gap
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

["Good to know what's going on", 'Good to know what is going on around.', 'It’s always great to know what is going on with this'] 

['Awesome to see what is going on in the world around you!!!', "I like to know what's happening around the world.", "It's good know around the world about what is happening to Earth."] 

["We've had a number of larger earthquakes in California lately so I downloaded Earthquake Alert! and I'm enjoying it. Sometimes I can't tell if I'm imagining a quake or if it's really happening so it's nice to be able to open this app and quickly see what is going on. I set it to show only quakes near me at 3.0 or above. Otherwise it will be information overload with activity all over the world. I've recommended this app to friends and coworkers and they like it too!", "What a great app! It helps me feel more aware of any threatening activity, so I can do my best to be prepared. I check it every day and it's been up to date and is a very easy app to use. I just like knowi

In [None]:
# All rows of this subset are kept: renumber them from 0 and overwrite the
# refined topic ids with the cluster label 6
df_6 = result_subset_df.reset_index(drop=True).assign(cluster=6)
df_6

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Good background for what's going on,5,Good background for what's going on,6
1,Disaster Alert,If you were not aware that so many disasters h...,5,It is a global perspective and not for the fai...,6
2,Disaster Alert,I love this app .it tells me whats going on ar...,5,tells me whats going on around the world,6
3,Disaster Alert,I really have a need to be updated on world ev...,5,I really have a need to be updated on world ev...,6
4,Disaster Alert,Very eye opening the world just gets smaller. ...,5,the world just gets smaller,6
...,...,...,...,...,...
494,Earthquake,Thank you so much for keeping me informed and ...,5,Thank you so much for keeping me informed,6
495,Earthquake,It keeps us well informed.,5,It keeps us well informed,6
496,Earthquake,I like it because I'm always informed when the...,5,I like it because I'm always informed,6
497,Earthquake,EXCELLENT application to keep me informed.,5,keep me informed,6


In [None]:
save_dir = '../dataset/phase 4/partition/'
# Create the output folder on first run; to_excel does not create missing directories
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_6.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_6.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_6.to_excel(file_path_2, index=False)

## 4.7. Cluster 7

In [None]:
# Explicit copy so the subset is an independent frame rather than a selection of `result`
subset_df = result[result.cluster == 7].copy()

topic_info, result_subset_df = run_topic_modeling(subset_df, 'content', 50)

2024-02-15 23:17:24,747 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2024-02-15 23:17:35,766 - BERTopic - Embedding - Completed ✓
2024-02-15 23:17:35,768 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:17:38,351 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:17:38,353 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:17:38,367 - BERTopic - Cluster - Completed ✓
2024-02-15 23:17:38,374 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:17:38,402 - BERTopic - Representation - Completed ✓


Time taken for training: 13.678303718566895 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,95,-1_the_and_app_it,"[the, and, app, it, this, to, for, is, in, of]",[Better than I ever thought even picked up Bom...
1,0,66,0_the_app_to_and,"[the, app, to, and, this, of, it, is, in, on]",[I had the app on my phone so was confused whe...
2,1,53,1_gt_to_safe_and,"[gt, to, safe, and, you, it, of, the, is, good]","[Great way to be safe for family, It's not pre..."


In [None]:
# Print the representative documents of every topic in full, each followed by a blank gap
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

["Better than I ever thought even picked up Bomb Blast in North Korea it registered between 5.9 and 6 on the Richter scale and show the time when it happened I'm quite impressed/ keeping up with the events in Hawaii to the tragic event that occurred with Krakatoa this app continues to amaze me I wanted something like this app because we have now entered into the 21st century what I like to call century of the volcanic Great Awakening and there are much bigger ones to come so keep safe out there.", 'This app is one of the best ones because it tells you where the earthquake is and what is the high to low it is because I started using the app when I had big earthquakes in Idaho and felt it I knew I needed a safety net and this app was it', 'This app is wonderful and amazing. With me being able to know when an earthquake will hit in California, I feel much more safe! And now, when my family and I are at the beach, we will be checking the app for updated events and tsunamis. Thanks you!'] 


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 7 on every row.
df_7 = result_subset_df.reset_index(drop=True).assign(cluster=7)
df_7

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,You be safe the disaster,5,You be safe,7
1,Disaster Alert,Safe and Alert!!! You Rock!!,5,Safe and Alert!!! You Rock!!,7
2,Disaster Alert,Thats a no no for our identity safety! Peopl...,2,a no no for our identity safety,7
3,Disaster Alert,Great app very informative and helpful. Inform...,5,Would like it if it instructed us on the safes...,7
4,Disaster Alert,GET THIS APP I REPEAT GET THIS APP (it will pr...,5,it will protects us from 2012,7
...,...,...,...,...,...
209,Alertswiss,A very useful and important app Mostly because...,4,data protection,7
210,Alertswiss,"Covid indoctrination does not belong here, bec...",1,99% of the population is not threatened,7
211,Earthquake,This app is wonderful and amazing. With me bei...,5,safe,7
212,Earthquake,I got this app shortly after going through a 7...,5,You will never be the same after going through...,7


In [None]:
# Write the topic summary and the cluster-7 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_7.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_7.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_7.to_excel(file_path_2, index=False)

## 4.8. Cluster 8

In [None]:
# Take the rows of `result` labelled cluster 8 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 8]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:19:45,051 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2024-02-15 23:19:54,481 - BERTopic - Embedding - Completed ✓
2024-02-15 23:19:54,484 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:19:57,679 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:19:57,680 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:19:57,698 - BERTopic - Cluster - Completed ✓
2024-02-15 23:19:57,704 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:19:57,732 - BERTopic - Representation - Completed ✓


Time taken for training: 12.706021547317505 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 8.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,145,-1_the_to_and_it,"[the, to, and, it, dark, is, of, east, on, in]",[The color combinations black with Dark blue i...
1,0,109,0_to_the_and_app,"[to, the, and, app, it, features, very, use, i...",[I live in Anchorage Alaska and have been usin...
2,1,82,1_the_to_and_in,"[the, to, and, in, is, app, it, you, for, this]",[Edit. What use is this worthless app if it do...


In [None]:
# Print the Representative_Docs entry of every topic row, each followed by a blank line.
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

['The color combinations black with Dark blue is terrible. I would’ve given it five stars if you could actually read some of the menu labels at the bottom of the screen.  My eyesight is limited to one eye, and about 65 to 70% vision out of that eye. I need good contrast to be able to see clearly.', 'It has my location wrong and the only way to edit it is by expanding the map and moving it around. Very clumsy and quite unsuitable for many users. It needs the ability to type in an address. Also it appears to be impossible to delete a watch zone. Needs work on the UX to make it more intuitive. Otherwise huge potential to be a very useful tool.', "Love this app but the new dark mode support needs to have a separate on off mode many of my apps I like the dark mode but not this one. Please add a separate dark/light switch it's too dark to see the colors"] 

["I live in Anchorage Alaska and have been using your app since my first smartphone which was only about 2 or 3 yrs ago no computer or l

In [None]:
# Keep only the rows whose `cluster` value is -1 or 1, renumber them from 0,
# then overwrite `cluster` with 8 on every remaining row.
# NOTE(review): presumably `cluster` holds the per-run topic id at this point, so
# topic 0 is dropped from cluster 8 — confirm this is intended.
df_8 = (
    result_subset_df
    .loc[result_subset_df['cluster'].isin([-1, 1])]
    .reset_index(drop=True)
    .assign(cluster=8)
)
df_8

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Needs dark mode...unable to properly evaluate.,1,Needs dark mode,8
1,Disaster Alert,Current weather conditions Great layout,5,Great layout,8
2,Disaster Alert,Very ugly UI plus sometimes I stuck and I cann...,1,Very ugly UI,8
3,Disaster Alert,Widely divide buttons at upper right and botto...,4,Widely divide buttons at upper right and botto...,8
4,Disaster Alert,Love it but what does the red dots mean? Is th...,5,red dots,8
...,...,...,...,...,...
222,Earthquake,The thing I like best about this app is how ac...,5,blind or visually impaired,8
223,Earthquake,Please tell me what does the red and yellow in...,4,red and yellow indicator,8
224,Earthquake,The legend is too small to read on a smart phone.,1,too small to read on a smart phone,8
225,Earthquake,This app has worked very well for me. It updat...,5,Easy to use and has a nice layout,8


In [None]:
# Write the topic summary and the cluster-8 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_8.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_8.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_8.to_excel(file_path_2, index=False)

## 4.9. Cluster 9

In [None]:
# Take the rows of `result` labelled cluster 9 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 9]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:20:54,099 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2024-02-15 23:21:04,408 - BERTopic - Embedding - Completed ✓
2024-02-15 23:21:04,411 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:21:08,301 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:21:08,304 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:21:08,319 - BERTopic - Cluster - Completed ✓
2024-02-15 23:21:08,324 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:21:08,351 - BERTopic - Representation - Completed ✓


Time taken for training: 14.281160116195679 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 9.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,246,-1_the_to_it_and,"[the, to, it, and, app, settings, you, is, but...",[But the GUI is not really easy to use and doe...


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 9 on every row.
df_9 = result_subset_df.reset_index(drop=True).assign(cluster=9)
df_9

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Doesnt save my settings and tries to load way ...,3,Doesnt save my settings and tries to load way ...,9
1,Disaster Alert,Wish it would save my layers so I dont have to...,3,Wish it would save my layers,9
2,Disaster Alert,A fire in Las Vegas has been raging for a week...,1,its not showing,9
3,Disaster Alert,Was great but the new update they fprgot the m...,1,No menu,9
4,Disaster Alert,This is a great app unfortunately it only half...,1,only half fills the display,9
...,...,...,...,...,...
241,Earthquake,"So far, best info I’ve seen but in order to us...",1,best info I’ve seen but in order to use settings,9
242,Earthquake,Always have to reset settings,1,Always have to reset settings,9
243,Earthquake,Setting options could be better. There seem to...,3,Setting options could be better,9
244,Earthquake,"Cannot change settings, only if you pay for th...",1,Cannot change settings,9


In [None]:
# Write the topic summary and the cluster-9 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_9.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_9.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_9.to_excel(file_path_2, index=False)

## 4.10. Cluster 10

In [None]:
# Take the rows of `result` labelled cluster 10 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 10]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:21:39,725 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

2024-02-15 23:21:53,984 - BERTopic - Embedding - Completed ✓
2024-02-15 23:21:53,992 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:21:57,139 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:21:57,142 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:21:57,174 - BERTopic - Cluster - Completed ✓
2024-02-15 23:21:57,180 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:21:57,221 - BERTopic - Representation - Completed ✓


Time taken for training: 17.533490419387817 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 10.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,142,-1_gt_the_to_it,"[gt, the, to, it, and, slow, is, of, be, but]",[Love the app would give it a 5 star except th...
1,0,130,0_it_to_app_slow,"[it, to, app, slow, and, the, is, this, but, v...",[A small subset of the original. Very slow. No...
2,1,91,1_the_to_and_app,"[the, to, and, app, is, it, of, this, in, on]",[Have also noticed an impact on battery life s...


In [None]:
# Print the Representative_Docs entry of every topic row, each followed by a blank line.
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

['Love the app would give it a 5 star except the app is oh so very very slow to load the maps its takes around five minutes to get a map to focus then another 4 or 5 minutes each time i zoom in most of the time i just give up. Great on reporting earthquakes and volvanos tho. Fix the issue i have and i well gradly rate it a 10 if i could.', "I receive alerts but when I tap on the notification and then the app comes  up it keeps on showing 0 Alerts. It seems the app is very slow to respond  upon receiving the alert. It's annoying that I have to wait for few minutes  just to view the full alert.", "It's really important because it's already in the ground, and it's going to be so fast! &gt;&gt;It's going to be so fast! &gt;&gt;It's going to be great! &gt;&gt;It's going to be great! &gt;&gt;It's going to be great! &gt;&gt;It's going to be great! &gt;&gt;It's going to be great! &gt;&gt;It's going to be great! &gt;&gt;It's going to be great! &gt;&gt;&gt;&gt;&gt; It's going to be great! &gt;&g

In [None]:
# Renumber the rows from 0 and set the `cluster` column to 10 on every row.
df_10 = result_subset_df.reset_index(drop=True).assign(cluster=10)
df_10

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,To laggy and glitchy for any real use potentia...,2,laggy and glitchy,10
1,Disaster Alert,2022-01-02 . Old Version 3.2 Faster,3,Faster,10
2,Disaster Alert,I like this app. It does what it needs to. Not...,5,Nothing flashy but to the point,10
3,Disaster Alert,Its kinda slow when I move the map and it take...,4,Its kinda slow,10
4,Disaster Alert,A little clunky but ok,3,A little clunky,10
...,...,...,...,...,...
358,Earthquake,The only problem I have is there is at least a...,4,there is at least a 10 minute lag time,10
359,Earthquake,App needs to be faster in showing the earthqua...,3,App needs to be faster,10
360,Earthquake,It doesn't update the earthquake fast. It's to...,2,It's too slow,10
361,Earthquake,We felt quake in Kathmandu but got latest upda...,2,Too slow,10


In [None]:
# Write the topic summary and the cluster-10 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_10.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_10.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_10.to_excel(file_path_2, index=False)

## 4.11. Cluster 11

In [None]:
# Take the rows of `result` labelled cluster 11 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 11]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:22:58,354 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2024-02-15 23:23:06,369 - BERTopic - Embedding - Completed ✓
2024-02-15 23:23:06,374 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:23:09,081 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:23:09,087 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:23:09,109 - BERTopic - Cluster - Completed ✓
2024-02-15 23:23:09,118 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:23:09,145 - BERTopic - Representation - Completed ✓


Time taken for training: 10.817887306213379 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 11.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,125,-1_it_the_to_app,"[it, the, to, app, and, on, open, but, not, when]",[Love the app and info it provides but over pa...
1,0,100,0_the_crashes_app_it,"[the, crashes, app, it, to, and, is, for, but,...",[I have had this app for a while and I love th...
2,1,51,1_it_open_to_and,"[it, open, to, and, app, now, keeps, this, clo...",[This app used to be good but now it doesn't e...


In [None]:
# Print the Representative_Docs entry of every topic row, each followed by a blank line.
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

["Love the app and info it provides but over past few days it will instantly close when opened. I came here to see if there was an update but I seem to have the latest version. Not sure why it's doing this when it worked so well for such a long time.", "I used to be a bibolic notification, but I was going to make it sound when I set it up, but it's going to be a small shake, and it's going to happen in the end, even when I'm sleeping at night, so I'm going to have to shut it down. But I want you to improve the settings for notifications, and it's easy to use.", 'When I get a notification of an event it does not tell me the location. When I expand that notification (clicking only once) I still don’t get it. Only when I fully open it up do I get a location. I have 5 locations set and don’t get alerts on all of them. I am very appreciative of being able to check on the fuzzy safety of my family and friends, but with all the bizarre weather we have been having, these alerts are mind blowin

In [None]:
# Renumber the rows from 0 and set the `cluster` column to 11 on every row.
df_11 = result_subset_df.reset_index(drop=True).assign(cluster=11)
df_11

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Good app idk what people are talking about. It...,5,It hasn't crashed on me or acted dumb,11
1,Disaster Alert,Pretty low grade. Hard to navigate through. Sl...,1,Slow and crashes,11
2,Disaster Alert,I use Oppo F1s and it's so long to open. Maybe...,1,it's so long to open,11
3,Disaster Alert,was a good app till the update now. doesnt wan...,1,doesnt want to open even,11
4,Disaster Alert,Same complaints as everyone else you open a di...,1,stops/closes,11
...,...,...,...,...,...
271,Emergency: Severe Weather App,Crashes when opening on iPhone 6 & iPad mini,1,Crashes,11
272,National evacuation center guide,It doesn't start all the time.,1,It doesn't start all the time,11
273,Alertswiss,The app is great and will save lives sooner or...,3,I will no longer enter it,11
274,Alertswiss,The app crashes if you want to add a relative ...,2,crashes,11


In [None]:
# Write the topic summary and the cluster-11 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_11.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_11.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_11.to_excel(file_path_2, index=False)

## 4.12. Cluster 12

In [None]:
# Take the rows of `result` labelled cluster 12 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 12]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:23:57,529 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2024-02-15 23:24:03,644 - BERTopic - Embedding - Completed ✓
2024-02-15 23:24:03,647 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:24:07,229 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:24:07,232 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:24:07,244 - BERTopic - Cluster - Completed ✓
2024-02-15 23:24:07,251 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:24:07,273 - BERTopic - Representation - Completed ✓


Time taken for training: 9.762619972229004 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 12.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,215,-1_it_the_to_work,"[it, the, to, work, app, and, not, this, in, you]",[Last year I got a ticket for a parking ban du...


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 12 on every row.
df_12 = result_subset_df.reset_index(drop=True).assign(cluster=12)
df_12

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Its not working TRUMP'as a fake,5,Its not working TRUMP,12
1,Disaster Alert,Dosn't work most of the time,4,Dosn't work,12
2,Earthquake Alert!,I have tried them all and this is the best one...,5,it didn't work,12
3,Earthquake Alert!,Informative current concise. The map link is v...,5,Links fixed,12
4,Earthquake Alert!,I had this app due to the ridgecrest earthquak...,5,didn't work,12
...,...,...,...,...,...
210,Alertswiss,The push messages don't work. App is usedless...,1,don't work,12
211,Alertswiss,"‹Allow location ALWAYS›, even when not using t...",1,does not work at all,12
212,Alertswiss,"The app would have to be set up, but it doesn'...",1,it doesn't work,12
213,Earthquake,I just downgraded from 2 stars to 1. I would g...,1,The customer support link is not functional,12


In [None]:
# Write the topic summary and the cluster-12 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_12.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_12.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_12.to_excel(file_path_2, index=False)

## 4.13. Cluster 13

In [None]:
# Take the rows of `result` labelled cluster 13 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 13]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:24:34,938 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2024-02-15 23:24:40,896 - BERTopic - Embedding - Completed ✓
2024-02-15 23:24:40,901 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:24:45,343 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:24:45,351 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:24:45,367 - BERTopic - Cluster - Completed ✓
2024-02-15 23:24:45,375 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:24:45,395 - BERTopic - Representation - Completed ✓


Time taken for training: 10.477304935455322 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 13.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,155,-1_the_to_it_and,"[the, to, it, and, app, in, my, not, can, this]",[I just signed up fro code red in my area onli...


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 13 on every row.
df_13 = result_subset_df.reset_index(drop=True).assign(cluster=13)
df_13

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Someone stole my email address for this game p...,1,Someone stole my email address,13
1,Disaster Alert,Login option is not working at all after the n...,1,Login option is not working,13
2,Disaster Alert,Freezing up. Not connecting. Cant get in. And ...,1,Cant get in. And may be posting false stuff,13
3,Disaster Alert,Well that was a bust.. wouldnt let me create a...,1,wouldnt let me create an account,13
4,Disaster Alert,honestly what a great purpose with such poor a...,2,login is lame,13
...,...,...,...,...,...
150,Hazards Red Cross,I need to change my email address and I can't ...,1,I need to change my email address and I can't,13
151,Hazards Red Cross,This is a great idea. However I've been trying...,3,it won't let me select the number I want to se...,13
152,National evacuation center guide,My family's wife has tried to use it between t...,2,couldn't be registered there,13
153,National evacuation center guide,"As the title implies, we tried to test the Osa...",1,I couldn't register it,13


In [None]:
# Write the topic summary and the cluster-13 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_13.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_13.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_13.to_excel(file_path_2, index=False)

## 4.14. Cluster 14

In [None]:
# Take the rows of `result` labelled cluster 14 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 14]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:25:05,484 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2024-02-15 23:25:14,734 - BERTopic - Embedding - Completed ✓
2024-02-15 23:25:14,739 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:25:18,724 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:25:18,726 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:25:18,740 - BERTopic - Cluster - Completed ✓
2024-02-15 23:25:18,748 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:25:18,768 - BERTopic - Representation - Completed ✓


Time taken for training: 13.311889410018921 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 14.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,136,-1_gt_it_the_battery,"[gt, it, the, battery, to, and, is, you, of, app]",[Has given me time to safely evacuate from fir...


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 14 on every row.
df_14 = result_subset_df.reset_index(drop=True).assign(cluster=14)
df_14

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Use up a lot of battery from the phone.,3,Use up a lot of battery from the phone,14
1,Disaster Alert,This app opens constantly and eats your battery,1,eats your battery,14
2,Disaster Alert,Fantastic app but a massive battery drain. Hav...,3,massive battery drain,14
3,Earthquake Alert!,Would be great if I could keep it on but it us...,1,it uses so much battery power,14
4,Earthquake Alert!,Great app doesn't take data battery etc and as...,5,doesn't take data battery,14
...,...,...,...,...,...
131,Emergency: Severe Weather App,We live in a very rural area and this app work...,4,easy on the battery,14
132,Emergency: Severe Weather App,Constant dings killing my battery how do I uni...,1,Constant dings killing my battery,14
133,National evacuation center guide,Please add permission to this App only. I want...,1,I want to avoid the slightest loss of batteries,14
134,National evacuation center guide,"I'm sure App is a practical and effective App,...",4,consumption power,14


In [None]:
# Write the topic summary and the cluster-14 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_14.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_14.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_14.to_excel(file_path_2, index=False)

## 4.15. Cluster 15

In [None]:
# Take the rows of `result` labelled cluster 15 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 15]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:25:54,212 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2024-02-15 23:26:02,513 - BERTopic - Embedding - Completed ✓
2024-02-15 23:26:02,517 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:26:04,982 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:26:04,984 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:26:04,995 - BERTopic - Cluster - Completed ✓
2024-02-15 23:26:05,002 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:26:05,022 - BERTopic - Representation - Completed ✓


Time taken for training: 10.827600717544556 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 15.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,140,-1_the_it_to_app,"[the, it, to, app, gt, is, and, server, of, you]","[I bought Android 5.0 terminals, so I re-assem..."


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 15 on every row.
df_15 = result_subset_df.reset_index(drop=True).assign(cluster=15)
df_15

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,All it shows is the world map with flooding fi...,1,not really worth your time.... Doesn't work on...,15
1,Earthquake Alert!,Alerts from alerter addon seems to have stoppe...,4,It wont connect,15
2,Earthquake Alert!,Downloaded the app. Great app but your free al...,5,doesn't connect to it's server,15
3,Earthquake Alert!,Very useful and informative Now stopped sendin...,3,Won't connect with the server,15
4,Earthquake Alert!,When wi fi wasnout received nitifications with...,5,nitifications with LTE,15
...,...,...,...,...,...
135,Hazards Near Me NSW,Older iPads and iphones can't use it as it can...,1,it can't connect to RFS server,15
136,Emergency: Severe Weather App,Tried to save my home for a permanent location...,1,PLEASE CHECK YOUR NETWORK CONNECTION,15
137,Emergency: Severe Weather App,"can't even allow location,worst app ever says ...",1,I've got no internet or wifi,15
138,Emergency: Severe Weather App,"App doesn't work anymore, keeps saying no inte...",1,no internet connection or can't find location,15


In [None]:
# Write the topic summary and the cluster-15 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_15.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_15.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_15.to_excel(file_path_2, index=False)

## 4.16. Cluster 16

In [None]:
# Take the rows of `result` labelled cluster 16 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 16]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:27:13,552 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

2024-02-15 23:27:28,169 - BERTopic - Embedding - Completed ✓
2024-02-15 23:27:28,172 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:27:32,921 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:27:32,923 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:27:32,952 - BERTopic - Cluster - Completed ✓
2024-02-15 23:27:32,959 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:27:33,000 - BERTopic - Representation - Completed ✓


Time taken for training: 19.47483229637146 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 16.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,120,-1_the_to_app_in,"[the, to, app, in, satellite, it, is, map, and...",[Needs a link to open the RFS website. The app...
1,0,361,0_the_map_to_and,"[the, map, to, and, it, maps, is, app, of, on]",[Sorry This app has no data for the maps. The ...
2,1,86,1_the_to_and_in,"[the, to, and, in, app, it, of, earthquake, ma...",[I am updating my review. I live in Ridgecrest...


In [None]:
# Print the Representation (keyword list) of every topic row, each followed by a blank line.
# NOTE(review): the equivalent cells for other clusters print Representative_Docs — confirm
# that showing the keywords here is intentional.
for rep in topic_info['Representation']:
    print(rep, '\n')

['the', 'to', 'app', 'in', 'satellite', 'it', 'is', 'map', 'and', 'of'] 

['the', 'map', 'to', 'and', 'it', 'maps', 'is', 'app', 'of', 'on'] 

['the', 'to', 'and', 'in', 'app', 'it', 'of', 'earthquake', 'map', 'this'] 



In [None]:
# Renumber the rows from 0 and set the `cluster` column to 16 on every row.
df_16 = result_subset_df.reset_index(drop=True).assign(cluster=16)
df_16

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,The maps shown in alerts are totally worthless...,2,The maps shown in alerts are totally worthless,16
1,Disaster Alert,Really good alert to me alot better than anoth...,5,we miss out where's bout a map didn't tell me ...,16
2,Disaster Alert,When there is an alert I would like to be able...,4,I would like to be able to go right to the tex...,16
3,Disaster Alert,I Look world map +good ok.,2,I Look world map +good ok,16
4,Disaster Alert,Absolutely amazing graphics when zooming.,4,Absolutely amazing graphics when zooming.,16
...,...,...,...,...,...
562,Earthquake,I am an 82-year-old science teacher. I began u...,5,simple repeats of the maps no statistics,16
563,Earthquake,Gets its data from multiple sources. More deta...,4,More detailed map,16
564,Earthquake,Very sound app. It could use a bit more detail...,4,It could use a bit more detail on maps and in ...,16
565,Earthquake,My main complaint is that there isn't satellit...,3,there isn't satellite map or ocean bathymetry...,16


In [None]:
# Write the topic summary and the cluster-16 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_16.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_16.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_16.to_excel(file_path_2, index=False)

## 4.17. Cluster 17

In [None]:
# Take the rows of `result` labelled cluster 17 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 17]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:28:14,551 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2024-02-15 23:28:23,409 - BERTopic - Embedding - Completed ✓
2024-02-15 23:28:23,413 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:28:25,882 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:28:25,884 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:28:25,897 - BERTopic - Cluster - Completed ✓
2024-02-15 23:28:25,904 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:28:25,925 - BERTopic - Representation - Completed ✓


Time taken for training: 11.401567935943604 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 17.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,139,-1_the_to_watch_and,"[the, to, watch, and, zone, it, app, in, for, is]",[This app now covers all the warnings I am int...


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 17 on every row.
df_17 = result_subset_df.reset_index(drop=True).assign(cluster=17)
df_17

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,I LOVE YOU DISASTER ALERT! Thank you for havin...,5,Good to be not the only one to keep on the watch,17
1,Disaster Alert,Doesn't have a zone any where near where I live,1,Doesn't have a zone any where near where I live,17
2,Earthquake Alert!,Good app seems to work well on my phone. Only ...,5,change the time to my current time zone,17
3,Earthquake Alert!,Knowing someone is on watch makes my feel good,4,Knowing someone is on watch makes my feel good,17
4,Earthquake Alert!,Needs time display in local zone in addition t...,3,Needs time display in local zone in addition t...,17
...,...,...,...,...,...
134,Hazards Near Me NSW,Almost useless you cannot create watch zones. ...,1,unable to create watch zones over and over,17
135,Emergency: Severe Weather App,Not nearly as good as Tornado was. My phone is...,2,My phone is blowing up with heat warnings,17
136,Emergency: Severe Weather App,I’ve gotten phone calls at 2am warning of exce...,4,excessive heat index,17
137,Emergency: Severe Weather App,I got this because I wanted to be notified abo...,1,excessive heat warnings,17


In [None]:
# Write the topic summary and the cluster-17 frame to Excel workbooks.
save_dir = '../dataset/phase 4/partition/'
# Create the output folder first so a fresh checkout does not fail inside to_excel.
os.makedirs(save_dir, exist_ok=True)

file_path_1 = os.path.join(save_dir, 'topic_info_cluster_17.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_17.xlsx')

# index=False keeps the row index out of the sheets.
topic_info.to_excel(file_path_1, index=False)
df_17.to_excel(file_path_2, index=False)

## 4.18. Cluster 18

In [None]:
# Take the rows of `result` labelled cluster 18 and run topic modeling on their 'content' column.
subset_df = result.loc[result['cluster'] == 18]

topic_info, result_subset_df = run_topic_modeling(
    df=subset_df,
    column='content',
    min_topic_size=50,
)

2024-02-15 23:29:10,616 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

2024-02-15 23:29:29,112 - BERTopic - Embedding - Completed ✓
2024-02-15 23:29:29,118 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:29:36,290 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:29:36,292 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:29:36,319 - BERTopic - Cluster - Completed ✓
2024-02-15 23:29:36,327 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:29:36,364 - BERTopic - Representation - Completed ✓


Time taken for training: 25.768198490142822 seconds


In [None]:
# Display the topic summary table returned by run_topic_modeling for cluster 18.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,536,-1_the_to_it_and,"[the, to, it, and, location, app, my, you, of,...",[I would give it a zero but that's not an opti...


In [None]:
# Renumber the rows from 0 and set the `cluster` column to 18 on every row.
df_18 = result_subset_df.reset_index(drop=True).assign(cluster=18)
df_18

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Too random... locations are not relevant,2,Too random... locations are not relevant,18
1,Disaster Alert,I wish I could pick specific countries to trac...,3,I wish I could pick specific countries,18
2,Disaster Alert,Disaster Alert - I wish I could set specific l...,4,I wish I could set specific locations,18
3,Disaster Alert,I kept getting notifications about every disas...,2,It would be better if the app used my location,18
4,Disaster Alert,In the hazard last and on the notification ban...,3,you can't always see the full name of the loca...,18
...,...,...,...,...,...
531,Earthquake,I would like to be able to select all the Amer...,3,I would like to be able to select all the Amer...,18
532,Earthquake,Pretty good but the Pro version for $1.99 is n...,3,Only monitors and pushes 2 locations,18
533,Earthquake,I have the payment version. “Alerts” arrive up...,1,it seems that Central America does not exist,18
534,Earthquake,"Informative. Great user interface, color and l...",5,allow the country and Island text names to be ...,18


In [None]:
# Export the cluster-18 outputs: the topic summary table and the refined review rows.
save_dir = '../dataset/phase 4/partition/'
# to_excel does not create missing folders, so make sure the target directory exists.
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_18.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_18.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_18.to_excel(file_path_2, index=False)

## 4.19. Cluster 19

In [None]:
# Cluster 19: select its reviews from `result` and fit a topic model on the review text.
subset_df = result.loc[result['cluster'] == 19]

topic_info, result_subset_df = run_topic_modeling(
    subset_df, column='content', min_topic_size=50
)

2024-02-15 23:30:38,573 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2024-02-15 23:31:15,194 - BERTopic - Embedding - Completed ✓
2024-02-15 23:31:15,198 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:31:18,950 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:31:18,953 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:31:18,984 - BERTopic - Cluster - Completed ✓
2024-02-15 23:31:18,991 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:31:19,051 - BERTopic - Representation - Completed ✓


Time taken for training: 40.51512432098389 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,24,-1_একট_با_زلزله_میشه,"[একট, با, زلزله, میشه, ولی, این, برنامه, اللہ,...",[৯৯৯ এপপস এর apk লিংটা দেওয়া যাবে ভাইয়া আমি এ...
1,0,260,0_the_to_it_and,"[the, to, it, and, this, is, for, in, app, of]",[Hi (*to Android dev*) I see a lot of people c...
2,1,215,1_de_la_en_que,"[de, la, en, que, muy, me, el, los, aplicación...",[Es excelente y me mantiene informada de lo qu...
3,2,183,2_die_nicht_ich_und,"[die, nicht, ich, und, app, der, ist, das, es,...",[Nina ist bestimmt ganz toll wenn es mal brenn...


In [None]:
# Second pass for cluster 19: keep only the rows whose `cluster` value is 0 (presumably the
# topic id assigned by the first pass -- verify against run_topic_modeling) and model them again.
# NOTE: this cell overwrites `topic_info` and `result_subset_df`, so re-running it on its own
# filters the output of the second pass rather than the first.
subset_df = (
    result_subset_df
    .loc[result_subset_df['cluster'].isin([0])]
    .reset_index(drop=True)
)
topic_info, result_subset_df = run_topic_modeling(
    subset_df, column='content', min_topic_size=50
)

2024-02-15 23:32:10,105 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2024-02-15 23:32:23,204 - BERTopic - Embedding - Completed ✓
2024-02-15 23:32:23,208 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:32:26,823 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:32:26,831 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:32:26,851 - BERTopic - Cluster - Completed ✓
2024-02-15 23:32:26,863 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:32:26,909 - BERTopic - Representation - Completed ✓


Time taken for training: 16.838040828704834 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,45,-1_mm_persian_تطبيق_mmmmmm,"[mm, persian, تطبيق, mmmmmm, app, dagan, aap, ...",[hy uhg x c.3 is a v vhhy y big v v GB nnn v. ...
1,0,145,0_the_this_app_to,"[the, this, app, to, and, for, it, is, of, in]",[The app is great EXCEPT for the fact I cannot...
2,1,70,1_the_in_language_english,"[the, in, language, english, to, it, app, germ...",[I thing the whole of Switzerland. If yfranz a...


In [None]:
# Keep the rows whose `cluster` value is 1 in the latest modelling result, renumber them
# from 0, and stamp every row with the final cluster id (19).
df_19 = (
    result_subset_df
    .loc[result_subset_df['cluster'].isin([1])]
    .reset_index(drop=True)
    .assign(cluster=19)
)
df_19

Unnamed: 0,app,content,score,content_short,cluster
0,Earthquake Alert!,It's a great app.. However it would be nice to...,5,spelling,19
1,Earthquake Alert!,Please input local language on the map .,4,local language,19
2,Earthquake Alert!,How do i change time zone? I want it to be PST,3,PST,19
3,FEMA,Thank you. We'll written instructions. Needed ...,5,Easy to translate,19
4,FEMA,I opened the app and it is all in Spanish and ...,2,we don't speak Spanish,19
...,...,...,...,...,...
65,Alertswiss,"Currently, I like the app very much, but you s...",4,change the language,19
66,Alertswiss,"All in all, a very good app, but would be grea...",5,"a very good app, but would be great if you cou...",19
67,Alertswiss,Please also use the app in German and not only...,2,Please also use the app in German and not only...,19
68,Alertswiss,"If the device language is set to français, the...",2,add an app-language setting,19


In [None]:
# Export the cluster-19 outputs: the current topic summary table and the refined review rows.
save_dir = '../dataset/phase 4/partition/'
# to_excel does not create missing folders, so make sure the target directory exists.
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_19.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_19.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_19.to_excel(file_path_2, index=False)

## 4.20. Cluster 20

In [None]:
# Cluster 20: select its reviews from `result` and fit a topic model on the review text.
subset_df = result.loc[result['cluster'] == 20]

topic_info, result_subset_df = run_topic_modeling(
    subset_df, column='content', min_topic_size=50
)

2024-02-15 23:34:09,699 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2024-02-15 23:34:23,544 - BERTopic - Embedding - Completed ✓
2024-02-15 23:34:23,550 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:34:27,122 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:34:27,125 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:34:27,145 - BERTopic - Cluster - Completed ✓
2024-02-15 23:34:27,153 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:34:27,183 - BERTopic - Representation - Completed ✓


Time taken for training: 17.537178993225098 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,75,-1_the_it_to_fix,"[the, it, to, fix, of, have, but, and, you, pl...",[I haven't had a problem with this app before ...
1,0,114,0_it_app_the_fix,"[it, app, the, fix, this, and, to, for, please...",[This used to be an awesome app. Now all it do...
2,1,96,1_to_the_in_be,"[to, the, in, be, you, and, it, this, app, where]",[When I go and check the alerts page there is ...


In [None]:
# Print each topic's representative documents in full (the table view truncates them),
# leaving a blank line between topics.
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

["I haven't had a problem with this app before & I've been very happy with it (5) but the last 5 days I haven't been able to scroll down in watch zone edit settings or refresh. Pls fix the problem. I can c a cpl other comments have this same issue. I'll re rate the app once it's fixed & usable again. Updated Review 13/11 All issues I was experiencing have been fixed. Keep up the great work. Re rating the app to 5", "Has great information & updates very quickly.  It has an out of memory problem  & defaults the settings & filter.Needs to be able to get rid of the information on the map when you tap the map. It goes away but as soon as you move the map it pops up again. Kind of irritating. I HAVE a GALAXY S8+ & A Galaxy Tab 4. Both have out of memory fault. Both have 32Gb of menory. I shouldn't have this problem. Please find out why & let us know how to fix it or how you will fix it.", "It's a pleasure to have a quick announcement, but it's a pleasure to be notified from the earthquake le

In [None]:
# Renumber the rows from 0 and stamp every row with the final cluster id (20).
df_20 = result_subset_df.reset_index(drop=True).assign(cluster=20)
df_20

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,7-6-2022 You have not posted one not one earth...,1,fix,20
1,Disaster Alert,Always crashes when I open a disaster I subscr...,3,Can you please fix this issue,20
2,Disaster Alert,Since the last update the app crashes when you...,2,Please fix,20
3,Disaster Alert,Excellent app nice graphics and design. Good c...,5,promised a fix,20
4,Disaster Alert,Love this app. Lets u know about every environ...,4,resolved my issue,20
...,...,...,...,...,...
280,Alertswiss,"In today's siren test, the app did not even wo...",2,Please ask for a quick solution to solve the p...,20
281,Alertswiss,I'm in Coppet (Vaud) and when I post the notif...,4,A problem,20
282,Alertswiss,Would have expected the siren test to be annou...,1,Current issues,20
283,Earthquake,I have had no issues with the program,5,I have had no issues,20


In [None]:
# Export the cluster-20 outputs: the topic summary table and the refined review rows.
save_dir = '../dataset/phase 4/partition/'
# to_excel does not create missing folders, so make sure the target directory exists.
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_20.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_20.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_20.to_excel(file_path_2, index=False)

## 4.21. Cluster 21

In [None]:
# Cluster 21: select its reviews from `result` and fit a topic model on the review text.
subset_df = result.loc[result['cluster'] == 21]

topic_info, result_subset_df = run_topic_modeling(
    subset_df, column='content', min_topic_size=50
)

2024-02-15 23:36:54,076 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

2024-02-15 23:37:12,997 - BERTopic - Embedding - Completed ✓
2024-02-15 23:37:13,001 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:37:16,229 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:37:16,231 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:37:16,258 - BERTopic - Cluster - Completed ✓
2024-02-15 23:37:16,265 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:37:16,302 - BERTopic - Representation - Completed ✓


Time taken for training: 22.266236782073975 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,110,-1_the_it_to_and,"[the, it, to, and, is, for, app, this, of, you]",[I paid for the Pro... and nothing special abo...
1,0,310,0_the_ads_app_to,"[the, ads, app, to, it, and, this, for, is, but]",[I used to have this app it was excellant and ...
2,1,122,1_the_it_to_and,"[the, it, to, and, this, app, you, of, for, in]",[I haven't received notice since I've set it t...


In [None]:
# Print each topic's representative documents in full (the table view truncates them),
# leaving a blank line between topics.
for rep in topic_info['Representative_Docs']:
    print(rep, '\n')

['I paid for the Pro... and nothing special about it. The app doesn’t warn you right away... it’s got like a 10-15 min wait time before it pops up.  Living in Alaska, this app is worthless to me. I use another app, that was 100% free. And literally let’s me know when we have an earthquake, at the time of the earthquake.   Don’t waste your money, because I honestly feel like I was robbed out of my $2.99. This app is seriously worthless.', "This app is great because it shows the track of storms and the date and time it is predicted to arrive at each location. It also gives the percentage of probability that each storm will develop into a hurricane. It's helpful that you can choose to receive alerts for storms in either the Atlantic or the Pacific or both. The app is reliable stable and the ads are unobtrusive.", "I had this app a few years ago and let it expire because it was pricey according to others that do the same thing. I decided to reinstall it today and it says it is free and you

In [None]:
# Renumber the rows from 0 and stamp every row with the final cluster id (21).
df_21 = result_subset_df.reset_index(drop=True).assign(cluster=21)
df_21

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,there's nothing there. just a map and a blank ...,1,waste of time,21
1,Disaster Alert,Excellent best to the point very pro,5,pro,21
2,Disaster Alert,After last update app immediately shuts down i...,1,Worthless utterly worthless,21
3,Disaster Alert,Not Happy With The App. Waste of Money.,1,Waste of Money,21
4,Disaster Alert,I used to love this app since I live and work ...,1,they now want you to pay,21
...,...,...,...,...,...
537,Earthquake,They charge without asking! And I find it expe...,1,They charge without asking,21
538,Earthquake,Updated and surprised I was thanked for the pu...,5,surprised I was thanked for the purchase,21
539,Earthquake,The free or free version has any appeal or som...,2,if in the free version you have nothing nice,21
540,Earthquake,I’s Mmm I’m mmhj im so happy I could do do it ...,5,pop o out or mtg t up,21


In [None]:
# Export the cluster-21 outputs: the topic summary table and the refined review rows.
save_dir = '../dataset/phase 4/partition/'
# to_excel does not create missing folders, so make sure the target directory exists.
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_21.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_21.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_21.to_excel(file_path_2, index=False)

## 4.22. Cluster 22

In [None]:
# Cluster 22: select its reviews from `result` and fit a topic model on the review text.
subset_df = result.loc[result['cluster'] == 22]

topic_info, result_subset_df = run_topic_modeling(
    subset_df, column='content', min_topic_size=50
)

2024-02-15 23:39:31,287 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-02-15 23:39:41,414 - BERTopic - Embedding - Completed ✓
2024-02-15 23:39:41,422 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-15 23:39:43,868 - BERTopic - Dimensionality - Completed ✓
2024-02-15 23:39:43,874 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-15 23:39:43,889 - BERTopic - Cluster - Completed ✓
2024-02-15 23:39:43,900 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-15 23:39:43,931 - BERTopic - Representation - Completed ✓


Time taken for training: 12.66842794418335 seconds


In [None]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,105,-1_the_and_app_it,"[the, and, app, it, to, reliable, is, not, thi...",[The app is very reliable and trustworthy. I d...


In [None]:
# Renumber the rows from 0 and stamp every row with the final cluster id (22).
df_22 = result_subset_df.reset_index(drop=True).assign(cluster=22)
df_22

Unnamed: 0,app,content,score,content_short,cluster
0,Earthquake Alert!,Excellent app. Used this for years and find it...,5,reliable,22
1,Earthquake Alert!,... doesn't show earth quakes in santa cruze d...,1,it seems to be more reliable,22
2,Earthquake Alert!,Simple but tons of features. Quake info is up ...,5,RELIABLE AND HELPFUL,22
3,Earthquake Alert!,IT HAS BEEN DEPENDABLE AND HAS GIVEN ME GOOD S...,5,IT HAS BEEN DEPENDABLE,22
4,Earthquake Alert!,Easy to use. Seems pretty reliable.,5,Seems pretty reliable,22
...,...,...,...,...,...
100,National evacuation center guide,It depends on the reliability of specifying a ...,2,reliability,22
101,Earthquake,I’ve been using this site for personal researc...,4,reliable and most useful,22
102,Earthquake,I thought I liked this app. But just now I rea...,1,Unreliable,22
103,Earthquake,Living in an area that gets a few earthquakes ...,5,consistent and reliable,22


In [None]:
# Export the cluster-22 outputs: the topic summary table and the refined review rows.
save_dir = '../dataset/phase 4/partition/'
# to_excel does not create missing folders, so make sure the target directory exists.
os.makedirs(save_dir, exist_ok=True)
file_path_1 = os.path.join(save_dir, 'topic_info_cluster_22.xlsx')
file_path_2 = os.path.join(save_dir, 'result_cluster_22.xlsx')

topic_info.to_excel(file_path_1, index=False)
df_22.to_excel(file_path_2, index=False)

# 5. Combine All Refined Cluster Datasets

In [None]:
import pandas as pd

# Folder holding the per-cluster result workbooks written by the save cells above
base_path = '../dataset/phase 4/partition/'

# One workbook per refined cluster; cluster ids run from 1 to 22 inclusive
file_paths = [f'{base_path}result_cluster_{i}.xlsx' for i in range(1, 23)]

# Read every workbook. A comprehension keeps its loop variable local, so the notebook-level
# names `df` and `file_path` are not overwritten as a side effect of this cell.
dfs = [pd.read_excel(path) for path in file_paths]

# Stack all clusters into one DataFrame with a fresh 0..n-1 index
df_combined = pd.concat(dfs, ignore_index=True)


In [None]:
df_combined

Unnamed: 0,app,content,score,content_short,cluster
0,Disaster Alert,Working as a Public Health Nurse I get to resp...,5,I may have lost and/or procured gadgets to aid...,1
1,Disaster Alert,It's a good app but not so useful. I was expec...,3,not so useful,1
2,Disaster Alert,I love natural disasters they're so fascinatin...,5,autism,1
3,Disaster Alert,Thanks to the creator of this app..it was very...,5,..,1
4,Disaster Alert,Very accurate and very useful. More power God ...,5,Very accurate and very useful,1
...,...,...,...,...,...
14957,National evacuation center guide,It depends on the reliability of specifying a ...,2,reliability,22
14958,Earthquake,I’ve been using this site for personal researc...,4,reliable and most useful,22
14959,Earthquake,I thought I liked this app. But just now I rea...,1,Unreliable,22
14960,Earthquake,Living in an area that gets a few earthquakes ...,5,consistent and reliable,22


# 6. Sentence Embedding

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import os
import time

# Load the "all-MiniLM-L6-v2" model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Review texts to embed, taken in df_combined row order so that
# embedding row i corresponds to df_combined row i
reviews = df_combined['content'].tolist()

# Number of reviews encoded (and timed) per step
batch_size = 500

# Per-batch embedding arrays, stacked into one array after the loop
all_embeddings = []

# Process data in batches
for i in range(0, len(reviews), batch_size):
    batch = reviews[i:i+batch_size]
    start_time = time.time()
    batch_embeddings = model.encode(batch, show_progress_bar=False)
    end_time = time.time()

    # Append the embeddings of the current batch
    all_embeddings.append(batch_embeddings)

    print(f"Batch {i//batch_size + 1} processed in {end_time - start_time:.2f} seconds.")

# Concatenate all batch embeddings into a single array (one row per review)
all_embeddings = np.vstack(all_embeddings)

# Save the embeddings to a .npy file.
# Note: despite its name, `save_dir` is the full file path, not a directory.
save_dir = '../dataset/phase 4/review_embeddings.npy'
# np.save does not create missing folders, so make sure the parent directory exists.
os.makedirs(os.path.dirname(save_dir), exist_ok=True)
np.save(save_dir, all_embeddings)


Batch 1 processed in 6.03 seconds.
Batch 2 processed in 3.43 seconds.
Batch 3 processed in 12.02 seconds.
Batch 4 processed in 14.02 seconds.
Batch 5 processed in 14.85 seconds.
Batch 6 processed in 12.12 seconds.
Batch 7 processed in 7.29 seconds.
Batch 8 processed in 6.33 seconds.
Batch 9 processed in 8.28 seconds.
Batch 10 processed in 5.45 seconds.
Batch 11 processed in 9.70 seconds.
Batch 12 processed in 14.35 seconds.
Batch 13 processed in 12.89 seconds.
Batch 14 processed in 9.75 seconds.
Batch 15 processed in 7.74 seconds.
Batch 16 processed in 6.92 seconds.
Batch 17 processed in 9.06 seconds.
Batch 18 processed in 18.13 seconds.
Batch 19 processed in 13.73 seconds.
Batch 20 processed in 13.62 seconds.
Batch 21 processed in 12.16 seconds.
Batch 22 processed in 11.76 seconds.
Batch 23 processed in 11.56 seconds.
Batch 24 processed in 13.47 seconds.
Batch 25 processed in 11.44 seconds.
Batch 26 processed in 12.92 seconds.
Batch 27 processed in 13.29 seconds.
Batch 28 processed in

In [None]:
import numpy as np

# Reload the embeddings from the .npy file written by the embedding cell above.
# Note: despite its name, `base_dir` is the full path of the file, not a directory.
base_dir = '../dataset/phase 4/review_embeddings.npy'
all_embeddings = np.load(base_dir)
# Preview the first five embedding vectors
all_embeddings[0:5]

array([[-0.00599286, -0.0269204 ,  0.06160264, ..., -0.02356998,
        -0.06433377, -0.04103687],
       [ 0.01528736, -0.02138874,  0.09498901, ..., -0.0001806 ,
        -0.08165931,  0.07255717],
       [ 0.01375277, -0.05492403,  0.14241037, ...,  0.07704751,
        -0.06400198, -0.02123824],
       [ 0.0246782 ,  0.01142239,  0.0997716 , ...,  0.06356935,
        -0.07103895,  0.02681897],
       [-0.07149693,  0.0870933 , -0.07307293, ...,  0.0343116 ,
        -0.04620685,  0.00310423]], dtype=float32)

# 7. Distance Calculation

In [None]:
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans  # no longer used in this cell; kept in case other cells rely on the name

# Embeddings are looked up by row position, so they must line up one-to-one with df_combined
# in the row order it had when the embeddings were computed.
# NOTE: run this cell BEFORE the sort cell that follows; once df_combined has been re-sorted,
# its rows no longer match the row order of all_embeddings and the distances would be wrong.
assert len(all_embeddings) == len(df_combined), (
    f"all_embeddings has {len(all_embeddings)} rows but df_combined has {len(df_combined)}"
)

# Start from NaN so that any row not covered by the loop below is easy to spot
df_combined['distance_to_its_centroid'] = np.nan
distance_col = df_combined.columns.get_loc('distance_to_its_centroid')

cluster_labels = df_combined['cluster'].to_numpy()

for cluster_num in range(1, 23):
    # Row positions (not index labels) of the reviews in this cluster
    positions = np.flatnonzero(cluster_labels == cluster_num)
    if positions.size == 0:
        # No reviews carry this cluster id; leave nothing to update
        continue

    # Extract embeddings for these rows
    embeddings_cluster = all_embeddings[positions]

    # The centroid is the mean embedding of the cluster. This is the point a single-cluster
    # KMeans fit converges to, computed directly.
    centroid = embeddings_cluster.mean(axis=0)

    # Euclidean distance from each review embedding in the cluster to the centroid
    distances = cdist(embeddings_cluster, [centroid], 'euclidean').flatten()

    # Write the distances back to the rows of the current cluster
    df_combined.iloc[positions, distance_col] = distances


In [None]:
# Order the reviews by cluster and, within each cluster, from nearest to farthest from the
# centroid; then renumber the rows from 0.
# NOTE: after this cell, row positions in df_combined no longer match rows of `all_embeddings`.
df_combined = df_combined.sort_values(
    ['cluster', 'distance_to_its_centroid'],
    ignore_index=True,
)

In [None]:
df_combined

Unnamed: 0,app,content,score,content_short,cluster,distance_to_its_centroid
0,global storms,Great app. Always turn to it for accurate info...,5,accurate information,1,0.647051
1,Tropical Hurricane Tracker,Excellent app...lots of information in one place.,5,lots of information,1,0.650179
2,My Earthquake Alerts - Map,Great app easy to use good information.,5,Great app easy to use good information,1,0.655233
3,Earthquake Alert!,great app. easy to use and good information.,5,great app. easy to use and good information,1,0.658061
4,global storms,This is a wonderful app that everyone should h...,5,Very accurate and up to the minute with pertin...,1,0.658440
...,...,...,...,...,...,...
14957,Yurekuru Call,There was no response during an emergency eart...,3,It's a non-reliable plane,22,0.965684
14958,VicEmergency,Inaccurate slow woeful total waste of download...,1,Inaccurate slow woeful,22,1.001129
14959,My Earthquake Alerts - Map,Been faithful so far. Dependable.,5,Dependable,22,1.048918
14960,Wind Map Hurricane Tracker 3D,I love watching this! I am certainly not an ex...,5,not an expert,22,1.058714


In [1]:
# Print the first 50 reviews of cluster 1, each preceded by a blank line
# (these are the ones nearest the centroid once df_combined has been sorted above).
for review in df_combined.loc[df_combined['cluster'] == 1, 'content'].head(50):
    print('\n', review)

In [None]:
# Export the combined table, including the distance_to_its_centroid column, to Excel.
save_dir = '../dataset/phase 4/'
# to_excel does not create missing folders, so make sure the target directory exists.
os.makedirs(save_dir, exist_ok=True)
file_path = os.path.join(save_dir, 'topic_modelling_result.xlsx')

df_combined.to_excel(file_path, index=False)

# 8. Counting Total Reviews Based on Cluster

In [2]:
import pandas as pd
import time
import os

# Re-read the exported result workbook so this section works from the saved file
base_dir = '../dataset/phase 4/'
file_path = os.path.join(base_dir, 'topic_modelling_result.xlsx')

df = pd.read_excel(file_path)

In [6]:
# Reviews per cluster: the number of non-null `app` values in each cluster, as a two-column
# table (cluster, app). The `app` column therefore holds the count, not an app name.
cluster_counts = df.groupby('cluster')['app'].count().reset_index()

# Copy the table for pasting elsewhere, and also display it so the result is kept in the
# notebook instead of living only in the clipboard.
cluster_counts.to_clipboard(index=False)
cluster_counts