# 1. Read Dataset

In [41]:
import pandas as pd
import os

# Folder holding the translated review exports (one CSV per store).
base_dir = '../dataset/phase 2/'

# Build both CSV paths from the shared folder: Google Play first, App Store second.
file_path_1, file_path_2 = (
    os.path.join(base_dir, csv_name)
    for csv_name in ('df_translated_google_play.csv', 'df_translated_app_store.csv')
)

# df_1 holds the Google Play reviews, df_2 the App Store reviews.
df_1, df_2 = pd.read_csv(file_path_1), pd.read_csv(file_path_2)

# 2. Combine Dataset From Play Store and App Store

## 2.1. App Name Matching

In [42]:
# Align app names across the two stores so the same app carries one label in both frames.
# Google Play side (df_1): only the Red Cross app needs renaming (plain hyphen variant).
df_1['app'] = df_1['app'].replace({'Hazards - Red Cross': 'Hazards Red Cross'})

# App Store side (df_2): map each store-specific title to the shared name.
df_2['app'] = df_2['app'].replace({
    'GeoNet Quake': 'GeoNet',
    'Hazards – Red Cross': 'Hazards Red Cross',  # this title uses an en dash, not a hyphen
    'Disaster Alert (PDC Global)': 'Disaster Alert',
    'Earthquake+ Alerts Map & Info': 'Earthquake + Alerts Map & Info',
})

## 2.2. Concatenate Play Store and App Store Datasets

In [43]:
# Stack the shared columns of both store frames into one frame.
# ignore_index=True already yields a fresh 0..n-1 index for the combined rows.
df = pd.concat(
    [store_df[['app', 'content', 'score']] for store_df in (df_1, df_2)],
    ignore_index=True,
)

# 3. Filter Natural Disaster App Dataset

In [44]:
# Apps kept for the natural-disaster analysis. Entries are compared for exact equality
# against the `app` column, so spelling, spacing and punctuation must match the data.
# Every entry needs its own comma: adjacent string literals are silently concatenated
# by Python (a missing comma previously fused 'PREP' and '112 India' into one bogus
# name, which dropped both apps from the filtered dataset).
natural_disaster_app_name = [
    'Earthquake Alert!', 'My Earthquake Alerts - Map', 'Earthquakes Tracker', 'Yurekuru Call',
    'Wind Map Hurricane Tracker 3D', 'global storms', 'FEMA',
    'Volcanoes & Earthquakes', 'Hazards Near Me NSW', 'Disaster Alert',
    'Tropical Hurricane Tracker', 'GeoNet', 'My Hurricane Tracker & Alerts',
    'Emergency: Severe Weather App', 'Hurricane Tracker', 'Hazards Red Cross',
    'NINA - Die Warn-App des BBK', 'SeaStorm Hurricane Tracker', 'National evacuation center guide',
    'My Hurricane Tracker Pro', 'Alert SA', 'Floods Near Me NSW', 'Safety tips',
    'Earthquake', 'Earthquake + Alerts Map & Info', 'Natural Disaster Monitor',
    'Earthquakes Today', 'FloodAlert Waterlevel Alerts', 'NERV Disaster Prevention',
    'SES Assistance QLD', 'Hurricane & Typhoon Track',
    'QuakeFeed Earthquake Tracker', 'LastQuake', 'VIC Fires', 'PREP',
    '112 India', 'VicEmergency', 'CodeRED Mobile Alert', 'myAlerts', 'SD Emergency',
    'Emergency', 'Alertswiss', 'Alert2Me - Emergency Alerts', 'BD 999',
    'KwiKam (Quicking Services)', 'Emergency Ready App', 'Anhaar',
]
# Keep only reviews whose app name is on the natural-disaster allow-list.
df = df.loc[df['app'].isin(natural_disaster_app_name)]

# 4. Data Cleansing

## 4.1. Remove Duplicate Data

In [45]:
# Drop repeated reviews: rows sharing both text and app are duplicates; the first one is kept.
df = df.loc[~df.duplicated(subset=['content', 'app'])]

## 4.2. Remove Empty Review

In [46]:
# Discard rows that have no review text at all.
df = df[df['content'].notna()]

## 4.3. Remove Short Review

In [47]:
# Keep reviews with more than four whitespace-separated words, and restrict the
# frame to the three working columns in the same step.
df = df.loc[
    df['content'].map(lambda text: len(str(text).split())) > 4,
    ['app', 'content', 'score'],
]

## 4.4. Remove Zero Rating

In [48]:
# Drop reviews rated 0, then renumber the rows from 0.
df = df.loc[df['score'].ne(0)].reset_index(drop=True)

## 4.5. Remove Emoji

In [49]:
import emoji
import re

def remove_emoji(string):
    """Return ``string`` with every run of characters in the listed ranges deleted.

    Args:
        string: Text to clean. Must be a ``str``; ``re`` raises ``TypeError`` otherwise.

    Returns:
        The input with all matching characters removed. Nothing is inserted in
        their place, so whitespace around a removed character is left untouched.

    NOTE(review): the ranges are much wider than emoji alone. U+24C2..U+1F251
    spans most of the Basic Multilingual Plane above U+24C2 (CJK text included),
    and U+10000..U+10FFFF covers every supplementary-plane code point. Confirm
    that stripping such text is intended.
    """
    # Character-class members, in their original order. Several entries overlap
    # or repeat; that is harmless inside a regex character class.
    ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002702-\U000027B0",  # dingbats
        u"\U000024C2-\U0001F251",  # labelled "enclosed characters"; see NOTE above
        u"\U0001F900-\U0001F9FF",  # supplemental symbols and pictographs
        u"\U0001FA70-\U0001FAFF",  # symbols and pictographs extended-A
        u"\U00002702-\U000027B0",  # dingbats (repeated)
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs (repeated)
        u"\U00002500-\U00002BEF",  # box drawing, geometric shapes, arrows, misc symbols
        u"\U0001F926-\U0001F937",  # subset of supplemental symbols and pictographs
        u"\U00010000-\U0010FFFF",  # every supplementary-plane code point
    )
    emoji_pattern = re.compile("[" + "".join(ranges) + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# Clean every review text element-wise with remove_emoji.
df['content'] = df['content'].map(remove_emoji)

# 5. Shorten User Review

In [50]:
import time
import warnings
from transformers import pipeline
# Silence all warnings from here on: no category or message filter is given,
# so this also hides deprecation and library warnings.
warnings.filterwarnings('ignore')

# Build the question-answering pipeline a single time; it is reused for every review.
# The same checkpoint name is used for both the model and its tokenizer.
model_name = "deepset/roberta-base-squad2"
qa_pipeline = pipeline(
    task='question-answering',
    model=model_name,
    tokenizer=model_name,
)

def qna(review, qa_pipeline):
    """Ask a fixed complaint/suggestion question about one review.

    Args:
        review: Review text, passed to the pipeline as the ``context``.
        qa_pipeline: Callable that accepts a dict with ``context`` and
            ``question`` keys and returns a mapping containing ``answer``.

    Returns:
        The value stored under ``'answer'`` in the pipeline's result.
    """
    payload = dict(
        context=review,
        question="What is the user's complaint or suggestion about the natural disaster emergency app?",
    )
    return qa_pipeline(payload)['answer']

result = []      # one extracted answer per review, in row order
index = 0        # number of reviews processed so far
total_time = 0   # seconds spent on QA calls since the last progress line

for index, review in enumerate(df['content'], start=1):
    start_time = time.time()

    # Run the QA model on this review and keep its answer.
    answer = qna(review, qa_pipeline)
    result.append(answer)

    total_time += time.time() - start_time

    # Report only on every 500th review.
    if index % 500:
        continue
    print(f"Processed {index} reviews in {total_time} seconds")
    total_time = 0  # the timer restarts, so each line covers the latest 500 reviews only


  from .autonotebook import tqdm as notebook_tqdm


Processed 500 reviews in 186.42636132240295 seconds
Processed 1000 reviews in 160.45618295669556 seconds
Processed 1500 reviews in 170.79099416732788 seconds
Processed 2000 reviews in 172.19620084762573 seconds
Processed 2500 reviews in 159.82256984710693 seconds
Processed 3000 reviews in 154.8026683330536 seconds
Processed 3500 reviews in 159.95489287376404 seconds
Processed 4000 reviews in 156.24324226379395 seconds
Processed 4500 reviews in 171.52524876594543 seconds
Processed 5000 reviews in 172.09528970718384 seconds
Processed 5500 reviews in 169.3285629749298 seconds
Processed 6000 reviews in 154.22064661979675 seconds
Processed 6500 reviews in 199.30241012573242 seconds
Processed 7000 reviews in 202.18596029281616 seconds
Processed 7500 reviews in 180.31750965118408 seconds
Processed 8000 reviews in 159.18331146240234 seconds
Processed 8500 reviews in 159.2018096446991 seconds
Processed 9000 reviews in 157.96987962722778 seconds
Processed 9500 reviews in 165.9948661327362 seconds

In [52]:
# Attach the extracted answers as a new column; `result` must have one entry per row.
df = df.assign(content_short=result)

In [53]:
# Show the frame so the content_short column can be compared with the original review text.
df

Unnamed: 0,app,content,score,content_short
0,Disaster Alert,Working as a Public Health Nurse I get to resp...,5,I may have lost and/or procured gadgets to aid...
1,Disaster Alert,Nice to have before traveling to unknown terri...,5,Nice to have before traveling to unknown terri...
2,Disaster Alert,I like! I'm trying to find anything about tsun...,5,tsunami
3,Disaster Alert,good to have but what options are expected in ...,5,good to have
4,Disaster Alert,Shows hazards all right but refuses to send no...,2,Useless to me without notifications
...,...,...,...,...
28156,Earthquake,It works quite well even anticipates some othe...,5,works quite well
28157,Earthquake,This application is very good.,5,This application is very good
28158,Earthquake,Data from earthquakes in Chile in the last 24 ...,2,the application is not updating the telluric a...
28159,Earthquake,This is as good as earthquake apps can go. Thi...,5,detailed info on many earthquakes that even ot...


# 6. Save Dataset

In [2]:
# Write the cleaned, shortened reviews out for the topic-modelling stage.
# NOTE(review): the input folder is '../dataset/phase 2/' (with a space) while this
# one is 'phase_3' (underscore) — confirm the intended directory name.
save_dir = '../dataset/phase_3'
output_path = os.path.join(save_dir, 'topic_modelling_dataset.csv')

# Create the target folder if it is missing. Otherwise to_csv raises, and the
# output of the long-running QA step above would not be saved.
os.makedirs(save_dir, exist_ok=True)

# index=False: the row index is not written to the CSV.
df.to_csv(output_path, index = False)