<a href="https://colab.research.google.com/github/Ricco48/Multilabel-Sentiment-and-Emotion-Dataset-from-Indonesian-Mobile-Application-Review/blob/main/Data%20Scrapping%20Code/Data_Scrapping_for_GitHub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Connect to G-Drive**

In [None]:
# Mount Google Drive Into The Colab Runtime
#==========================================
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'   # Path handed to drive.mount() as the mount point
drive.mount(DRIVE_MOUNT_POINT)

# **Set Package & Libs.**

In [None]:
# Install Required Libs / Package
#================================
!pip install google-play-scraper         # Get Google Play Scraper Package

In [None]:
# Import Required Libs / Package
#===============================

# Package for Data Pre-processing
import csv
import datetime
import html
import io
import json
import re
import string
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# Package for Data Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter

# Package for Data Scrapping
from google_play_scraper import app, Sort, reviews_all, reviews

print("Process Completed!")

Process Completed!


# **Set-up Apps. URLs**

In [None]:
# Package IDs Of The Apps To Scrape
#==================================
# Replace the placeholders with real app IDs copied from each app's
# official store page URL.
apps_package = [
  'mobileAppID1',
  'mobileAppID2',
  'mobileAppID3',
]

# How to find the Mobile App ID:
# it is the value of the `id` parameter in the store page URL, e.g.
#   https://blablakwndklawnd.awdscs.awdaw/awdascafew/awdawdawd/awdsvdfgzrgseg/details?id=awdawdkakwdkawkdaw&hl=en&awdas44
#   Mobile App ID  >>  awdawdkakwdkawkdaw


# **Data Scraping**

### **Get Apps. Information**

In [None]:
# Scrap Apps. Information
#========================
# Fetches the store metadata of every package listed in `apps_package`
# (Indonesian language / Indonesian store) and collects one dict per app.
apps_info = [] # Used to store apps. information (reset on every run of this cell)

for app_id in tqdm(apps_package):
  info = app(app_id, lang = 'id', country = 'id') # Set boundaries for language and apps. country
  # Drop the un-used 'comments' entry. pop() with a default is used instead of
  # `del` so a result that does not carry this key does not abort the loop.
  info.pop('comments', None)
  apps_info.append(info)                          # Append apps. information into a single container

In [None]:
# Helper To Print Objects As Syntax-Highlighted JSON
#===================================================
def print_json(json_object):
  """Print `json_object` as indented, colourised JSON.

  The object is serialised with a 2-space indent and sorted keys; values the
  `json` module cannot encode natively are converted with `str`.
  The highlighted text is written to stdout; nothing is returned.
  """
  dump_options = {'indent': 2, 'sort_keys': True, 'default': str}
  json_str = json.dumps(json_object, **dump_options)
  colored = highlight(json_str, JsonLexer(), TerminalFormatter())
  print(colored)

# Test Call
#==========
print_json(apps_info)

In [None]:
# Helper To Shorten An App Title For Use As An Icon Caption
#==========================================================
def format_title(title):
  """Return a short caption derived from `title`.

  The title is cut at the first ':' if there is one, otherwise at the first
  '-' if there is one; the result is then limited to at most 10 characters.
  """
  for separator in (':', '-'):
    cut = title.find(separator)
    if cut != -1:
      # Only the first separator kind that occurs is used for cutting.
      return title[:cut][:10]
  return title[:10]

# Set Icon Size & Position
#=========================
# Two rows of icons. The column count is ceil(n_apps / 2) so that an odd
# number of apps still gets a cell for every icon, and is never 0.
n_apps = len(apps_info)
n_cols = max(1, (n_apps + 1) // 2)
# squeeze=False keeps `axs` a 2-D array for every grid size.
fig, axs = plt.subplots(2, n_cols, figsize=(14, 5), squeeze=False)

# Run Function
#=============
for i, ax in enumerate(axs.flat):
  if i < n_apps:
    ai = apps_info[i]
    # Download the icon ourselves and hand the bytes to imread(): recent
    # matplotlib versions refuse URL strings. With no file name to inspect,
    # imread() tries PNG.
    # NOTE(review): assumes the icon URL serves a PNG image - confirm.
    with urllib.request.urlopen(ai['icon']) as response:
      img = plt.imread(io.BytesIO(response.read()))
    ax.imshow(img)
    ax.set_title(format_title(ai['title']))
  # Hide the axes frame/ticks for icon cells and for unused grid cells alike.
  ax.axis('off')

### **Set Apps. Information To Dataframe**

In [None]:
# Convert Into Pandas Dataframe (One Row Per Entry Of `apps_info`)
#=================================================================
app_infos_df = pd.DataFrame(apps_info)

# Test Call: Display The Apps. Information Table
#===============================================
app_infos_df

### **Scraping Reviews**

In [None]:
# Scrap Apps. Reviews
#====================
# For every app, every rating (1-5) and both sort orders, request a batch of
# Indonesian reviews and tag each review with the sort order and the app it
# belongs to.
apps_reviews = [] # Used to store apps. reviews (reset on every run of this cell)

# NOTE: the loop variable is deliberately NOT called `app`; that name is the
# google_play_scraper.app() function used by the apps. information cell, and
# shadowing it would break that cell when it is run again.
for app_id in tqdm(apps_package):
  for score in range(1, 6):                                             # One request per rating level (1 to 5 stars)
    for sort_order in [Sort.MOST_RELEVANT, Sort.NEWEST]:                # Collect both the most relevant and the newest reviews
      # The second return value (continuation token) is not needed here.
      revs, _continuation_token = reviews(app_id,
                                          lang = 'id',                          # Boundaries for apps. reviews language in Indonesia
                                          country = 'id',                       # Boundaries for apps. reviews country from Indonesia only
                                          sort = sort_order,                    # Sort order of the requested reviews
                                          count = 300 if score == 3 else 600,   # Requested number of reviews: 300 for score 3, 600 otherwise
                                          filter_score_with = score)            # Only reviews with this rating

      for rev in revs:
        rev['sortOrder'] = 'most_relevant' if sort_order == Sort.MOST_RELEVANT else 'newest'
        rev['appId'] = app_id                                           # Remember which app the review was scraped from
      apps_reviews.extend(revs)

In [None]:
# Test Call: Pretty-Print The First Scraped Review
#=================================================
print_json(apps_reviews[0])

In [None]:
# Get The Total Number Of Scraped Reviews
#========================================
len(apps_reviews)

### **Set Apps. Reviews To Dataframe**

In [None]:
# Set Into Pandas Dataframe (One Row Per Entry Of `apps_reviews`)
#================================================================
app_reviews_df = pd.DataFrame(apps_reviews)

In [None]:
# Test Call: Preview The First Rows Of The Reviews Table
#=======================================================
app_reviews_df.head()

In [None]:
# Keep Only The Review Text & Its Score
#======================================
review_columns = ["content", "score"]
appsReviewsDataframe = app_reviews_df.loc[:, review_columns]

In [None]:
# Test Call: Display The Content/Score Table
#===========================================
appsReviewsDataframe

In [None]:
# Non-Null Count Per Column & Number Of Reviews Per Score
#========================================================
non_null_counts = appsReviewsDataframe.count()
print(non_null_counts, "\n")

print("Data Distribution:\n==================")
score_distribution = appsReviewsDataframe.score.value_counts().sort_index()
print(score_distribution)

# **Data Cleaning**

In [None]:
# Remove Duplicated Reviews (First Occurrence Kept) & Reset The Index
#====================================================================
newRevDF = (
  appsReviewsDataframe
  .drop_duplicates('content', keep = 'first')
  .reset_index(drop = True)
)

# Remove URLs, Mentions, Hashtags & The [UNAME] Placeholder
#==========================================================
# Each (pattern, flags) pair is deleted from the review text, in this order.
removal_patterns = [
  (r'http\S+\s*', 0),               # URL plus the whitespace that follows it
  (r'@[a-zA-Z_0-9]*', 0),           # User mention
  (r'#[a-zA-Z_0-9]*', 0),           # Hashtag
  (r'\[UNAME\]', re.IGNORECASE),    # Literal "[UNAME]" marker, any letter case
]

for pattern, pattern_flags in removal_patterns:
  # Defaults bind the current pair so the lambda does not depend on loop state.
  newRevDF['content'] = newRevDF['content'].apply(
    lambda x, pattern = pattern, pattern_flags = pattern_flags:
      re.sub(pattern, '', str(x), flags = pattern_flags))

# Remove Emoji
#=============
def remove_emoji(string):
    """Return `string` with every run of characters from the ranges below deleted.

    The ranges cover the common emoji blocks, but several of them are very
    broad (e.g. U+24C2-U+1F251 and the whole U+10000-U+10FFFF span), so many
    non-emoji symbols and scripts in those ranges are removed as well.
    """
    removed_characters = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # U+2500 to U+2BEF
        "\U00002702-\U000027B0",  # U+2702 to U+27B0
        "\U00002702-\U000027B0",  # same range listed twice, as in the original pattern
        "\U000024C2-\U0001F251",  # U+24C2 to U+1F251
        "\U0001f926-\U0001f937",  # U+1F926 to U+1F937
        "\U00010000-\U0010ffff",  # everything outside the Basic Multilingual Plane
        "\u2640-\u2642",          # U+2640 to U+2642
        "\u2600-\u2B55",          # U+2600 to U+2B55
        "\u200d",                 # zero width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",                 # variation selector-16
        "\u3030",
    )
    character_class = "[" + "".join(removed_characters) + "]+"
    return re.sub(character_class, r'', string, flags=re.UNICODE)

# Strip Emoji From Every Review
#==============================
newRevDF['content'] = newRevDF['content'].apply(remove_emoji)

# Collapse Repetitions
#=====================
# Any sequence of one or more characters that occurs three or more times in a
# row is reduced to exactly two occurrences.
repeated_run = re.compile(r'(.{1,})\1{2,}')
newRevDF['content'] = newRevDF['content'].apply(lambda x: repeated_run.sub(r'\1\1', str(x)))

# Replace "\n" (ENTER) With A Space
#==================================
newRevDF['content'] = newRevDF['content'].str.replace('\n', ' ', regex = False)

# Test Call
#==========
newRevDF

In [None]:
# Non-Null Count Per Column & Reviews Per Score After Cleaning
#=============================================================
cleaned_non_null_counts = newRevDF.count()
print(cleaned_non_null_counts, "\n")

print("Data Distribution:\n==================")
cleaned_score_distribution = newRevDF.score.value_counts().sort_index()
print(cleaned_score_distribution)

# **Split Data Based On Score**

In [None]:
# One Dataframe Per Score Value (1 To 5)
#=======================================
# The generator yields the rows of `newRevDF` whose score equals 1, 2, 3, 4
# and 5, in that order, and each result is bound to its own name.
df_Score1, df_Score2, df_Score3, df_Score4, df_Score5 = (
  newRevDF[newRevDF["score"] == score_value] for score_value in range(1, 6)
)

In [None]:
# Test Call: Preview The First Rows Of Every Per-Score Dataframe
#===============================================================
score_frames = [df_Score1, df_Score2, df_Score3, df_Score4, df_Score5]
separator = "\n====================================\n"

for position, score_frame in enumerate(score_frames):
  if position:            # separator goes between previews, not before the first one
    print(separator)
  print(score_frame.head())

# **Save To CSV File**

In [None]:
# Save Every Per-Score Dataframe As A Tab-Separated File
#=======================================================
# Placeholder output folder - replace it with the real save location.
# Forward slashes / pathlib are used because backslashes inside a normal
# string literal are escape characters and are not path separators on
# Colab (Linux).
SAVE_DIR = Path('data/save/location')
SAVE_DIR.mkdir(parents = True, exist_ok = True)   # make sure the folder exists before writing

scored_frames = {
  1: df_Score1,
  2: df_Score2,
  3: df_Score3,
  4: df_Score4,
  5: df_Score5,
}

for score_value, score_frame in scored_frames.items():
  # Files are named data_1 ... data_5: header row included, index column left out.
  score_frame.to_csv(SAVE_DIR / f'data_{score_value}',
                     sep = '\t',
                     index = False,
                     header = True)