# Create a Dataset for Sentiment Analysis

In [None]:
# Install the Google Play scraping library imported below (-qq keeps pip's output quiet).
!pip install -qq google-play-scraper

In [None]:
# Install (or upgrade, -U) the watermark extension used to report package versions.
!pip install -qq -U watermark

In [None]:
# Load the watermark extension, then print the Python/IPython versions (-v)
# and the installed versions of the listed packages (-p).
%reload_ext watermark
%watermark -v -p pandas,matplotlib,seaborn,google_play_scraper

CPython 3.6.9
IPython 5.5.0

pandas 1.1.5
matplotlib 3.2.2
seaborn 0.11.0
google_play_scraper 0.1.2


In [None]:
import io
import json
from urllib.request import urlopen

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from google_play_scraper import Sort, reviews, app
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import JsonLexer
from tqdm import tqdm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' figure output.
%config InlineBackend.figure_format='retina'

# Global seaborn theme: white grid background, muted palette, fonts scaled by 1.2.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

In [None]:
# Google Play package ids to scrape; append more ids to widen the dataset.
app_packages = ['com.miHoYo.GenshinImpact']

## Scraping App Information

In [None]:
# Fetch the store metadata (English language, US store) for every package
# in `app_packages` and collect one dict per app.
app_infos = []

for ap in tqdm(app_packages):
  info = app(ap, lang='en', country='us')
  # The 'comments' entry is not wanted in the app table. pop() with a default
  # (instead of `del`) keeps the loop running if the scraper's result does
  # not contain that key.
  info.pop('comments', None)
  app_infos.append(info)

100%|██████████| 1/1 [00:00<00:00,  7.16it/s]


In [None]:
def print_json(json_object):
  """Pretty-print an object as syntax-highlighted JSON.

  The object is serialized with a 2-space indent and keys sorted
  alphabetically; values that `json` cannot serialize natively are
  converted with `str`. The text is then colorized for terminal output
  and printed. Returns None.
  """
  dump_options = dict(indent=2, sort_keys=True, default=str)
  rendered = json.dumps(json_object, **dump_options)
  colored = highlight(rendered, JsonLexer(), TerminalFormatter())
  print(colored)

In [None]:
# Inspect the metadata scraped for the first package.
print_json(app_infos[0])

{
  [94m"adSupported"[39;49;00m: [34mnull[39;49;00m,
  [94m"androidVersion"[39;49;00m: [33m"5.0"[39;49;00m,
  [94m"androidVersionText"[39;49;00m: [33m"5.0 and up"[39;49;00m,
  [94m"appId"[39;49;00m: [33m"com.miHoYo.GenshinImpact"[39;49;00m,
  [94m"containsAds"[39;49;00m: [34mfalse[39;49;00m,
  [94m"contentRating"[39;49;00m: [33m"Teen"[39;49;00m,
  [94m"contentRatingDescription"[39;49;00m: [33m"Alcohol Reference, Fantasy Violence"[39;49;00m,
  [94m"currency"[39;49;00m: [33m"USD"[39;49;00m,
  [94m"description"[39;49;00m: [33m"Step into Teyvat, a vast world teeming with life and flowing with elemental energy.\r\n\r\nYou and your sibling arrived here from another world. Separated by an unknown god, stripped of your powers, and cast into a deep slumber, you now awake to a world very different from when you first arrived.\r\n\r\nThus begins your journey across Teyvat to seek answers from The Seven \u2014 the gods of each element. Along the way, prepare to e

In [None]:
def format_title(title):
  """Shorten an app title for use as a subplot caption.

  Everything from the first ':' onwards is dropped; if the title has no
  ':', everything from the first '-' onwards is dropped instead. The
  remainder is truncated to at most 10 characters.
  """
  cut = title.find(':')
  if cut == -1:
    # No colon: fall back to the first hyphen (may also be absent).
    cut = title.find('-')
  head = title if cut == -1 else title[:cut]
  return head[:10]

# Show every app icon on a grid of at most two rows.
# The column count is rounded UP so that a single app (or an odd number of
# apps) still gets an axis; flooring `len(app_infos) // 2` produced zero
# columns for one app and silently dropped the last app for odd counts.
n_apps = len(app_infos)
n_rows = min(2, max(1, n_apps))
n_cols = max(1, -(-n_apps // n_rows))  # ceiling division, never 0
# squeeze=False keeps `axs` a 2-D array even for a 1x1 grid, so `.flat` works.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(14, 5), squeeze=False)

for i, ax in enumerate(axs.flat):
  if i >= n_apps:
    # Spare cell of the grid (odd number of apps, or no apps): leave it blank.
    ax.axis('off')
    continue
  ai = app_infos[i]
  # Download the icon bytes explicitly: newer Matplotlib releases no longer
  # accept a URL string in plt.imread, but a file-like object always works.
  with urlopen(ai['icon']) as response:
    img = plt.imread(io.BytesIO(response.read()))
  ax.imshow(img)
  ax.set_title(format_title(ai['title']))
  ax.axis('off')

<Figure size 1008x360 with 0 Axes>

In [None]:
# Persist the scraped app metadata: one row per app, header row included,
# no index column (`index` is a boolean flag, so pass False rather than None).
app_infos_df = pd.DataFrame(app_infos)
app_infos_df.to_csv('apps.csv', index=False, header=True)

## Scraping App Reviews

In [None]:
# Collect reviews for every package, one request per (star score, sort order)
# pair. Each review dict is tagged with the sort order it was fetched under
# and the package it belongs to.
sort_labels = [
  (Sort.MOST_RELEVANT, 'most_relevant'),
  (Sort.NEWEST, 'newest'),
]
app_reviews = []

for ap in tqdm(app_packages):
  for score in range(1, 6):
    # 3-star requests ask for twice as many reviews as the other scores
    # (presumably to balance the neutral class; confirm intent).
    review_count = 2000 if score == 3 else 1000
    for sort_order, sort_label in sort_labels:
      rvs, _ = reviews(
        ap,
        lang='en',
        country='us',
        sort=sort_order,
        count=review_count,
        filter_score_with=score
      )
      for review in rvs:
        review['sortOrder'] = sort_label
        review['appId'] = ap
      app_reviews += rvs

100%|██████████| 1/1 [02:02<00:00, 122.78s/it]


In [None]:
# Optional (Google Colab only): uncomment to mount Google Drive at /content/drive.
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Inspect the first collected review to see which fields each record carries.
print_json(app_reviews[0])

{
  [94m"appId"[39;49;00m: [33m"com.miHoYo.GenshinImpact"[39;49;00m,
  [94m"at"[39;49;00m: [33m"2021-01-03 05:31:39"[39;49;00m,
  [94m"content"[39;49;00m: [33m"The game is great. It's fun, unique, and the graphics are pretty impressive. The only issue is the lag I'm getting. I've tried reinstalling, playing with the graphic settings, everything but nothing changes the common freezing and severe frame rate issues I'm receiving on my Google pixel 3 XL. I want so bad for this to run smoothly. Will watch closely for updates. (Edit) several months have passed and so far no fix for any of this, nor have I seen anything hinting that they will work on it."[39;49;00m,
  [94m"repliedAt"[39;49;00m: [34mnull[39;49;00m,
  [94m"replyContent"[39;49;00m: [34mnull[39;49;00m,
  [94m"reviewCreatedVersion"[39;49;00m: [33m"1.2.0_1565149_1627898"[39;49;00m,
  [94m"reviewId"[39;49;00m: [33m"gp:AOqpTOH_TJ35eN5SWpoOjKnyax0-5olqmu6AAa9aTdoeabbifZyZbRvsmFJ4-vC92Stk_5MWWfKTskqyVeaY5A"[

In [None]:
# Total number of review records collected by the scraping loop.
len(app_reviews)

53974

Save the collected reviews to a CSV file:

In [None]:
# Persist the collected reviews: one row per review, header row included,
# no index column (`index` is a boolean flag, so pass False rather than None).
app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.to_csv('reviews.csv', index=False, header=True)