This notebook scapes app reviews from Google play store and persists training, validation & test dataset for fine-tuning BERT model for sentiment classification using user app ratings.

In [1]:
! pip install google_play_scraper

Collecting google_play_scraper
[?25l  Downloading https://files.pythonhosted.org/packages/29/b5/560ff4472c33285b91af435815c7e9cff4c2acc01620fd0f80a59d71e345/google-play-scraper-0.1.1.tar.gz (49kB)
[K     |████████████████████████████████| 51kB 1.6MB/s 
[?25hBuilding wheels for collected packages: google-play-scraper
  Building wheel for google-play-scraper (setup.py) ... [?25l[?25hdone
  Created wheel for google-play-scraper: filename=google_play_scraper-0.1.1-cp36-none-any.whl size=22260 sha256=8be37ae33e673ca8419247bb4e8557db3be2834b0a19385de63ec4326a607edc
  Stored in directory: /root/.cache/pip/wheels/d7/1f/71/e2b30aab85297ad6dd2e3049587a6763cfb7e803a0b76d982e
Successfully built google-play-scraper
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-0.1.1


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from google_play_scraper import Sort, reviews, app


In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

#### Download and save app reviews data


In [None]:
# Package name for each app
app_packages = ['com.flipkart.android', 
                'com.ubercab',
                'com.olacabs.customer', 
                'com.instagram.android',
                'com.facebook.katana',
                'com.whatsapp',
                'com.linkedin.android',
                'com.google.android.talk',
                'com.applicatiom.zomato'
                ]

In [None]:
# To download a balanced dataset w.r.t ratings for each app
def getReviews(num_reviews, app_packages, cols):
  num_reviews_per_app = num_reviews
  app_reviews = []
  for rating in range(1,6):
    for app_name in tqdm(app_packages):
      rvs,_ = reviews(app_name, 
              lang='en', 
              country='in', 
              sort=Sort.MOST_RELEVANT,
              filter_score_with=rating, 
              count=num_reviews_per_app)
      for review in rvs:
        review['appid'] = app_name   # adding app name to each review
      app_reviews.extend(rvs)

  # Saving reviews to CSV file
  df = pd.DataFrame(app_reviews)
  return df[cols]

In [None]:
cols = ['userName', 'content', 'score', 'appid']

In [None]:
# downloading reviews 
reviews_df = getReviews(6000, app_packages, cols)

100%|██████████| 9/9 [01:23<00:00,  9.28s/it]
100%|██████████| 9/9 [02:20<00:00, 15.63s/it]
100%|██████████| 9/9 [01:55<00:00, 12.87s/it]
100%|██████████| 9/9 [01:25<00:00,  9.54s/it]
100%|██████████| 9/9 [01:11<00:00,  7.97s/it]


In [None]:
reviews_df.shape

(238975, 4)

In [None]:
def getLabel(rating):
  if rating >= 3:
    return 1
  else:
    return 0
reviews_df['label'] = reviews_df['score'].apply(lambda score: getLabel(score))

In [None]:
reviews_df.columns

Index(['userName', 'content', 'score', 'appid', 'label'], dtype='object')

In [4]:
drive_path = "drive/My Drive/DataScience_Projects/"
file_name = drive_path + "reviews.csv"

In [None]:
# Saving datasets to drive
reviews_df.to_csv(file_name, index=False)

### Handling class imbalance

In [5]:
reviews_df = pd.read_csv(file_name)

In [8]:
reviews_df.columns

Index(['userName', 'content', 'score', 'appid', 'label'], dtype='object')

In [9]:
reviews_df[['content', 'score']].head(50)

Unnamed: 0,content,score
0,It is high time flipkart stop paying celebriti...,1
1,Worst thing is scrolling scrolling scrolling.....,1
2,For the past 3-4 years I face this issue every...,1
3,Device : One Plus 7. There is something wrong ...,1
4,Suggestions 1) There should be one touch butto...,1
5,"🙏🙏🙏 Nice, All item's are good quality.Everythi...",1
6,"Flipkart, I'm using this since it launched. Bu...",1
7,"Cant login to my account with OTP, OTP is auto...",1
8,FLIPKART coin system is really bad I just trie...,1
9,0 stars. I have been playing games on flipkart...,1


In [None]:
reviews_df['label'].value_counts()

1    144000
0     94975
Name: label, dtype: int64

In [None]:
# Balancing classes by undersampling
min_sample_size = reviews_df['label'].value_counts().min()

class1_df = reviews_df[reviews_df['label'] == 1].head(min_sample_size)
class2_df = reviews_df[reviews_df['label'] == 0].head(min_sample_size)

class1_df.shape, class2_df.shape

((94975, 5), (94975, 5))

### Preparing training, validation & test set

In [None]:
def getTrainingDatasets(df, train_size):
  train_df, test_df = train_test_split(df,train_size=0.7)
  val_df, test_df = train_test_split(test_df,train_size=0.5)
  return train_df, val_df, test_df

train1_df, val1_df, test1_df = getTrainingDatasets(class1_df, 0.7)
train2_df, val2_df, test2_df = getTrainingDatasets(class2_df, 0.7)

train1_df.shape, val1_df.shape, test1_df.shape, train2_df.shape, val2_df.shape, test2_df.shape, 

((66482, 5), (14246, 5), (14247, 5), (66482, 5), (14246, 5), (14247, 5))

In [None]:
train_df = pd.concat([train1_df, train2_df])
val_df = pd.concat([val1_df, val2_df])
test_df = pd.concat([test1_df, test2_df])

train_df.shape, val_df.shape, test_df.shape, 

((132964, 5), (28492, 5), (28494, 5))

In [None]:
train_df.to_csv(drive_path+"train.csv", index=False)
val_df.to_csv(drive_path+"validation.csv", index=False)
test_df.to_csv(drive_path+"test.csv", index=False)