In [115]:
import json
import random
from pathlib import Path

import pandas as pd
import spacy
import textacy
from tqdm.notebook import tqdm

# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Convenience patch: give every Path an `ls()` method that returns the
# entries yielded by `iterdir()` as a list.
Path.ls = lambda x: list(x.iterdir())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Resolve the data directory to an absolute path and fail fast, with a
# readable message, if it is missing or is not a directory (a plain
# `exists()` check would let a regular file through and fail later in `ls()`).
data_dir = Path("../data").resolve()
assert data_dir.is_dir(), f"Data directory not found: {data_dir}"
# Last expression of the cell: display the entries of the data directory.
data_dir.ls()

[PosixPath('/home/nirant/AppReview/data/Clubhouse_us_app_store_reviews.json'),
 PosixPath('/home/nirant/AppReview/data/frequency_count.json'),
 PosixPath('/home/nirant/AppReview/data/Moj_us_app_store_reviews.json'),
 PosixPath('/home/nirant/AppReview/data/Uber_us_app_store_reviews.json'),
 PosixPath('/home/nirant/AppReview/data/com.ubercab_us_play_store_reviews.json'),
 PosixPath('/home/nirant/AppReview/data/Netflix_us_app_store_reviews.json'),
 PosixPath('/home/nirant/AppReview/data/Moj_in_app_store_reviews.json'),
 PosixPath('/home/nirant/AppReview/data/IndiaGold_in_app_store_reviews.json')]

Note:

Here we will explore app reviews for just one app: Uber (the passenger/cab app, not the driver app). Data for other apps is included, and reproducing this analysis for them is left as an exercise for you.

To get an overview of all the reviews at once, we first combine them into one large text string and explore that.

In [8]:
# Read the Uber (US App Store) review dump into a DataFrame, then merge the
# text of every review into one space-separated string.
file_path = data_dir / "Uber_us_app_store_reviews.json"
assert file_path.exists()
with file_path.open("r") as f:
    raw_data = pd.read_json(f)
reviews = " ".join(raw_data["review"].to_list())

In [34]:
# Collect the review texts from every matching file into one flat list.
reviews = []
# Keep only files whose name contains "app" -- in the directory listing above
# these are the "*_app_store_reviews.json" dumps.
files = [path for path in data_dir.ls() if "app" in path.name]
for file_path in files:
    try:
        with file_path.open("r") as f:
            file_data = pd.read_json(f)
        reviews += file_data["review"].to_list()
    except ValueError:
        # Typically means pandas could not parse the file as JSON.
        # The message must use `file_path`: the previous `file` name was
        # undefined and turned this handler into a NameError.
        print(f"Value Error with {file_path.name}")
    except KeyError:
        # The file was parsed but has no "review" column.
        print(f"Key error with {file_path.name}")

In [35]:
# Load the `en_core_web_trf` spaCy pipeline by name.
nlp = spacy.load("en_core_web_trf")

In [36]:
# Number of individual reviews collected from the loaded files.
len(reviews)

6049

Since we have limited memory on this machine, we disable the memory-heavy parts of the pipeline. If we combine all reviews into one large chunk, this is what we get:

```python-traceback
ValueError: [E088] Text of length 2737687 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`
 ```

In [32]:
# nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])
# nlp.max_length = len(reviews) + 1
# nlp(reviews)

Even after disabling those components, processing our relatively "small" data takes an extremely long time and a lot of memory. The solution? Batch your data.

In [37]:
from spacy.lang.en import English
# Instantiate the bare `English` language class instead of loading a model by
# name; note this rebinds `nlp`, replacing the `en_core_web_trf` pipeline
# loaded in an earlier cell.
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer

In [155]:
# Trim leading/trailing whitespace from every review; `%time` reports how
# long the statement takes.
%time reviews = [rev.strip() for rev in reviews]

CPU times: user 1.36 ms, sys: 0 ns, total: 1.36 ms
Wall time: 1.37 ms


In [160]:
from functools import lru_cache

In [204]:
from collections import Counter
from typing import List


class TextSummary:
    """Token-frequency summary of a collection of text records.

    Attributes set during construction:
        token_summary: Counter mapping each token string to its total count
            across all records.
        token_stats: dict with the keys "size", "vocab_sz" and "vocab".
        vocab, vocab_sz, size: the same statistics exposed as attributes.

    Tokenisation uses the module-level ``tokenizer`` object, which must be
    defined before this class is instantiated.
    """

    def __init__(self, records: List[str]):
        """Tokenise ``records`` and compute the corpus-level statistics."""
        self.token_summary = self.make_summary(records)
        self.token_stats = self.get_corpora_stats(self.token_summary)

    def make_summary(self, records: List[str]) -> Counter:
        """Get a count distribution for the entire corpus.

        Args:
            records: the texts to tokenise; may be empty.

        Returns:
            A Counter mapping each token's string form to the number of times
            it occurs across all records (empty when ``records`` is empty).
        """
        # One Counter per record, keyed by the string form of each token.
        per_record_counts = [
            Counter(str(token) for token in doc) for doc in tokenizer.pipe(records)
        ]

        # Fold into a fresh Counter rather than starting from the first
        # record's Counter: this works for an empty corpus (no IndexError)
        # and leaves the per-record Counters untouched.
        count_summary = Counter()
        for record_counts in tqdm(per_record_counts):
            count_summary.update(record_counts)

        return count_summary

    def get_corpora_stats(self, summary: Counter) -> dict:
        """Derive size statistics from a token Counter.

        Besides returning them, stores the results on the instance as
        ``vocab``, ``vocab_sz`` and ``size``.

        Args:
            summary: mapping of token -> count, as built by ``make_summary``.

        Returns:
            A dict with "size" (total token count), "vocab_sz" (number of
            distinct tokens) and "vocab" (list of the distinct tokens).
        """
        self.vocab = list(summary.keys())
        self.vocab_sz = len(self.vocab)
        self.size = sum(summary.values())
        return {"size": self.size, "vocab_sz": self.vocab_sz, "vocab": self.vocab}


#         count, uniques = 0, []
#         for record in summary:
#             record_sz = sum([record[key] for key in record.keys()])
#             uniques.extend(list(record.keys()))
#             count += record_sz

#         """Get Unique Words and Their Count"""
#         uniques = set(uniques)
#         unique_count = len(uniques)
#         return count, uniques, unique_count

In [205]:
tsm = TextSummary(reviews)

  0%|          | 0/6048 [00:00<?, ?it/s]

In [201]:
tsm.token_summary.most_common(25)

[('.', 22666),
 ('I', 21739),
 ('the', 18346),
 ('to', 17547),
 ('and', 14836),
 (',', 12667),
 ('a', 10802),
 ('it', 9598),
 ('is', 6861),
 ('that', 6747),
 ('of', 6681),
 ('my', 5937),
 ('you', 5886),
 ('for', 5472),
 ('!', 5173),
 ('have', 4827),
 ('n’t', 4820),
 ('on', 4607),
 ('in', 4589),
 ('app', 4377),
 ('but', 4058),
 ('this', 3964),
 ('was', 3815),
 ('me', 3523),
 ('’s', 3475)]