In [None]:
import json
import random
from pathlib import Path

import pandas as pd
import spacy
import textacy
from tqdm.notebook import tqdm
from spacy.lang.en import English

# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2
# Convenience monkeypatch: give every Path an `ls()` method returning its directory entries as a list.
Path.ls = lambda x: list(x.iterdir())

In [None]:
# Absolute location of the data folder (one level above the current working directory).
data_dir = Path("..", "data").resolve()
# Fail early if the folder is missing.
assert data_dir.exists()
# Display the folder's entries.
list(data_dir.iterdir())

Note:

Here we explore App Store reviews for just one app: Uber (the passenger/cab app, not the driver app). Reproducing this analysis with the additional data for other clients is left as an exercise for you.

To get an overview of all the reviews at once, we first combine them into a single large text string and explore that.

In [None]:
# Location of the Uber review dump; fail early if it is missing.
file_path = data_dir / "Uber_us_app_store_reviews.json"
assert file_path.exists()
# JSON text is UTF-8; state the encoding explicitly so decoding does not depend on
# the platform's default encoding (reviews may contain emoji and other non-ASCII text).
with file_path.open("r", encoding="utf-8") as f:
    raw_data = pd.read_json(f)
# Combine every review into one large space-separated string (the file no longer needs to be open).
reviews = " ".join(raw_data["review"].to_list())

In [None]:
# Load the small English spaCy pipeline and bind it to `nlp`.
nlp = spacy.load("en_core_web_sm")
# Alternative pipeline, left disabled:
# nlp = spacy.load("en_core_web_trf")

In [None]:
%time reviews = [rev.strip() for rev in reviews]
len(reviews)

In [None]:
# Build a textacy Corpus from `reviews` using the "en_core_web_sm" pipeline; timed because it is slow.
%time corpus = textacy.Corpus("en_core_web_sm", data=reviews)

In [None]:
# Corpus size at a glance: number of documents, sentences and tokens.
corpus.n_docs, corpus.n_sents, corpus.n_tokens

In [None]:
# Corpus-wide counts keyed by lemma string, with stop words, numbers and punctuation filtered out.
word_counts = corpus.word_counts(
    by="lemma_",
    filter_stops=True,
    filter_nums=True,
    filter_punct=True,
)
# The 25 most frequent lemmas: sorting on the negated count gives descending order,
# and the stable sort keeps tied entries in their original order.
sorted(word_counts.items(), key=lambda item: -item[1])[:25]

Even after disabling the pipeline components we do not need, processing our relatively "small" data takes an extremely long time and a lot of memory. The solution? Batch your data.

In [None]:
# NOTE: this rebinds `nlp`, replacing the "en_core_web_sm" pipeline loaded earlier with a
# plain `English()` language object (no trained model is loaded here).
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer

In [None]:
from collections import Counter
from typing import List


class TextSummary:
    """Token-frequency summary of a collection of text records.

    Every record is tokenized with the notebook-level ``tokenizer`` (defined in an
    earlier cell), the per-record token counts are merged into one corpus-wide
    ``Counter``, and aggregate statistics are derived from that counter.

    Attributes:
        token_summary: Counter mapping token text to its total count over all records.
        token_stats: dict with the keys ``"size"``, ``"vocab_sz"`` and ``"vocab"``.
        vocab: list of the distinct token strings.
        vocab_sz: number of distinct token strings.
        size: total number of tokens counted.
    """

    def __init__(self, records: List[str]):
        """Tokenize ``records`` and compute the corpus-wide summary and statistics."""
        self.token_summary = self.make_summary(self.extract_tokens(records))
        self.token_stats = self.get_corpora_stats(self.token_summary)

    def extract_tokens(self, records: List[str]) -> List[Counter]:
        """Return one ``Counter`` of token strings per record, in input order."""
        # `tokenizer` is the notebook-level tokenizer; `pipe` streams the records through it.
        return [Counter(str(token) for token in doc) for doc in tokenizer.pipe(records)]

    def make_summary(self, record_summaries: List[Counter]) -> Counter:
        """Get a Count Distribution for the entire Corpora

        Sums the per-record counters into a fresh ``Counter``. An empty input
        yields an empty ``Counter``, and none of the input counters is modified.
        """
        # Start from a new accumulator rather than aliasing record_summaries[0]:
        # this handles empty input and leaves the caller's counters untouched.
        corpora_summary = Counter()
        for record in tqdm(record_summaries):
            # `update` only touches the keys in `record`; `+=` would rescan the whole
            # accumulator on every iteration.
            corpora_summary.update(record)

        return corpora_summary

    def get_corpora_stats(self, summary: Counter) -> dict:
        """Derive vocabulary and size statistics from a corpus-wide token counter.

        Stores ``vocab``, ``vocab_sz`` and ``size`` on the instance and returns the
        same values as ``{"size": ..., "vocab_sz": ..., "vocab": ...}``.
        """
        self.vocab = list(summary)
        self.vocab_sz = len(self.vocab)
        self.size = sum(summary.values())
        return {"size": self.size, "vocab_sz": self.vocab_sz, "vocab": self.vocab}

In [None]:
# Summarize token frequencies over `reviews`.
tsm = TextSummary(reviews)

In [None]:
# The 25 most frequent tokens in the summary.
tsm.token_summary.most_common(25)

In [None]:
%%time
for doc in tqdm(nlp.pipe(reviews, batch_size=100, n_process=-1), total=len(reviews)):
    for chunk in doc.noun_chunks:
        print(chunk.text)
# with multiproc, n_process=-1: 53s without 84s

## Topic Modeling with Textacy via Scikit-Learn

In [None]:
# Lazy stream with one inner generator per corpus document, yielding the lemma of
# each term extracted with ngs=1 and ents=True.
tokenized_docs = (
    (extracted.lemma_ for extracted in textacy.extract.terms(spacy_doc, ngs=1, ents=True))
    for spacy_doc in corpus
)

In [None]:
from textacy.representations.vectorizers import Vectorizer
# Vectorizer configuration: term-frequency / inverse-document-frequency weighting,
# row normalization, and document-frequency bounds (min_df / max_df) for terms.
vectorizer = Vectorizer(
    tf_type="linear",
    idf_type="smooth",
    norm="l2",
    min_df=3,
    max_df=0.95,
)

In [None]:
# Fit the vectorizer on the tokenized documents and build the document-term matrix.
# NOTE: `tokenized_docs` is a generator, so it is exhausted after this call; re-running
# this cell requires re-running the cell that creates it first.
doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
doc_term_matrix

In [None]:
from textacy.tm import TopicModel

In [None]:
# Topic model using the "nmf" backend with 20 topics.
model = TopicModel("nmf", n_topics=20)

In [None]:
# Fit the topic model on the document-term matrix.
model.fit(doc_term_matrix)

In [None]:
# Transform the document-term matrix with the fitted model; the result feeds `top_topic_docs` below.
doc_topic_matrix = model.transform(doc_term_matrix)

In [None]:
# Print the top terms of every topic. `topics=-1` selects all topics of the fitted model,
# so this loop stays in sync with the model's `n_topics` instead of repeating the literal 20.
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, topics=-1):
    print("topic", topic_idx, ":", "   ".join(top_terms))

In [None]:
# For topics 0 and 1, print the topic index followed by its top_n=2 documents
# (corpus index and document), each followed by a blank line.
for topic_idx, ranked_doc_ids in model.top_topic_docs(doc_topic_matrix, topics=[0, 1], top_n=2):
    print(topic_idx)
    for doc_id in ranked_doc_ids:
        print(doc_id, corpus[doc_id], end="\n\n")
        print()