In [None]:
import json
import random
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd
import spacy
import textacy
from pydantic import BaseModel
from spacy.lang.en import English
from textacy import extract
from textacy.representations.vectorizers import Vectorizer
from textacy.similarity.tokens import jaccard
from textacy.tm import TopicModel
from tqdm.notebook import tqdm

%load_ext autoreload
%autoreload 2
def _list_dir(self):
    """Return the direct children of this directory as a list of paths."""
    return list(self.iterdir())


# Convenience shortcut so any Path can be listed with `.ls()`.
Path.ls = _list_dir

The main contribution here is a programmatic way to find labels for the topics of a topic model and then classify documents into those topics, while still retaining some degree of human intervention.

In [None]:
# Resolve the data folder (relative to the notebook's working directory) to an absolute path.
data_dir = Path("../data").resolve()
# Fail early if the folder is missing rather than on the first file read.
assert data_dir.exists()
# List the folder contents; `.ls()` is the helper patched onto Path in the setup cell.
data_dir.ls()

Note:

Here we will explore App Reviews for just one app: Uber (Passenger/Cab, not the Driver). The additional data to reproduce this for other clients is left as an exercise for you.

To get an overview of all the reviews, we combine them into a single corpus and explore it.

In [None]:
# Location of the Uber (US App Store) review dump inside the data folder.
file_path = data_dir / "Uber_us_app_store_reviews.json"
assert file_path.exists(), f"Missing reviews file: {file_path}"

# Open with an explicit encoding: the platform default is not UTF-8 everywhere,
# and review text routinely contains non-ASCII characters.
with file_path.open("r", encoding="utf-8") as f:
    raw_data = pd.read_json(f)

# Only the review text is needed downstream; the file handle can already be closed here.
reviews = raw_data["review"].to_list()

In [None]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
# Alternative (transformer-based) pipeline, currently disabled:
# nlp = spacy.load("en_core_web_trf")

In [None]:
%time reviews = [rev.strip() for rev in reviews]
len(reviews)

In [None]:
%time corpus = textacy.Corpus("en_core_web_sm", data=reviews)

In [None]:
# Corpus size at a glance: number of documents, sentences and tokens.
corpus.n_docs, corpus.n_sents, corpus.n_tokens

In [None]:
# Count lemmas across the whole corpus, ignoring stop words, numbers and punctuation.
word_counts = corpus.word_counts(
    by="lemma_",
    filter_stops=True,
    filter_nums=True,
    filter_punct=True,
)
# Show the 25 most frequent lemmas, highest count first.
sorted(
    word_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:25]

# TODO
- [ ] Topic Modeling
- [ ] Noun Chunks
- [ ] Verb
- [ ] SVO Triples

## Topic Modeling with Textacy via Scikit-Learn

In [None]:
# One list of lemmas (unigrams plus named entities) per document.
# Materialised as lists rather than nested generators: a generator is exhausted after a
# single pass, so re-running the vectorizer cell would otherwise see no documents at all.
tokenized_docs = [
    [term.lemma_ for term in extract.terms(doc, ngs=1, ents=True)]
    for doc in corpus
]

In [None]:
# TF-IDF style weighting with L2 normalisation; min_df / max_df drop very rare and very common terms.
vectorizer = Vectorizer(
    tf_type="linear",
    idf_type="smooth",
    norm="l2",
    min_df=3,
    max_df=0.95,
)

In [None]:
# Learn the vocabulary from the tokenized documents and build the document-term matrix.
doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
# Display the matrix summary.
doc_term_matrix

In [None]:
# Number of topics to extract; reused when building the model and when listing topic terms.
n_topics = 10

In [None]:
# Topic model using the "nmf" algorithm with `n_topics` components.
model = TopicModel("nmf", n_topics=n_topics)

In [None]:
# Fit the topic model on the document-term matrix.
model.fit(doc_term_matrix)

In [None]:
# Project every document onto the learned topics (one row per document, indexed by topic below).
doc_topic_matrix = model.transform(doc_term_matrix)

In [None]:
class Topic(BaseModel):
    """One topic of the fitted topic model, plus the material used to label it.

    Every field carries an explicit default so that ``Topic(terms=...)`` is valid
    under both pydantic v1 and v2 (v2 treats ``Optional[X]`` without a default as
    a required field).
    """

    # Label chosen for the topic; stays None until one is assigned.
    title: Optional[str] = None
    # Top terms of the topic, as strings.
    terms: Optional[List[str]] = None
    # (document index, weight) pairs of the topic's top documents.
    top_docs_idx: Optional[List[Tuple[int, float]]] = None
    # Key terms collected from the topic's top documents.
    keyterms: Optional[List[str]] = []
    # Candidate label phrases collected from the topic's top documents.
    linguistic_terms: Optional[List[str]] = []


# One Topic per model topic, in topic-index order; filled in by the cells below.
lst_topics = []

In [None]:
# Start from an empty list every time so re-running this cell does not append a
# second copy of every topic (which would break the index-based lookups below).
lst_topics = []
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, topics=range(n_topics)):
    print(f"Topic #{topic_idx}:", "\t".join(top_terms))
    # Topics are appended in index order, so lst_topics[topic_idx] is topic `topic_idx`.
    lst_topics.append(Topic(terms=[str(term) for term in top_terms]))

In [None]:
# Top documents for each topic: keep the 20 highest-ranked documents together with their weights.
for topic_idx, top_docs_idx in model.top_topic_docs(
    doc_topic_matrix,
    weights=True,
    top_n=20,
):
    lst_topics[topic_idx].top_docs_idx = top_docs_idx

In [None]:
# Noun Extraction

In [None]:
# topic

In [None]:
# Collect candidate label phrases and key terms from each topic's top documents.
# Results are built in fresh containers and assigned once per topic, so re-running
# the cell replaces the previous results instead of piling duplicates on top of them.
for topic in lst_topics:
    candidate_phrases = set()  # a set removes duplicates across documents
    keyterms = []
    for doc_idx, weight in topic.top_docs_idx:
        doc = corpus[doc_idx]
        # 3- to 5-grams, noun chunks and named entities, as plain strings.
        candidate_phrases.update(
            str(term) for term in extract.terms(doc, ngs=[3, 4, 5], ncs=True, ents=True)
        )
        # YAKE yields pairs; only the first element (the term) is kept.
        keyterms.extend(pair[0] for pair in extract.keyterms.yake(doc, ngrams=[3]))
    # Sort so the candidate order (and any tie-breaking based on it) is reproducible.
    topic.linguistic_terms = sorted(candidate_phrases)
    topic.keyterms = keyterms

In [None]:
# Inspect the candidate phrases of the last topic explicitly, rather than relying on
# the `topic` loop variable left over from the previous cell.
lst_topics[-1].linguistic_terms

In [None]:
# Choose as title the candidate phrase whose words overlap most (Jaccard similarity)
# with the topic's top terms.
for topic in tqdm(lst_topics):
    best_title = None
    max_jaccard = 0
    for candidate in topic.linguistic_terms:
        jaccard_score = jaccard(candidate.split(), topic.terms)
        # Strict ">" keeps the first candidate among equally good ones.
        if jaccard_score > max_jaccard:
            best_title = candidate
            max_jaccard = jaccard_score  # reuse the score instead of recomputing it
    # Assign once, so a re-run never leaves a stale title when nothing overlaps.
    topic.title = best_title

In [None]:
# Show each topic's chosen title next to its top terms for a manual sanity check.
for topic in lst_topics:
    print(topic.title, topic.terms)

# Labels

Happy with the app [Skipped]

|Index|Label Title| Label Description|
|---|:---|:---|
|1|payment|Payment Methods|
|2|cancel_fees|Cancellation Fee|
|3|price|Price|
|4|pickup|Pickup|
|5|pool|Pool|
|6|support|Customer Support|
|7|advance_ride|Advance Ride Booking|