In [1]:
import requests

import textacy
import tarfile
from fastcore.utils import Path
import pandas as pd
Path.ls = lambda x: list(x.iterdir())
from typing import Dict

In [2]:
def extract(tar_url, extract_path='.')->None:
    """Function to extract tar files

    Args:
        tar_url ([type]): [description]
        extract_path (str, optional): [description]. Defaults to '.'.
    """
    tar = tarfile.open(tar_url, 'r')
    for item in tar:
        tar.extract(item, extract_path)
        if item.name.find(".tgz") != -1 or item.name.find(".tar") != -1:
            extract(item.name, "./" + item.name[:item.name.rfind('/')])
    return None

In [3]:
def download_file(url: str)->Path:
    """[summary]

    Args:
        url (str): [description]

    Returns:
        Path: [description]
    """
    r = requests.get(url, allow_redirects=True)
    p = Path(url)
    open(p.name, 'wb').write(r.content)
    filepath = Path(p.name).resolve()
    return filepath 

In [4]:
def get_data(input_path: str, nrows: int = 10):
    p = Path(Path(input_path).stem).resolve()
    if p.is_dir() and p.exists():
        return p
    filepath = download_file(input_path)
    assert filepath.exists()
    extract(filepath)
    return Path(filepath.stem).resolve()

In [5]:
filepath = get_data("https://files.fast.ai/data/aclImdb.tgz"); filepath

Path('/Users/nirant/Documents/experiments/statsdemo/aclImdb')

# Text Analytics

## Get Data from Google Sheets

In [6]:
sheet = "Queries150"
data_url = "https://api.steinhq.com/v1/storages/5f619d225d3cdc44fcd7d4b1"

In [7]:
df = pd.read_json(f"https://api.steinhq.com/v1/storages/5f619d225d3cdc44fcd7d4b1/{sheet}")

In [8]:
df

Unnamed: 0,QueryText
0,Why they asking for restart my app
1,‡§Æ‡•á ‡§Ö‡§™‡§®‡•á balance ‡§ï‡§æ ‡§Ø‡•Ç‡§ú‡§...
2,Until what can I book online?
3,hi
4,I need to buy 23 berths
...,...
144,Hi
145,Good morning
146,Hi
147,Terms


In [9]:
spacy_records = df["QueryText"].apply(lambda x: textacy.make_spacy_doc(x, lang="en"))

In [10]:
min_token_count = 2
long_records = [record for record in spacy_records if len(record) >= min_token_count]
len(long_records)

81

In [11]:
corpus = textacy.Corpus("en", data=list(long_records))

In [12]:
corpus.n_docs, corpus.n_sents, corpus.n_tokens

(81, 92, 613)

In [13]:
# (ngrams=2, as_strings=True, enfilter_nums=False)
# [doc._.to_terms_list(ngrams=2, entities=True, as_strings=True) for doc in corpus]

In [14]:
min_freq_count = 1

freq_dict: Dict = corpus.word_counts(as_strings=True)

freq_dict = {
    k: v
    for k, v in sorted(freq_dict.items(), key=lambda item: item[1], reverse=True)
    if v > min_freq_count
}

pd.DataFrame(freq_dict, index=["Frequency"]).transpose()

Unnamed: 0,Frequency
-PRON-,71
ticket,21
book,20
cancel,11
train,9
refund,5
receive,5
booking,4
bus,4
need,3


In [15]:
import textacy.vsm  # note the import
def make_doc_term_matrix(corpus: textacy.Corpus, ngrams:int = 2, entities:bool = True):
    vectorizer = textacy.vsm.Vectorizer(
        tf_type="linear",
        apply_idf=True,
        idf_type="smooth",
        norm="l2",
        min_df=2,
        max_df=0.95,
    )
    doc_term_matrix = vectorizer.fit_transform(
        (doc._.to_terms_list(ngrams=2, entities=True, as_strings=True) for doc in corpus)
    )
    return doc_term_matrix

doc_term_matrix = make_doc_term_matrix(corpus)
repr(doc_term_matrix)

"<81x4 sparse matrix of type '<class 'numpy.float64'>'\n\twith 11 stored elements in Compressed Sparse Row format>"

In [16]:
import textacy.tm  # note the import

n_topics = 4
model = textacy.tm.TopicModel("nmf", n_topics=n_topics)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
doc_topic_matrix.shape
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=5):
    print("topic", topic_idx, ":", "   ".join(top_terms))

NameError: name 'vectorizer' is not defined