In [1]:
# just pathing - ignore this
import os
while  not 'juxtorpus' in os.listdir():
    os.chdir('../')
assert 'juxtorpus' in os.listdir(), f"Working directory should be at juxtorpus. But at {os.getcwd()}"
f"Working directory: {os.getcwd()}"

'Working directory: /Users/hcha9747/workspace/juxtorpus'

# Australian Text Analytics Platform (ATAP)
The Australian Text Analytics Platform aims to bring computational text analysis to the hands of students and researchers with little coding experience through easy to use widgets and library of functions on jupyter notebooks.

In [2]:
import pandas as pd
from juxtorpus.corpus import Corpus
corpus = Corpus.from_dataframe(pd.read_excel('./notebooks/workshops/amazon_us_reviews_gift_cards.xlsx'), col_doc='review_body', name='gifts')
corpus.meta['product_category'].astype('category')
corpus.meta['marketplace'].astype('category')
corpus.meta['review_date'].astype('datetime', strftime='%Y-%M-%d')
corpus.meta['verified_purchase'].astype('bool')

In [4]:
from juxtorpus.corpus import Corpora
corpora = Corpora([corpus])

In [5]:
corpora.widget()

VBox(children=(Button(description='Upload a Corpus', layout=Layout(width='300px'), style=ButtonStyle()), Label…

# Slicing A Corpus
Slicing is a main feature of `Corpus`. 

> **Every time you slice a corpus, it comes back as a `Corpus`** 

so you can use the same set of functions or analysis you've performed on your subcorpus.

### filter_by_item

In [6]:
verified = corpus.slicer.filter_by_item('verified_purchase', True)
unverified = corpus.slicer.filter_by_item('verified_purchase', False)

In [7]:
print(f"""
Corpus:              {len(corpus)} documents
Verified purchase:   {len(verified)} documents
Unverified purchase: {len(unverified)} documents
""")


Corpus:              149086 documents
Verified purchase:   136042 documents
Unverified purchase: 13044 documents



### filter_by_range

In [8]:
min_2_stars = corpus.slicer.filter_by_range('star_rating', min_=2)
min_3_stars = corpus.slicer.filter_by_range('star_rating', min_=3)
min_2_max_4_stars = corpus.slicer.filter_by_range('star_rating', min_=2, max_=4)
max_5_stars = corpus.slicer.filter_by_range('star_rating', max_=5)

In [9]:
print(f"""
Corpus:          {len(corpus)} documents
>2 stars:        {len(min_2_stars)} documents
>3 stars:        {len(min_3_stars)} documents
2< stars <4:     {len(min_2_max_4_stars)} documents
>5 stars:        {len(max_5_stars)} documents
""")


Corpus:          149086 documents
>2 stars:        144293 documents
>3 stars:        142724 documents
2< stars <4:     4725 documents
>5 stars:        19377 documents



### filter_by_datetime

In [10]:
corpus.meta['review_date'].series.min(), corpus.meta['review_date'].series.max()

(Timestamp('2004-01-10 00:11:00'), Timestamp('2015-01-31 00:08:00'))

In [11]:
year_2015_a = corpus.slicer.filter_by_datetime('review_date', start='2015-01-01', end='2016-01-01')
year_2015_b = corpus.slicer.filter_by_datetime('review_date', start='2015', end='2016', strftime='%Y')
pd.concat([year_2015_a.summary(), year_2015_b.summary()], axis=1)

Unnamed: 0,Unnamed: 1,Unnamed: 2
Name,inventive-kiwi,magic-asp
Parent,gifts,gifts
Corpus Type,Corpus,Corpus
Number of Documents,44000,44000
Number of Total Words,467217,467217
Size of Vocabulary,10571,10571
mean Words per Document,10,10
min Words per Document,0,0
max Words per Document,846,846
metas,"marketplace, customer_id, review_id, product_i...","marketplace, customer_id, review_id, product_i..."


### filter_by_regex

In [12]:
amazon_egifts = corpus.slicer.filter_by_regex('product_title', 'Amazon.com eGift Card')
amazon_any = corpus.slicer.filter_by_regex('product_title', r'amazon.*', ignore_case=True)

In [13]:
print(f"""
Corpus:            {len(corpus)} documents
Amazon (eGifts):   {len(amazon_egifts)} documents
Amazon (Any):      {len(amazon_any)} documents
""")


Corpus:            149086 documents
Amazon (eGifts):   37108 documents
Amazon (Any):      149008 documents



### filter_by_condition

In [14]:
def even_number(id_):
    return id_ % 2 == 0
def odd_number(id_):
    return id_ % 2 == 1

even_customer_id = corpus.slicer.filter_by_condition('customer_id', even_number)
odd_customer_id = corpus.slicer.filter_by_condition('customer_id', odd_number)

In [15]:
print(f"""
Corpus:                      {len(corpus)} documents
customer_id (even number):   {len(even_customer_id)} documents
customer_id (odd number):    {len(odd_customer_id)} documents
""")


Corpus:                      149086 documents
customer_id (even number):   74503 documents
customer_id (odd number):    74583 documents



### Slice with Widgets

In [16]:
corpora.widget()

VBox(children=(Button(description='Upload a Corpus', layout=Layout(width='300px'), style=ButtonStyle()), Label…

# Corpus-subcorpus Relationship

## Memory Footprint (Bit of computer science)

Every subcorpus just references its parent corpus following all to way to the root corpus (the original corpus you built)

This allows us to keep the memory footprint of the notebook small so that you can be liberal with your dataset size.

In [18]:
subsubcorpus = corpora[corpora.items()[-1]]
print(f"""
Subsubcorpus:  {subsubcorpus.name}
Parent:        {subsubcorpus.parent.name}
Root:          {subsubcorpus.find_root().name}
""")


Subsubcorpus:  flawless-parakeet
Parent:        gifts
Root:          gifts



## Detaching a Corpus

You can also detach a subcorpus from its tree.
Why would you want to do this:
1. sometimes you have to - part of the design. (e.g. creating a custom DTM from a subcorpus)
2. mostly revolves around having to do performing operations on a subcorpus to change its state. Like processing it with spacy for content slicing.

In [20]:
detached = subsubcorpus.detached()

In [21]:
pd.concat([detached.summary(), subsubcorpus.summary()], axis=1)

AttributeError: 'NoneType' object has no attribute 'name'

In [22]:
subsubcorpus.parent.name

'gifts'

# Document Content Slicing

Sometimes you want to slice the corpus based on the ***content*** of your documents.
e.g.
1. Give me all texts that contain **at least 3 mentions of this word**.
2. Give me all texts that contain **this word AND this word**
3. Give me all texts that contain **this word in the first 5 sentences** of the text
4. Give me all the texts that **contain at least X tokens or at least X utterances (conversation length)**

In [1]:
# just pathing - ignore this
import os
while  not 'juxtorpus' in os.listdir():
    os.chdir('../')
assert 'juxtorpus' in os.listdir(), f"Working directory should be at juxtorpus. But at {os.getcwd()}"
f"Working directory: {os.getcwd()}"

import pandas as pd
from juxtorpus.corpus import Corpus
corpus = Corpus.from_dataframe(pd.read_excel('./notebooks/workshops/amazon_us_reviews_gift_cards.xlsx', nrows=10_000), col_doc='review_body', name='gifts')
corpus.meta['product_category'].astype('category')
corpus.meta['marketplace'].astype('category')
corpus.meta['review_date'].astype('datetime', strftime='%Y-%M-%d')
corpus.meta['verified_purchase'].astype('bool')

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')
# nlp = spacy.load('pymusa')

In [3]:
from juxtorpus.corpus.processors import process

scorpus = process(corpus, nlp=nlp)

Spacy Corpus:   0%|          | 0/10000 [00:00<?, ?it/s]

In [4]:
# Give me all texts that contain at least 3 mentions of this word
from spacy.matcher import Matcher

word = 'opinion'
matcher = Matcher(scorpus.nlp.vocab)
matcher.add('has_word', patterns=[
    [{'ORTH': f'{word}'}]
])
scorpus_word = scorpus.slicer.filter_by_matcher(matcher, min_=1)
len(scorpus_word)

5

In [5]:
# compound nouns - using parts of speech
compound_nouns = Matcher(scorpus.nlp.vocab)
compound_nouns.add('cn', patterns = [
    [{'POS': 'NOUN', "OP": "+"}]
])
scorpus_cn = scorpus.slicer.filter_by_matcher(compound_nouns)
scorpus_cn.docs().apply(lambda d: d.text)

0                  Great birthday gift for a young adult.
1       It's an Amazon gift card and with over 9823983...
4       I can't believe how quickly Amazon can get the...
5       Perfect!  Nice & easy plus everyone loves them...
6                                                excelent
                              ...                        
9992    These are my last resort when I really missed ...
9994    This was provided by my daughter as a birthday...
9997    I love how easy it is to add money to an accou...
9998                          Nice tin with gift card....
9999                                      I don,t like it
Name: document, Length: 6965, dtype: object

In [6]:
scorpus_cn.meta['_matched'].series

0                                    birthday gift, adult
1                                        gift card, items
4                                                   hands
5                                                    gift
6                                                excelent
                              ...                        
9992                                               resort
9994    daughter, birthday gift, usefulness, holiday, ...
9997                               money, account, things
9998                                       tin, gift card
9999                                                    t
Length: 6965, dtype: object

## Not sure what to do with below logic

In [None]:
scorpus.meta

In [None]:
scorpus.meta['ner'].head(n=5)