In [1]:
import os

# The notebook expects the kernel's working directory to end with 'juxtorpus'.
# If it does not, move up to the parent directory before checking again.
if not os.getcwd().endswith('juxtorpus'):
    os.chdir('../')

# Fail fast and report where the kernel actually is.
assert os.getcwd().endswith('juxtorpus'), (
    f"Working directory should be at juxtorpus. But at {os.getcwd()}"
)
f"Working directory: {os.getcwd()}"

'Working directory: /Users/hcha9747/workspace/juxtorpus'

# File Upload Widget

In [2]:
from juxtorpus.viz.widgets import FileUploadWidget

# Create the upload widget and display it in the cell output.
uploader = FileUploadWidget()
uploader.render()
# Note for the demo: uploading another set of files overwrites the previous upload.

VBox(children=(FileUpload(value={}, accept='.txt, .csv, .xlsx, .zip', description='Upload your files here.\n(.…

In [4]:
# Paths of the files received by the upload widget.
uploader.uploaded()

[PosixPath('/var/folders/jh/mqv5k4x56d98m2twhmk4bjdc0000gp/T/tmpal6lqmmn/elonmusk_tweets.csv'),
 PosixPath('/var/folders/jh/mqv5k4x56d98m2twhmk4bjdc0000gp/T/tmpal6lqmmn/elonmusk_tweets_copy.csv')]

# File Checks

In [5]:
from juxtorpus.checks import FileCheckers, FileSizeCheck, EncodingCheck

# Checks applied to the uploaded files; only files that pass are later handed to the CorpusBuilder.
checks = [
    # presumably flags files larger than max_bytes (1,000,000 bytes) — confirm in FileSizeCheck
    FileSizeCheck(max_bytes=1_000_000),
    # presumably flags files not encoded as 'utf-8'; meaning of min_rows_to_check is not shown here — TODO confirm
    EncodingCheck(expected='utf-8', min_rows_to_check=100)
]
# Bundle the individual checks into a single runner.
file_checks = FileCheckers(checks)

In [6]:
# NOTE(review): pathlib is not referenced in this cell — confirm whether the import is still needed.
import pathlib
# Run the configured checks on the uploaded files.
# For this upload the result displayed is an empty dict, i.e. nothing was flagged.
flagged = file_checks.run(uploader.uploaded())
flagged

{}

In [7]:
# Tally of flagged, passed and total files.
file_checks.summary()

Flagged    0
Passed     2
Total      2
Name: File Check Summary, dtype: UInt16

In [8]:
# Files that passed the checks, alongside the flagged ones (empty for this upload).
file_checks.passed(), file_checks.flagged()

(['/var/folders/jh/mqv5k4x56d98m2twhmk4bjdc0000gp/T/tmpal6lqmmn/elonmusk_tweets.csv',
  '/var/folders/jh/mqv5k4x56d98m2twhmk4bjdc0000gp/T/tmpal6lqmmn/elonmusk_tweets_copy.csv'],
 {})

# Building the Corpus

In [9]:
from juxtorpus.corpus import CorpusBuilder

# Build only from the files that passed the checks.
builder = CorpusBuilder(file_checks.passed())
# Table of the available columns and the ones added so far (none at this point).
builder.show_columns()

Unnamed: 0,All Columns,Added
0,id,
1,created_at,
2,doc,


In [10]:
# Configure how the corpus is read from the files.
builder.set_corpus_type('corpus')
builder.set_text_column('doc')  # use the 'doc' column as the text column
builder.set_nrows(1000)  # the corpus built from this configuration reports 1000 documents
builder.set_sep(',')
# Register the two remaining columns as metas with explicit dtypes.
# NOTE(review): the effect of lazy=True vs lazy=False is not visible in this notebook — confirm in CorpusBuilder.add_meta.
builder.add_meta('created_at', dtype='datetime', lazy=True)
builder.add_meta('id', dtype='int', lazy=False)
# The 'Added' column now lists created_at and id.
builder.show_columns()

Unnamed: 0,All Columns,Added
0,id,created_at
1,created_at,id
2,doc,


In [11]:
# Build the corpus from the builder as configured so far.
corpus = builder.build()

In [12]:
# Metas attached to the corpus, keyed by meta id.
corpus.metas()

frozendict.frozendict({'created_at': <SeriesMeta [Id: created_at]>, 'id': <SeriesMeta [Id: id]>})

In [13]:
# Peek at the first 5 values of the 'created_at' meta.
corpus.metas().get('created_at').preview(5)

0   2017-04-05 14:56:29
1   2017-04-03 20:01:01
2   2017-04-03 16:59:35
3   2017-04-03 16:30:19
4   2017-04-02 06:05:23
Name: created_at, dtype: datetime64[ns]

In [17]:
import pandas as pd
from juxtorpus.corpus import CorpusSlicer

# Keep only the documents whose 'created_at' meta is later than 2017-04-02.
slicer = CorpusSlicer(corpus)
corpus_subset = slicer.filter_by_condition(
    'created_at',
    lambda created_at: created_at > pd.Timestamp('2017-04-02'),
)

# Compare the size of the full corpus with the size of the slice.
(
    "Original: {} documents.".format(len(corpus)),
    "Slice: {} documents".format(len(corpus_subset)),
)

('Original: 1000 documents.', 'Slice: 8 documents')

In [18]:
# Peek at the first 5 values of the slice's 'id' meta.
corpus_subset.metas().get('id').preview(5)

0    849636868052275200
1    848988730585096192
2    848943072423497728
3    848935705057280001
4    848416049573658624
Name: id, dtype: int64

In [19]:
# Derive a new series by negating the slice's 'id' meta.
another_meta = corpus_subset.metas().get('id').apply(
    lambda doc_id: doc_id * -1
)
another_meta

0   -849636868052275200
1   -848988730585096192
2   -848943072423497728
3   -848935705057280001
4   -848416049573658624
5   -848415731502923777
6   -848415356263702528
7   -848398971139629057
Name: id, dtype: int64

In [20]:
from juxtorpus.meta import SeriesMeta

# Wrap the derived series in a SeriesMeta registered under the id 'anothermeta'.
series_meta = SeriesMeta(id_='anothermeta', series=another_meta)

# Attach the new meta to the slice.
corpus_subset.add_meta(series_meta)

# 'anothermeta' is now listed next to 'created_at' and 'id'.
corpus_subset.metas()

frozendict.frozendict({'created_at': <SeriesMeta [Id: created_at]>, 'id': <SeriesMeta [Id: id]>, 'anothermeta': <SeriesMeta [Id: anothermeta]>})

In [21]:
# Slice the slice, this time on the 'anothermeta' meta.
# 'anothermeta' holds the negated ids (all negative in the output of the cell that derived it),
# so the `> 0` condition selects no documents.
# NOTE: this rebinds `slicer`, which previously wrapped the full corpus.
slicer = CorpusSlicer(corpus_subset)
corpus_subset_subset = slicer.filter_by_condition(
    'anothermeta',
    lambda meta_value: meta_value > 0,
)

In [22]:
# Document counts: full corpus, first slice, slice of the slice.
tuple(map(len, (corpus, corpus_subset, corpus_subset_subset)))

(1000, 8, 0)

In [23]:
# Metas of each corpus in turn: full corpus, first slice, slice of the slice.
# In the output, 'anothermeta' appears on both slices but not on the original corpus.
tuple(c.metas() for c in (corpus, corpus_subset, corpus_subset_subset))

(frozendict.frozendict({'created_at': <SeriesMeta [Id: created_at]>, 'id': <SeriesMeta [Id: id]>}),
 frozendict.frozendict({'created_at': <SeriesMeta [Id: created_at]>, 'id': <SeriesMeta [Id: id]>, 'anothermeta': <SeriesMeta [Id: anothermeta]>}),
 frozendict.frozendict({'created_at': <SeriesMeta [Id: created_at]>, 'id': <SeriesMeta [Id: id]>, 'anothermeta': <SeriesMeta [Id: anothermeta]>}))