In [None]:
# Please run this cell first. (Click on this cell and click the play button ▶️ near the top of the notebook.)
# It sets up the notebook by moving the working directory up, one level at a time,
# until the directory that contains `juxtorpus` is reached.
import os

start_dir = os.getcwd()
while 'juxtorpus' not in os.listdir(os.getcwd()):
    before = os.getcwd()
    os.chdir("../")
    if os.getcwd() == before:
        # At the filesystem root ".." is the root itself, so the directory no longer
        # changes. Without this check the loop would spin forever when the notebook is
        # started outside the project. Restore the starting directory and let the
        # assert below report the problem.
        os.chdir(start_dir)
        break
assert 'juxtorpus' in os.listdir(os.getcwd()), "Current working directory not at project root"
"Successful. Please continue"

# Corpus Slicer
The Corpus Slicer is a widget that allows you to upload and build a Corpus.

A Corpus requires a name for identification (one is randomly assigned if not provided) before you build it.

You can then slice the corpus as needed using the slicing widget. Each sliced subcorpus will also require a name.

## Manual Instructions

#### 1. Run the following cell and a widget will pop up.

#### 2. Upload your Corpus, select it and click "Confirm".
- Supported formats: .csv, .xlsx and .zip.<br>
(For .zip, the archive must contain only .txt files, and only the file name will be automatically created as meta data.)

#### 3. Tick One column as your 'document' column.
#### 4. Untick the meta data columns that you don't want.
#### 5. Select a data type for your meta data column. (This affects the subsequent slicing widget available)
- You may also keep the data type as 'auto' and it'll try its best to infer what data type it is.
- If you have a 'datetime' column, please select the data type: 'datetime'. Inferring data types usually doesn't work well with datetimes.
#### 6. Type in a name and Click "Confirm" to build your Corpus.
#### 7. Click the "Hide" button at the Top to hide the builder.
#### 8. Tick the Corpus you've just created and the Slicer widget will pop down.
#### 9. Select your meta data column on the right and conditions in the middle then click on "Add Operation" on the right.
- You can add multiple operations on different meta data columns.
#### 10. Tick/Untick the operations you would like to perform on your Corpus.
#### 11. Enter a name for your Subcorpus and click on "Slice". (A new Corpus with your entered name will appear above the slicer widget)
#### 12. Repeat from step 8 as needed.
#### 13. Click on 'Download' in the rightmost column for the Corpus you want to download.

## 🎥 Video Instructions
👉 Click on the cell below and run the cell by clicking on the ▶️ button at the top of the notebook.

‼ Before you do, please make sure you've also run the very first cell.

In [None]:
import ipywidgets as ipw

# The demo recording is addressed relative to the current working directory,
# which is why the very first cell of the notebook has to be run beforehand.
path = "./assets/slicer-demo.mov"

# Build the video widget, then leave it as the cell's last expression so it is displayed.
demo_video = ipw.Video.from_file(path)
demo_video

## Run this!

In [None]:
from juxtorpus.corpus.corpora import Corpora

# `corpora` is the object the rest of the notebook works with: later cells look
# corpora up by name (`corpora[...]`) and register new ones (`corpora.add(...)`).
corpora = Corpora()

# Show the upload/build/slice widget as the output of this cell.
corpora_widget = corpora.widget()
corpora_widget

## Content Slicing

The first cell is the only cell you need to modify. Once you're done, run the next cell.

You can reconfigure the first cell and run the sequence again.

In [None]:
# Configuration for content slicing: edit the values in this cell, then run the next cell.
corpus_name = 'foamy-saluki'                      # replace with the name of the corpus you want to slice (keep the single quotes)
sliced_corpus_name = 'sliced'                     # replace with the name you want to assign to the sliced corpus
# Token patterns for spaCy's Matcher (the next cell passes this list to `matcher.add`).
# Each inner list is one pattern and each dict inside it describes one token, in order.
# The patterns are alternatives: any one of them counts as a match.
#   'ORTH'  compares the exact text of the token.
#   'LOWER' compares the lowercased text of the token, so write its value in lowercase.
patterns = [
    [{'ORTH': 'Exact-match'}],                                               # match this token exactly as written
    [{'LOWER': 'case-insensitive'}],                                         # OR this token, ignoring upper/lower case
    [{'LOWER': {'REGEX': r'(regex pattern)'}}],                              # OR any token whose lowercased text matches this regex
    [{'LOWER': 'match'}, {'LOWER': 'in'}, {'LOWER': 'sequence'}],            # OR these tokens in sequence (multiword)
    [{'LOWER': 'match'}, {'ORTH': 'In-SeQuenCe'}],                           # OR this sequence, mixing both kinds of token test
]

In [None]:
import spacy
from spacy.matcher import Matcher
from juxtorpus.corpus.processors import process
from juxtorpus.corpus import SpacyCorpus

# Fetch the corpus named in the configuration cell.
corpus = corpora[corpus_name]

# The spaCy pipeline supplies the vocabulary for the matcher and is used to
# process the corpus when it is not a SpacyCorpus yet.
nlp = spacy.load('en_core_web_sm')
corpus = corpus if isinstance(corpus, SpacyCorpus) else process(corpus, nlp=nlp)

# Register every pattern from the configuration cell under a single key.
matcher = Matcher(nlp.vocab)
matcher.add('patterns', patterns)

# Slice with the matcher, name the result, and register it next to the other corpora.
sliced = corpus.slicer.filter_by_matcher(matcher)
sliced.name = sliced_corpus_name
corpora.add(sliced)

# Re-display the widget so the new corpus shows up in this cell's output.
corpora.widget()

In [None]:
# Run this cell to examine the documents in the corpus you have just sliced.
#
# It iterates `sliced`, the corpus object created by the slicing cell above, instead of
# looking the corpus up by `sliced_corpus_name`. If a corpus with that name already
# existed (for example because you ran the sequence a second time), the new corpus was
# added under a different name and you saw a warning such as
# 'sliced already exists. It renamed to sliced_'. A lookup by the name you asked for
# would then not reach the corpus you just created.
#
# To inspect any other corpus, replace `sliced` with corpora['<its name>'].

for i, doc in enumerate(sliced.docs()):
    print(f"[{i}] {doc}")
    print("="*100)