In [1]:
import datasets
import torch
import sys
import os
import polars as pl

# Make the parent directory importable so the notebook can load the local
# `src` package. This only extends sys.path; the working directory is unchanged.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

# import local modules
import src.data.switchboard.core as S


  from .autonotebook import tqdm as notebook_tqdm


N_CPUS 10


In [48]:
# Load every raw split once; only the "test" split is used in this cell.
ds = S.load_dataset_raw()

# Columns removed up front: the pipeline defined in this notebook never
# selects any of them.
unused_columns = [
    "swda_filename",
    "length",
    "ptb_basename",
    "ptb_treenumbers",
    "transcript_index",
    "act_tag",
    "damsl_act_tag",
    "pos",
    "trees",
]
df = ds["test"].drop(unused_columns)

In [64]:

def cast_cols(df):
    """Select the working columns and cast them to compact dtypes.

    ``text`` is passed through unchanged. The three index/ID columns are cast
    to small unsigned integers and ``prompt``/``caller`` to categoricals.
    Because this is a ``select``, any column not listed here is dropped.

    Args:
        df: A polars frame containing the six columns named below.

    Returns:
        The frame reduced to ``text`` followed by the cast columns, in the
        order they appear in the mapping.
    """
    # Insertion order of the dict fixes the output column order.
    target_dtypes = {
        "subutterance_index": pl.UInt8,
        "conversation_no": pl.UInt16,
        "utterance_index": pl.UInt16,
        "prompt": pl.Categorical,
        "caller": pl.Categorical,
    }
    cast_exprs = [pl.col(name).cast(dtype) for name, dtype in target_dtypes.items()]
    return df.select([pl.col("text"), *cast_exprs])

def group_utters(df):
    """Collapse sub-utterances into one row per (conversation, utterance).

    Every row's ``text`` is replaced by the space-joined ``text`` of all rows
    sharing its ``conversation_no``/``utterance_index`` pair. Only the row
    whose ``subutterance_index`` equals 1 is then kept, and the
    ``subutterance_index`` column is dropped.

    Args:
        df: A polars frame with ``text``, ``conversation_no``,
            ``utterance_index`` and ``subutterance_index`` columns.

    Returns:
        The frame with one joined-text row per group that has a
        sub-utterance numbered 1.
    """
    utterance_key = ["conversation_no", "utterance_index"]
    joined_text = pl.col("text").str.concat(" ").over(utterance_key)
    is_first_part = pl.col("subutterance_index") == 1
    return (
        df.with_columns(joined_text)
        .filter(is_first_part)
        .drop("subutterance_index")
    )

def clean_text(df):
    """Strip transcript annotation markup from the ``text`` column.

    Three steps are applied to ``text`` in order:

    1. Delete every match of the markup pattern: an opening ``{`` plus the
       word characters that follow it, and the literal sequences `` }``,
       `` [``, `` +``, `` ]``, `` /`` and ``<>``.
    2. Replace ``<Laughter>`` / ``<laughter>`` with ``Haha``.
    3. Strip leading and trailing whitespace.

    Args:
        df: A polars frame with a string ``text`` column.

    Returns:
        The frame with ``text`` replaced by its cleaned version.
    """
    # Raw strings: the pattern contains regex escapes (\{, \w, \[, \+, \])
    # that are invalid *Python* string escapes, so a plain literal triggers
    # an invalid-escape warning. The pattern value itself is unchanged.
    markup_pattern = r"(\{\w*)|( })|( \[)|( \+)|( \])|( /)|(<>)"
    laughter_pattern = r"<Laughter>|<laughter>"
    # NOTE(review): newer polars releases rename `str.strip` to
    # `str.strip_chars` — confirm against the pinned polars version.
    return df.with_columns(
        pl.col("text")
        .str.replace_all(markup_pattern, "")
        .str.replace_all(laughter_pattern, "Haha")
        .str.strip()
    )

# Run the three stages lazily and materialise the result; as the cell's last
# expression, the collected frame is displayed.
(
    df.lazy()
    .pipe(cast_cols)
    .pipe(group_utters)
    .pipe(clean_text)
    .collect()
)

text,conversation_no,utterance_index,prompt,caller
str,u16,u16,cat,cat
"""Okay, uh, co...",2121,1,"""PLEASE DISCUSS...","""A"""
"""Well, it's har...",2121,2,"""PLEASE DISCUSS...","""B"""
"""Um, well, you...",2121,3,"""PLEASE DISCUSS...","""A"""
"""Uh-huh.""",2121,4,"""PLEASE DISCUSS...","""B"""
"""especially aro...",2121,5,"""PLEASE DISCUSS...","""A"""
"""No, I'm more ...",2121,6,"""PLEASE DISCUSS...","""B"""
"""Okay, so ca-...",2121,7,"""PLEASE DISCUSS...","""A"""
"""How about you?...",2121,8,"""PLEASE DISCUSS...","""B"""
"""Can you notice...",2121,9,"""PLEASE DISCUSS...","""A"""
"""Uh-huh.""",2121,10,"""PLEASE DISCUSS...","""B"""


In [5]:
# Load the raw splits again and display the unprocessed `text` column of the
# test split.
rawds = S.load_dataset_raw()
raw_test_split = rawds["test"]
raw_test_split["text"]



['Okay, {F uh, }  /',
 'could you tell me what you think contributes most to, {F uh, } air pollution? /',
 "{D Well, } it's hard to say.  /",
 "{E I mean, } while it's certainly the case that things like automobiles and factories, {F uh, } pollute a lot, {F uh, } if you look at how much pollution is say kicked up by an active volcano, {F uh, } it's certainly less than clear that anything man can do in this sort of scale of things has much effect at all.  /",
 'What do you think? /',
 '{F Um, } {D well, } you talked about, {F uh, } volcanos.  /',
 "I'm not sure how many active volcanos there are now, [ and, + and ] what the amount of material that they do, {F uh, } put into the atmosphere.  /",
 'I think probably the greatest cause is, {F uh, } vehicles,',
 'Uh-huh. /',
 'especially around cities.  /',
 '{F Um, } {F uh, } do you live right in the city itself? /',
 'No,  /',
 "I'm more out in the suburbs,  /",
 '{C but } I certainly work near a city. /',
 'Okay,  /',
 '{C so } [ ca-, +',

In [3]:
from pathlib import Path


# Data directory, expressed relative to the notebook's working directory.
DATA_PATH = Path("../data")
# Display the path of the "test" entry beneath it.
DATA_PATH.joinpath("test")

PosixPath('../data/test')

In [None]:
import polars as pl

# NOTE(review): unfinished cell (it has no execution count). `read_parquet`
# is called with no arguments; polars documents `source` as a required
# parameter, so this presumably raises TypeError as written.
# TODO: supply the parquet file path.
pl.read_parquet()

In [65]:
# Scratch check: an empty dict is falsy, so `or` evaluates to its right-hand
# operand (1).
{} or 1

1