In [1]:
import pandas as pd
import json
import os
import time
from tqdm import tqdm
from typing import List, Dict, Any
import warnings
from datasets import load_dataset
import random
from loguru import logger

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# `!export ...` runs in a throwaway subshell, so the variable never reaches
# this kernel. Set it on the kernel process itself instead.
# NOTE(review): the Hugging Face libraries appear to resolve their cache
# locations at import time, so this probably has to execute before
# `from datasets import load_dataset` to take effect - confirm, and if so
# move this cell above the imports.
os.environ["HF_HOME"] = "/scratch/users/k24053411"

In [3]:
# Download (or read from the local cache) only the validation split of BESSTIE.
print("Loading Besstie dataset from Hugging Face...")
dataset = load_dataset("unswnlporg/BESSTIE", split="validation")

# Report the dataset that was actually loaded, together with its size and columns.
print(f"Loaded Besstie dataset with {len(dataset)} total samples")
print(f"Dataset columns: {list(dataset.features.keys())}")

Loading Besstie dataset from Hugging Face...


Generating train split: 100%|██████████| 13349/13349 [00:00<00:00, 119038.51 examples/s]
Generating validation split: 100%|██████████| 1825/1825 [00:00<00:00, 101646.68 examples/s]

Loaded Dolly dataset with 1825 total samples
Dataset columns: ['source', 'variety', 'task', 'text', 'label']





In [4]:
# Peek at the first five rows; a slice comes back as {column name: list of values}.
dataset[:5]

{'source': ['Reddit', 'Reddit', 'Reddit', 'Reddit', 'Reddit'],
 'variety': ['en-AU', 'en-AU', 'en-AU', 'en-AU', 'en-AU'],
 'task': ['Sentiment', 'Sentiment', 'Sentiment', 'Sentiment', 'Sentiment'],
 'text': ["Spaghetti bolognaise - $6ish for 500g of mince, $2-3 for a jar of sauce, 50c worth of pasta, serves 4 people ($2,50 per serving). If you buy more mince in bulk and more pasta in bulk and make your own sauce it can be as little as $1 per serving. \nBuy cheap cuts of meat, cut it up yourself, fry it up with some soy sauce, onions, garlic, a few handfuls of cooked rice,  some chopped carrots and beans, crack an egg in towards the end, you've got yourself a quick and easy fried-rice analogue with some more protein and decent flavour, should cost around $2 per person.",
  "Australia is in a tough spot with inflation lingering, so what Governments ( all 3 levels) do in terms of expenditure have a sharp edge.\nThe lifters generate the wealth.\nWe leaners need to be careful with demands f

In [5]:
def copy_leaving_out(dictionary, key):
    """Return a shallow copy of ``dictionary`` with ``key`` removed.

    The input mapping is left untouched.

    Args:
        dictionary: Mapping to copy; must provide ``.copy()``.
        key: Key to drop from the copy.

    Returns:
        The shallow copy without ``key``.

    Raises:
        KeyError: If ``key`` is not present in ``dictionary``.
    """
    trimmed = dictionary.copy()
    # pop() without a default raises KeyError for a missing key.
    trimmed.pop(key)
    return trimmed

In [6]:
# First grouping level: bucket every record under its "task" value,
# producing {task: [record without the "task" key, ...]}.
processed_data = {}

for record in dataset:
    task = record["task"]

    # The task is now expressed by the bucket, so drop it from the stored copy.
    trimmed = copy_leaving_out(record, "task")

    # setdefault creates the bucket the first time a task is seen.
    processed_data.setdefault(task, []).append(trimmed)

In [7]:
# Second grouping level: within each task, bucket the records by "source",
# turning processed_data[task] from a list into {source: [records]}.
for task in processed_data.keys():

    # Deliberately not named `dataset`: rebinding that name would overwrite the
    # Hugging Face dataset loaded earlier and break re-runs of the cells using it.
    task_records = processed_data[task]

    # Already grouped (this cell ran before): skip, so re-running is a no-op
    # instead of failing on the nested dict.
    if isinstance(task_records, dict):
        continue

    source_dict = {}

    for data in task_records:
        source = data["source"]

        # Store the record without the key it is now filed under.
        data_filtered = copy_leaving_out(data, "source")

        source_dict.setdefault(source, []).append(data_filtered)

    # Replacing the value of an existing key does not resize the dict, so it
    # is safe while iterating over its keys.
    processed_data[task] = source_dict

In [8]:
# Third grouping level: within each task/source, bucket the records by
# "variety", giving processed_data[task][source][variety] -> [records].
for task in processed_data.keys():

    # This is the very object stored in processed_data[task]; updating it in
    # place is enough, no re-assignment into processed_data is required.
    source_dict = processed_data[task]

    for source in source_dict.keys():

        # Deliberately not named `dataset`: rebinding that name would overwrite
        # the Hugging Face dataset loaded earlier in the notebook.
        source_records = source_dict[source]

        # Already grouped (this cell ran before): skip, so re-running is a
        # no-op instead of failing on the nested dict.
        if isinstance(source_records, dict):
            continue

        variety_dict = {}

        for data in source_records:
            variety = data["variety"]

            # Store the record without the key it is now filed under.
            data_filtered = copy_leaving_out(data, "variety")

            variety_dict.setdefault(variety, []).append(data_filtered)

        # Replacing the value of an existing key does not resize the dict, so
        # it is safe while iterating over its keys.
        source_dict[source] = variety_dict

In [11]:
# Spot-check one record at the deepest level: task -> source -> variety -> list index.
processed_data['Sarcasm']['Google']['en-IN'][0]

{'text': "You get some really good food here. It's overpriced for sure but the food is really good. We dined in a very busy and jam packed time, yet I ca n't fault the quality of food. The butter chicken was really good as were the garlic naan and rumali roti. The mutton was amazing. The cuts were beautifully cooked and you could suck the marrow from it. Lastly, the mocktail was fresh, refreshing and served as a great palate cleanser for the rich meal. I have no issues with the quantity as well. This place is well worth the money.",
 'label': 0}

In [12]:
# Show only the first few records; displaying the whole list floods the
# notebook output and bloats the saved file.
processed_data['Sentiment']['Reddit']['en-IN'][:5]

[{'text': "First question  Why??\nSecond question Do you want to die?\nThe adaptar has capacitors and using petrol will cause a spark and then . So don't",
  'label': 0},
 {'text': 'I usually go for biking but its raining. We moved to a new place recently and there are a few things to unpack and set, I am too lazy to do that myself. My partner is busy playing video games.',
  'label': 0},
 {'text': "They have made movies in the past, no one saw them, it was about football in a village, can't remember the name ... But I think TVF makes good content with grounded stories which doesn't translate to good films, usually good films have different genres, let's see if they can capture that space.",
  'label': 1},
 {'text': "Wasn't there accusations he killed sunanda puskar", 'label': 0},
 {'text': "It's not got a lot to do with religion. North eastern states, especially when excluding Assam, have mostly always been ruled by whoever is in power at the centre (or their allies or coalition partn

In [13]:
# Persist the nested task -> source -> variety mapping as a single JSON file.
# `json` and `os` come from the import cell at the top of the notebook.
output_path = 'data/instruction/besstie/test.json'

# open(..., 'w') does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w') as f:
    json.dump(processed_data, f)