# Loading a custom dataset
In HuggingFace hub there are many custom datasets are available for different task and 
HuggingFace provides a functionality through which we can load that custom models also

Datasets provides several laoding scripts to handle loacal and remote datasets

* For CSV Format :- load_dataset("csv",data_files="my_file.csv")
* For Text Files:- load_dataset("text",data_files="my_file.txt")
* For Json Files:- load_dataset("json",data_files="my_file.json")
* For Parquet Files:- load_dataset("parquet",data_files="my_file.parquet")

# Loading a csv file code

In [1]:
from datasets import load_dataset
local_csv_dataset = load_dataset("csv",data_files="diabetes.csv",sep=";")
local_csv_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome'],
        num_rows: 768
    })
})

In [7]:
local_csv_dataset['train'].features

{'Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome': Value(dtype='string', id=None)}

# loading the dataset from a remote location like Github

In [None]:
from datasets import load_dataset
dataset_url = "full path of the remote location for that dataset"
remote_csv_file = load_dataset("csv",data_files=dataset_url,sep=";")
remote_csv_file

# Loading the text file from a remote location

In [None]:
from datasets import load_dataset
dataset_url = "full path of the remote location for that dataset"
remote_text_file = load_dataset("text",data_files=dataset_url)
remote_text_file

# Loading the json file 

In [None]:
from datasets import load_dataset
dataset_url = "full path of the remote location for that dataset"
remote_json_file = load_dataset("json",data_files=dataset_url)
remote_json_file['train'][:2]

In [None]:
# or we can load the data by specifying the field in nested jsnon
from datasets import load_dataset
dataset_url = "full path of the remote location for that dataset"
remote_json_file = load_dataset("json",data_files=dataset_url,field="data")
remote_json_file['train'][:2]

# Variours features for data pre-processing in Dataset Library
Opertations are:
* Shuffle and split
* select and filter
* Rename , remove and flatten
* Map

# Shuffle Method

In [9]:
from datasets import load_dataset
squad = load_dataset("squad",split='train')
squad[0]

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [10]:
# we can easily shuffle the whole dataset with Dataset.shuffle()
squad_shuffled = squad.shuffle(seed=666)
squad_shuffled[0]

{'id': '5727cc873acd2414000deca9',
 'title': 'Oklahoma',
 'context': 'Oklahoma is the 20th largest state in the United States, covering an area of 69,898 square miles (181,035 km2), with 68,667 square miles (177847 km2) of land and 1,281 square miles (3,188 km2) of water. It is one of six states on the Frontier Strip and lies partly in the Great Plains near the geographical center of the 48 contiguous states. It is bounded on the east by Arkansas and Missouri, on the north by Kansas, on the northwest by Colorado, on the far west by New Mexico, and on the south and near-west by Texas.',
 'question': 'Where does Oklahoma rank by land area?',
 'answers': {'text': ['20th'], 'answer_start': [16]}}

In [1]:
#Another way to shuffle the data by creating our own shuffed train adn test split with Dataset.train_test_split()
from datasets import load_dataset
squad = load_dataset("squad",split='train')
dataset = squad.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 78839
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8760
    })
})

# select method

In [3]:
# we can return rows according to a list of indices using Dataset.select()
from datasets import load_dataset
squad = load_dataset("squad",split='train')
indices = [0,10,20,40,80]
example = squad.select(indices)
example

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5
})

In [4]:
# we can create a random sample by chaining with Dataset.shuffle()
from datasets import load_dataset
squad = load_dataset("squad",split='train')
sample = squad.shuffle().select(range(5))
sample

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5
})

# Filter Method

In [5]:
# we can use Dataset.filter() to return rows that match the condition
from datasets import load_dataset
squad = load_dataset("squad",split='train')
squad_filtered = squad.filter(lambda x : x["title"].startswith("L"))
squad_filtered[0]

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

{'id': '56de0fef4396321400ee2583',
 'title': 'Lighting',
 'context': 'Lighting or illumination is the deliberate use of light to achieve a practical or aesthetic effect. Lighting includes the use of both artificial light sources like lamps and light fixtures, as well as natural illumination by capturing daylight. Daylighting (using windows, skylights, or light shelves) is sometimes used as the main source of light during daytime in buildings. This can save energy in place of using artificial lighting, which represents a major component of energy consumption in buildings. Proper lighting can enhance task performance, improve the appearance of an area, or have positive psychological effects on occupants.',
 'question': 'What is used a main source of light for a building during the day?',
 'answers': {'text': ['Daylighting'], 'answer_start': [245]}}

# Rename and Remove column methods

In [6]:
# By using the rename_column() and remove_column() methods to transform our columns
from datasets import load_dataset
squad = load_dataset("squad",split='train')
squad.rename_column("context","passages")


Dataset({
    features: ['id', 'title', 'passages', 'question', 'answers'],
    num_rows: 87599
})

In [7]:
# remove_column method
from datasets import load_dataset
squad = load_dataset('squad',split='train')
squad.remove_columns(['id','title'])

Dataset({
    features: ['context', 'question', 'answers'],
    num_rows: 87599
})

# Flatten function
If our dataset has nested columns , then by using Dataset.flatten() we can flatten them1

In [8]:
from datasets import load_dataset
squad = load_dataset("squad",split='train')
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [9]:
squad.flatten()

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
    num_rows: 87599
})

# Map() Methood

In [10]:
#This method is used for applying the functions to the whole dataset
from datasets import load_dataset
squad = load_dataset("squad",split='train')

In [11]:
def lowercase_title(example):
    return {'title':example['title'].lower()}

In [12]:
#applying the map function
squad_lowercase = squad.map(lowercase_title)
#peek at random sample
squad_lowercase.shuffle(seed=42)['title'][:5]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

['egypt',
 'ann_arbor,_michigan',
 'rule_of_law',
 'samurai',
 'group_(mathematics)']

In [13]:
# or we can use this map function on batches also
from transformers import AutoTokenizer

In [14]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

#building a function
def tokenize_title(ex):
    return tokenizer(ex['title'])

In [15]:
squad.map(tokenize_title,batched=True,batch_size=500)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 87599
})

# Datasets + DataFrames = 

In [None]:
# By default , Dataset object will return python obejct when you index it
from datasets import load_dataset
dataset = load_dataset("swiss_judgment_prediction","all",split='train')
dataset[0]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

In [None]:
# convert the output format into the pandas dataframe
dataset.set_format("pandas")
dataset[0]

In [None]:
df= dataset[:]
df.head()

# Saving and Reloading a datasets
we can save our dataset in a different loacation and format
* For Arrow :- Dataset.save_to_disk()
* For CSV:- Dataset.to_csv()
* For JSO:- Dataset.to_json()
* For Parquet:- Dataset.to_aprquet()

In [1]:
# when we download or process a dataset , the data is stored in a cache directory
from datasets import load_dataset

In [2]:
raw_dataset = load_dataset("allocine")
raw_dataset.cache_files

Downloading readme:   0%|          | 0.00/9.31k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.58M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.58M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20000 [00:00<?, ? examples/s]

{'train': [{'filename': 'C:\\Users\\rajkr\\.cache\\huggingface\\datasets\\allocine\\allocine\\0.0.0\\a4654f4896408912913a62ace89614879a549287\\allocine-train.arrow'}],
 'validation': [{'filename': 'C:\\Users\\rajkr\\.cache\\huggingface\\datasets\\allocine\\allocine\\0.0.0\\a4654f4896408912913a62ace89614879a549287\\allocine-validation.arrow'}],
 'test': [{'filename': 'C:\\Users\\rajkr\\.cache\\huggingface\\datasets\\allocine\\allocine\\0.0.0\\a4654f4896408912913a62ace89614879a549287\\allocine-test.arrow'}]}

In [3]:
# saving and realoding the data to the disk

In [4]:
#for daving inside the disk
from datasets import load_dataset
raw_dataset = load_dataset("allocine")
raw_dataset.save_to_disk("my-arrow-datasets")

Saving the dataset (0/1 shards):   0%|          | 0/160000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [5]:
# for loading the dataset from the disk
from datasets import load_from_disk
arrow_datasets_reloaded = load_from_disk("my-arrow-datasets")
arrow_datasets_reloaded

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 160000
    })
    validation: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
})

In [6]:
# saving and loading the dataset in CSV Format

In [7]:
from datasets import load_dataset
raw_dataset = load_dataset("allocine")

for split,dataset in raw_dataset.items():
    dataset.to_csv(f"my-dataset-{split}.csv",index=None)

Creating CSV from Arrow format:   0%|          | 0/160 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

In [8]:
#loading
from datasets import load_dataset
data_files={
    "train":"my-dataset-train.csv",
    "validation":"my-dataset-validation.csv",
    "test":"my-dataset-test.csv",
}
csv_datasets_reloaded = load_dataset("csv",data_files=data_files)
csv_datasets_reloaded

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'label'],
        num_rows: 160000
    })
    validation: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['review', 'label'],
        num_rows: 20000
    })
})