# Build Custom Data
- In this file we will see how to make a custom dataset using pytorh that will pass to the model for training.
- But we can make a custom data for textual data

# Import Packages

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader,Dataset
from torchvision import datasets
import pandas as pd
import numpy as np

# Load dataset
- In this file we will use the emotion classification dataset.
https://www.kaggle.com/datasets/nelgiriyewithana/emotions
- Data contain 6 categories
`Six categories:`

*  `sadness (0`)
*   `joy (1)`
*   `love (2)`
*   `anger (3)`
*  ` fear (4)`
*  ` surprise (5)`

In [2]:
# Download dataset from Kaggle
!kaggle datasets download -d nelgiriyewithana/emotions

Dataset URL: https://www.kaggle.com/datasets/nelgiriyewithana/emotions
License(s): other
Downloading emotions.zip to /content
 57% 9.00M/15.7M [00:00<00:00, 64.4MB/s]
100% 15.7M/15.7M [00:00<00:00, 95.4MB/s]


In [3]:
# Unzip the downloaded data
!unzip emotions.zip

Archive:  emotions.zip
  inflating: text.csv                


In [4]:
df = pd.read_csv("/content/text.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [5]:
df.drop("Unnamed: 0",axis=1,inplace=True)
df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [6]:
df = df.sample(10000)

In [7]:
df.isna().sum()

Unnamed: 0,0
text,0
label,0


In [8]:
df.duplicated().sum()

0

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.duplicated().sum()

0

In [11]:
df.reset_index(drop=True,inplace=True)

In [12]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,3323
0,2878
3,1443
4,1118
2,865
5,373


# Observation
- We can remove the duplicate value.
- Now we can use the transformer tokenizer for data preprocessing now we can load the tokenizer from hugging face.
- we can also convert the data to hugging face dataset b/c model can easy for working with hugging face dataset

In [13]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Test the tokenizer

In [14]:
df['text'][0]

'i feel a little bit sorrowful'

In [15]:
encoded_text = tokenizer(df['text'][0])
encoded_text

{'input_ids': [101, 1045, 2514, 1037, 2210, 2978, 14038, 3993, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
# decode the text
tokenizer.decode(encoded_text["input_ids"])

'[CLS] i feel a little bit sorrowful [SEP]'

In [17]:
# make fun for text processing
def tokenize_text(batch):
    tokenize_text = tokenizer(batch['text'],padding="max_length",max_length=300,truncation=True,return_tensors="pt")
    label = batch['label']
    return {
        "input_ids":tokenize_text["input_ids"].squeeze(),
        "attention_mask":tokenize_text["attention_mask"].squeeze(),
        "label":torch.tensor(label ,dtype=torch.long)
    }

In [18]:
tokenize_text(df.iloc[0])

{'input_ids': tensor([  101,  1045,  2514,  1037,  2210,  2978, 14038,  3993,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

# Observation
- We can test the tokenizer it can working fine now we can make our dataset

In [19]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # 80% train, 20% test

print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 8000
Test set size: 2000


In [20]:
train_df

Unnamed: 0,text,label
9254,i start to feel nostalgic for things that neve...,2
1561,i feel dull right now,0
1670,i feel glamorous rich enough for enriching my ...,1
6087,i believe there is a difference between maskin...,0
6669,i feel like being a whiney bitch but hey thats...,0
...,...,...
5734,i feel so fucked up because im so free and yet...,3
5191,i also feel that it s more gentle to baby s sk...,2
5390,i feel deeply triumphant,1
860,i get older i tend to remember this feeling th...,3


In [211]:
dda= train_df['text']

In [32]:
class CustomDataset(Dataset):
    def __init__(self, data,tokenizer):
        self.df = data.reset_index(drop=True)
        self.text = self.df["text"]
        self.labels = self.df["label"]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        # Get the text and label form dataframe
        text = self.text[idx]
        label = self.labels[idx]

        tokenize_text = self.tokenizer(text,padding="max_length",max_length=300,truncation=True,return_tensors="pt")

        return {
           "input_ids":tokenize_text["input_ids"].squeeze(),
           "attention_mask":tokenize_text["attention_mask"].squeeze(),
           "label":torch.tensor(label,dtype=torch.long)
        }


In [33]:
CustomDataset(data=train_df,tokenizer=tokenizer)

<__main__.CustomDataset at 0x79da56e82770>

In [34]:
# Train data
train_data = CustomDataset(data=train_df,tokenizer=tokenizer)

# Test data
test_data = CustomDataset(data=test_df,tokenizer=tokenizer)

train_df,test_df

(                                                   text  label
 9254  i start to feel nostalgic for things that neve...      2
 1561                              i feel dull right now      0
 1670  i feel glamorous rich enough for enriching my ...      1
 6087  i believe there is a difference between maskin...      0
 6669  i feel like being a whiney bitch but hey thats...      0
 ...                                                 ...    ...
 5734  i feel so fucked up because im so free and yet...      3
 5191  i also feel that it s more gentle to baby s sk...      2
 5390                           i feel deeply triumphant      1
 860   i get older i tend to remember this feeling th...      3
 7270  i never thought i would feel more passionate a...      2
 
 [8000 rows x 2 columns],
                                                    text  label
 6252  i guess we know how betty feels about your bel...      1
 4684  i have panic attacks in water when i feel stre...      3
 1731  i nee

In [35]:
def custom_collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}


# Make a DataLoader
- Using `dataloader` we can efficently load the data.

In [36]:
# train Loader
train_loader = DataLoader(train_data,batch_size=8,shuffle=True,collate_fn=custom_collate_fn)

# test Loader
test_loader = DataLoader(test_data,batch_size=8,shuffle=False,collate_fn=custom_collate_fn)

In [39]:
# dispaly the batch
for batch in train_loader:
    print(batch)
    break

{'input_ids': tensor([[ 101, 1045, 2424,  ...,    0,    0,    0],
        [ 101, 1045, 2318,  ...,    0,    0,    0],
        [ 101, 1045, 2514,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2428,  ...,    0,    0,    0],
        [ 101, 1045, 2572,  ...,    0,    0,    0],
        [ 101, 1045, 2215,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'label': tensor([1, 3, 1, 2, 3, 0, 4, 1])}


# Observation
- Now we can successfully make our custom data and we will directly pass the data to model for training.