# Step 1: Import data set

    - You can download the Tweet emotion intensity dataset from Hugging Face into your environment.

    - Import the file and print out the first few lines of it.

The following code snippet will help you load the dataset:

In [1]:
import pandas as pd

splits = {'train': 'train.csv', 'test': 'test.csv'}
data = pd.read_csv("hf://datasets/stepp1/tweet_emotion_intensity/" + splits["train"])
test_data = pd.read_csv("hf://datasets/stepp1/tweet_emotion_intensity/" + splits["test"])
data.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,tweet,class,sentiment_intensity,class_intensity,labels
0,40815,Loved @Bethenny independence msg on @WendyWill...,fear,low,fear_low,4
1,10128,@mark_slifer actually maybe we were supposed t...,sadness,high,sadness_high,9
2,40476,I thought the nausea and headaches had passed ...,fear,medium,fear_medium,5
3,20813,"Anger, resentment, and hatred are the destroye...",anger,high,anger_high,0
4,40796,new tires &amp; an alarm system on my car. fwm...,fear,low,fear_low,4


# step 2: clean data:
This step is cleaning the raw text data to remove unnecessary characters, such as URLs, special symbols, or HTML tags, and to normalize the text by converting it to lowercase. 

Make a new column called cleanedText that is equal to the data in the Tweet column that has had this cleanedText function applied to it.

In [2]:
import re

def clean_data(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text) # Remove URLs from the text
    text = re.sub(r"<.*?>", '', text) # Remove any HTML tags from the text
    text = re.sub(r"[^\w\s]", '',text) # Remove punctuation, keep only words and spaces
    return text

data['cleaned_text'] = data['tweet'].apply(clean_data)

data.head()


Unnamed: 0,id,tweet,class,sentiment_intensity,class_intensity,labels,cleaned_text
0,40815,Loved @Bethenny independence msg on @WendyWill...,fear,low,fear_low,4,loved bethenny independence msg on wendywillia...
1,10128,@mark_slifer actually maybe we were supposed t...,sadness,high,sadness_high,9,mark_slifer actually maybe we were supposed to...
2,40476,I thought the nausea and headaches had passed ...,fear,medium,fear_medium,5,i thought the nausea and headaches had passed ...
3,20813,"Anger, resentment, and hatred are the destroye...",anger,high,anger_high,0,anger resentment and hatred are the destroyer ...
4,40796,new tires &amp; an alarm system on my car. fwm...,fear,low,fear_low,4,new tires amp an alarm system on my car fwm now


# step 3: handle missing data
We now handle missing or incomplete data in your dataset. You can either remove rows with missing data or fill them with placeholders, ensuring the dataset is complete for training. 

In [3]:
print(data.isnull().sum())

id                     0
tweet                  0
class                  0
sentiment_intensity    0
class_intensity        0
labels                 0
cleaned_text           0
dtype: int64


# step 4: tokenizer
After cleaning the text, we tokenize it. Tokenization splits the text into individual words or subwords that can be used by the model. We will use the BERT tokenizer to ensure compatibility with the Brie-trained model you are fine-tuning. 

In [5]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

token = tokenizer(
    data['cleaned_text'].tolist(), 
    padding=True,
    truncation=True, 
    max_length=128, 
    return_tensors='pt'
)

print(token['input_ids'][:5])

tensor([[  101,  3866,  7014,  2368,  4890,  4336,  5796,  2290,  2006, 12815,
         29602,  6632,  5244,  2022,  3407, 23713, 16829,  2306,  4426, 23713,
         13433, 28032,  7730,  2097, 19311,  2000,  2017,  3407,  2981,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  2928,  1035, 22889, 23780,  2941,  2672,  2057,  2020,  4011,
          2000,  3280,  1998,  2026, 13445,  5552,  2256,  3268, 27451,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2245,  1996, 19029,  1998, 14978,  2015,  2018,  2979,
          2021,  8840,  2140,  1045,  2514,  9643,  2651,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     

We load the BERT tokenizer from the Transformers library from HuggingFace, and tokenize the cleanedText that we defined earlier. Then, we can print the tokens of the input IDs. The words have been converted into these numbered tokens. 

In certain cases, especially when data is limited, data augmentation techniques can be applied to generate new training examples by modifying the original dataset.

    Paraphrasing: rewriting sentences in different ways while preserving the meaning

    Backtranslation: translating text into another language and back again to create variation

    Synonym replacement: replacing certain words in the text with their synonyms

## Augmentation

The following example demonstrates how to implement synonym replacement using the nltk library. It randomly replaces words in the text with their synonyms to create new variations of sentences. This method can be applied when paraphrasing or backtranslation is not feasible.

In [None]:
# Import necessary modules
import random # Random module for generating random numbers and selections
import nltk # NLTK's WordNet corpus for finding synonyms
nltk.download('wordnet')
# Define a function to find and replace a word with a synonym
def synonym_replacement(word):
# Get all synsets (sets of synonyms) for the given word from WordNet
    synonyms = wordnet.synsets(word)

# If the word has synonyms, randomly choose one synonym, otherwise return the original word
    if synonyms:
# Select a random synonym and get the first lemma (word form) of that synonym
        return random.choice(synonyms).lemmas()[0].name()

# If no synonyms are found, return the original word
    return word

# Define a function to augment text by replacing words with synonyms randomly
def augment_text(text):
# Split the input text into individual words
    words = text.split() # Split the input text into individual words

# Replace each word with a synonym with a probability of 20% (random.random() > 0.8)
    augmented_words = [
    synonym_replacement(word) if random.random() > 0.8 else word 
# If random condition met, replace
for word in words] # Iterate over each word in the original text

# Join the augmented words back into a single string and return it
    return ' '.join(augmented_words)

# Apply the text augmentation function to the 'cleaned_text' column in a DataFrame
# Create a new column 'augmented_text' containing the augmented version of 'cleaned_text'
data['augmented_text'] = data['cleaned_text'].apply(augment_text)

# step 5: structure of fine tunning  

In [13]:
import torch 
from torch.utils.data import TensorDataset, DataLoader

input_ids = token['input_ids']
attention_mask = token['attention_mask'] 

def map_sentiment(value):
    if value == "high":
        return 1
    elif value == "medium":
        return 0.5
    elif value == "low":
        return 0
    else:
        return None  # Handle unexpected values, if any

# Apply the function to each item in 'sentiment_intensity'
data['sentiment_intensity'] = data['sentiment_intensity'].apply(map_sentiment)

# Drop any rows where 'sentiment_intensity' is None
data = data.dropna(subset=['sentiment_intensity']).reset_index(drop=True)

# Convert the 'sentiment_intensity' column to a tensor
labels = torch.tensor(data['sentiment_intensity'].tolist())

# step 6: split data

In [15]:
from sklearn.model_selection import train_test_split # Import function to split dataset

# First split: 15% for test set, the rest for training/validation
train_val_inputs, test_inputs, train_val_masks, test_masks, train_val_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels, test_size=0.15, random_state=42
)

# Second split: 20% for validation set from remaining data
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_val_inputs, train_val_masks, train_val_labels, test_size=0.2, random_state=42
)

# Create TensorDataset objects for each set, including attention masks
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

# Create DataLoader objects
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

print("Training, validation, and test sets are prepared with attention masks!")

Training, validation, and test sets are prepared with attention masks!
