In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from scipy.stats import ttest_ind

import re
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast
import torch


%matplotlib inline


In [26]:
# Read the pre-cleaned dataset from disk into a DataFrame
df = pd.read_csv(
    '../data/clean/df_0.csv',
    sep=',',
)

In [27]:
# Preprocess the dataset: remove URLs, mentions, and hashtags from the 'text' column and convert it to lowercase

# Define a function to clean the text
def clean_text(text):
    """Strip URLs, @mentions and #hashtags from ``text`` and lowercase it.

    Punctuation and whitespace are left untouched, so a removed token can
    leave consecutive spaces behind.

    Parameters
    ----------
    text : object
        Raw text. Missing values (``None``, ``NaN``, ``pd.NA``) produce an
        empty string; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    if not isinstance(text, str):
        # re.sub() raises TypeError on non-strings, e.g. the float NaN that
        # pandas uses for a missing cell in an object column.
        if pd.isna(text):
            return ''
        text = str(text)

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    return text.lower()  # Convert to lowercase

# Run the cleaner over every row and store the result in a new column
df['cleaned_text'] = df['text'].map(clean_text)

# Preview the raw text next to its cleaned version
df.loc[:, ['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"Cooking microwave pizzas, yummy","cooking microwave pizzas, yummy"
1,Any plans of allowing sub tasks to show up in ...,any plans of allowing sub tasks to show up in ...
2,"I love the humor, I just reworded it. Like sa...","i love the humor, i just reworded it. like sa..."
3,naw idk what ur talkin about,naw idk what ur talkin about
4,That sucks to hear. I hate days like that,that sucks to hear. i hate days like that


<br>
<br>

# Tokenize the data using ELECTRA's Tokenizer
Convert each cleaned text into subword tokens and map them to their IDs in ELECTRA’s vocabulary.

In [29]:
# Instantiate the fast tokenizer that ships with the ELECTRA-small discriminator checkpoint
tokenizer = ElectraTokenizerFast.from_pretrained(
    'google/electra-small-discriminator',
    clean_up_tokenization_spaces=True,
)


In [31]:
# Encode every cleaned sentence in a single batch
tokenized_data = tokenizer(
    df['cleaned_text'].tolist(),  # Series -> Python list of sentences
    return_tensors='pt',          # Hand back PyTorch tensors
    max_length=128,               # Upper bound on tokens per sequence
    truncation=True,              # Cut sequences that exceed max_length
    padding=True,                 # Pad up to the longest sequence in the batch
)

In [34]:
# Inspect the tokenized data

# Pull out the first encoded example
first_input_ids = tokenized_data['input_ids'][0]
first_attention_mask = tokenized_data['attention_mask'][0]

print("Input IDs (first example):", first_input_ids)
print("Attention Mask (first example):", first_attention_mask)

# Both lengths equal the padded width of the batch. The tokenizer was called with
# padding=True, which pads to the longest sequence in the batch, so the width is
# at most 128 (the truncation limit) and is not guaranteed to be exactly 128.
print("Length of Input IDs:", len(first_input_ids))
print("Length of Attention Mask:", len(first_attention_mask))


Input IDs (first example): tensor([  101,  8434, 18302, 10733,  2015,  1010,  9805, 18879,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,  

<br>
<br>

# Fine-tune ELECTRA for Sentiment Analysis
ELECTRA is pre-trained on general text, so it must be fine-tuned on this dataset before it can classify sentiment.