In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from scipy.stats import ttest_ind

import re
from transformers import ElectraTokenizer

%matplotlib inline

In [4]:
# Read the cleaned CSV (comma-separated) into the working DataFrame
df = pd.read_csv(
    filepath_or_buffer='../data/clean/df_0.csv',
    sep=',',
)

In [5]:
# Preprocess the dataset: remove URLs, mentions and hashtags from the 'text' column and convert it to lowercase

# Define a function to clean the text
def clean_text(text):
    """Strip URLs, @mentions and #hashtags from a text and lowercase it.

    Punctuation is not removed, and whitespace left behind by the removed
    pieces is not collapsed.

    Args:
        text: Raw text. Any non-string value (e.g. ``None`` or the float NaN
            pandas uses for a missing CSV cell) is treated as empty text.

    Returns:
        The cleaned, lowercased string; ``''`` for non-string input.
    """
    # Missing cells arrive as float NaN, which re.sub cannot process.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)  # URLs: 'http' up to the next whitespace
    text = re.sub(r'@\w+', '', text)     # mentions: '@' plus following word characters
    text = re.sub(r'#\w+', '', text)     # hashtags: '#' plus following word characters
    return text.lower()

# Run clean_text over every entry of 'text' and store the result in a new column
df['cleaned_text'] = df['text'].map(clean_text)

# Show the raw and cleaned text side by side for the first rows
df[['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"Cooking microwave pizzas, yummy","cooking microwave pizzas, yummy"
1,Any plans of allowing sub tasks to show up in ...,any plans of allowing sub tasks to show up in ...
2,"I love the humor, I just reworded it. Like sa...","i love the humor, i just reworded it. like sa..."
3,naw idk what ur talkin about,naw idk what ur talkin about
4,That sucks to hear. I hate days like that,that sucks to hear. i hate days like that


<br>
<br>

# Tokenize the data using ELECTRA's Tokenizer
Convert the cleaned text into subword tokens and map each token to its ID in ELECTRA’s vocabulary.

In [7]:
# Instantiate the pre-trained ELECTRA tokenizer (small discriminator checkpoint)
tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-discriminator',
    clean_up_tokenization_spaces=True,
)

In [13]:
# Tokenize the dataset
# The tokenizer converts the cleaned text into input IDs and attention masks.

def tokenize_function(examples, max_length=128):
    """Tokenize text with the notebook-level ``tokenizer``.

    Args:
        examples: Text handed straight to ``tokenizer``. In this notebook it
            is called once per row with a single cleaned string.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever ``tokenizer`` returns for the input; in this notebook an
        encoding exposing ``input_ids``, ``token_type_ids`` and
        ``attention_mask``.
    """
    return tokenizer(
        examples,
        padding='max_length',   # pad every sequence up to max_length
        truncation=True,        # cut sequences longer than max_length
        max_length=max_length,
    )

# Tokenize each cleaned text individually; the result is a Series of encodings
tokenized_data = df['cleaned_text'].map(tokenize_function)

# Preview the first few tokenized entries
tokenized_data.head()

0    [input_ids, token_type_ids, attention_mask]
1    [input_ids, token_type_ids, attention_mask]
2    [input_ids, token_type_ids, attention_mask]
3    [input_ids, token_type_ids, attention_mask]
4    [input_ids, token_type_ids, attention_mask]
Name: cleaned_text, dtype: object

In [15]:
# Pull out the first tokenized row and print two of its components, one per line:
#   input_ids      - the token IDs; each number is one token (word or subword)
#                    in the model's vocabulary
#   attention_mask - marks which positions are real tokens and which are padding
first_tokenized_example = tokenized_data.iloc[0]
print(
    first_tokenized_example['input_ids'],
    first_tokenized_example['attention_mask'],
    sep='\n',
)

[101, 8434, 18302, 10733, 2015, 1010, 9805, 18879, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


<br>
<br>
# Fine-tune ELECTRA for Sentiment Analysis