In [3]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [10]:
import nltk
from nltk.tokenize import word_tokenize

In [12]:
# Load the dataset
df = pd.read_csv('text_emotion.csv')

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs, mentions, hashtags, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

# Apply preprocessing to the content column
df['content'] = df['content'].apply(preprocess_text)

# Stopword removal and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply stopword removal and lemmatization to the content column
df['processed_content'] = df['content'].apply(remove_stopwords_and_lemmatize)

df

Unnamed: 0,tweet_id,sentiment,author,content,processed_content
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,know listenin bad habit earlier started freaki...
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhhwaitin on y...,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday,funeral ceremonygloomy friday
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon,want hang friend soon
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...,want trade someone houston ticket one
...,...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,,
39996,1753919001,love,drapeaux,happy mothers day all my love,happy mother day love
39997,1753919005,love,JenniRox,happy mothers day to all the mommies out there...,happy mother day mommy woman man long youre mo...
39998,1753919043,happiness,ipdaman1,wassup beautiful follow me peep out my new h...,wassup beautiful follow peep new hit single de...


In [13]:
# Feature Engineering
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features
tfidf_features = tfidf_vectorizer.fit_transform(df['content'])

# Sentiment Intensity Scores using VADER
analyzer = SentimentIntensityAnalyzer()
df['vader_score'] = df['content'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Combine TF-IDF features with VADER scores
features = np.hstack((tfidf_features.toarray(), df['vader_score'].values.reshape(-1, 1)))

# Now 'features' contains the TF-IDF vectors with VADER sentiment scores appended to each vector
features

In [21]:
df

Unnamed: 0,tweet_id,sentiment,author,content,processed_content,vader_score,sentiment_intensity
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,know listenin bad habit earlier started freaki...,-0.5423,0
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhhwaitin on y...,layin n bed headache ughhhhwaitin call,0.0000,7
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday,funeral ceremonygloomy friday,-0.3612,7
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon,want hang friend soon,0.4767,10
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...,want trade someone houston ticket one,-0.3919,2
...,...,...,...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,,,0.0000,2
39996,1753919001,love,drapeaux,happy mothers day all my love,happy mother day love,0.8360,9
39997,1753919005,love,JenniRox,happy mothers day to all the mommies out there...,happy mother day mommy woman man long youre mo...,0.5719,9
39998,1753919043,happiness,ipdaman1,wassup beautiful follow me peep out my new h...,wassup beautiful follow peep new hit single de...,0.5994,8


In [22]:
unique_sentiments = df['sentiment'].unique()
print(unique_sentiments)

['empty' 'sadness' 'enthusiasm' 'neutral' 'worry' 'surprise' 'love' 'fun'
 'hate' 'happiness' 'boredom' 'relief' 'anger']


In [17]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to convert categorical to numerical for regression
emotion_intensity_map = {
    'empty': 0,
    'boredom': 1,
    'neutral': 2,
    'relief': 3,
    'surprise': 4,
    'fun': 5,
    'worry': 6,
    'sadness': 7,
    'happiness': 8,
    'love': 9,
    'enthusiasm': 10,
    'anger': 11,
    'hate': 12
}

def convert_categorical_to_numerical(sentiment):
    return emotion_intensity_map.get(sentiment, -1)

df['sentiment_intensity'] = df['sentiment'].apply(convert_categorical_to_numerical)

# Split the data
X_train, X_val, y_train, y_val = train_test_split(df['processed_content'], df['sentiment_intensity'], test_size=0.2, random_state=42)

# Define dataset parameters
MAX_LEN = 128  
BATCH_SIZE = 16  

# Create datasets
train_dataset = EmotionDataset(X_train.to_numpy(), y_train.to_numpy(), tokenizer, MAX_LEN)
val_dataset = EmotionDataset(X_val.to_numpy(), y_val.to_numpy(), tokenizer, MAX_LEN)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # num_labels=1 for regression

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Training loop
EPOCHS = 3  # Adjust as needed
for epoch in range(EPOCHS):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = torch.nn.functional.mse_loss(outputs.logits.squeeze(-1), labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Validation loop
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = torch.nn.functional.mse_loss(outputs.logits.squeeze(-1), labels)
            val_losses.append(loss.item())
    
    # Calculate validation loss
    val_loss = np.mean(val_losses)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Validation Loss: {val_loss:.4f}')

model.save_pretrained('./emotion_intensity_model')

Epoch 1/3, Validation Loss: 7.1033
Epoch 2/3, Validation Loss: 7.1712
Epoch 3/3, Validation Loss: 7.4551


In [20]:
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

# Function to calculate Concordance Correlation Coefficient
def concordance_correlation_coefficient(y_true, y_pred):
    pearson_corr = pearsonr(y_true, y_pred)[0]
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)
    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    sd_true = np.sqrt(var_true)
    sd_pred = np.sqrt(var_pred)
    numerator = 2 * pearson_corr * sd_true * sd_pred
    denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
    return numerator / denominator

# Evaluation loop
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device).cpu().numpy()
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits.squeeze(-1).cpu().numpy()
        
        predictions.extend(logits)
        true_labels.extend(labels)

# Convert to numpy arrays
predictions = np.array(predictions)
true_labels = np.array(true_labels)

# Calculate evaluation metrics
mse = mean_squared_error(true_labels, predictions)
pearson_corr, _ = pearsonr(true_labels, predictions)
ccc = concordance_correlation_coefficient(true_labels, predictions)

print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Pearson Correlation Coefficient: {pearson_corr:.4f}')
print(f'Concordance Correlation Coefficient (CCC): {ccc:.4f}')


Mean Squared Error (MSE): 7.4551
Pearson Correlation Coefficient: 0.3318
Concordance Correlation Coefficient (CCC): 0.2789
