In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize

In [2]:
# Load the dataset (path is relative to the notebook's working directory)
DATASET_CSV = 'text_emotion.csv'
df = pd.read_csv(DATASET_CSV)

# Preprocessing
def preprocess_text(text):
    """Normalize raw text: lowercase it and strip URLs, mentions, punctuation and digits.

    Args:
        text: Raw text. Missing values (``None`` or a float NaN, which is how
            pandas represents empty CSV cells) are treated as empty text; any
            other non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so removed tokens
        can leave runs of spaces behind.
    """
    # A missing cell has no .lower(); map it to empty text instead of crashing
    # the whole column-wise apply. (NaN is the only float that differs from itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lowercase
    text = str(text).lower()
    # Remove URLs, mentions, hashtag markers, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Mentions are dropped whole; for hashtags only the '#' goes, the word stays
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

# Apply preprocessing to the content column
df['content'] = df['content'].apply(preprocess_text)

# Stopword removal and lemmatization
# The stopword list and the WordNet lemmatizer read NLTK corpora from disk; on a
# fresh environment they are absent and the lookups raise LookupError, so fetch
# them first (quiet=True keeps the downloader from flooding the cell output).
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# A set gives constant-time membership checks in the per-word filter
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(text):
    """Drop stopwords from ``text`` and lemmatize every remaining word.

    The text is split on whitespace; words found in the module-level
    ``stop_words`` set are discarded and the rest are passed through the
    module-level ``lemmatizer``. The surviving words are joined with single
    spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Build the stopword-free, lemmatized text from the cleaned content column
processed_series = df['content'].apply(remove_stopwords_and_lemmatize)
df['processed_content'] = processed_series

# Display the frame with the new column
df

Unnamed: 0,tweet_id,sentiment,author,content,processed_content
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,know listenin bad habit earlier started freaki...
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhhwaitin on y...,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday,funeral ceremonygloomy friday
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon,want hang friend soon
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...,want trade someone houston ticket one
...,...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,,
39996,1753919001,love,drapeaux,happy mothers day all my love,happy mother day love
39997,1753919005,love,JenniRox,happy mothers day to all the mommies out there...,happy mother day mommy woman man long youre mo...
39998,1753919043,happiness,ipdaman1,wassup beautiful follow me peep out my new h...,wassup beautiful follow peep new hit single de...


In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Load the tokenizer and reuse its EOS token as the padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained model and place it on the selected device
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Preparing the dataset for GPT-2
class EmotionDataset(Dataset):
    """Dataset of pre-tokenized texts, yielding ``(input_ids, attention_mask)`` pairs.

    Every text is wrapped as ``'<|startoftext|>' + text + '<|endoftext|>'`` and
    tokenized once in the constructor, with truncation and max-length padding
    requested from the tokenizer. The results are kept in memory as tensors.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        """Tokenize every entry of ``txt_list`` up front.

        Args:
            txt_list: Iterable of strings to encode.
            tokenizer: Callable returning a mapping with ``'input_ids'`` and
                ``'attention_mask'`` entries for a single string.
            max_length: Forwarded to the tokenizer as ``max_length``.
        """
        encoded_rows = [
            tokenizer('<|startoftext|>' + raw_text + '<|endoftext|>',
                      truncation=True,
                      max_length=max_length,
                      padding="max_length")
            for raw_text in txt_list
        ]
        self.input_ids = [torch.tensor(row['input_ids']) for row in encoded_rows]
        self.attn_masks = [torch.tensor(row['attention_mask']) for row in encoded_rows]

    def __len__(self):
        """Number of encoded texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors stored at ``idx``."""
        ids_tensor = self.input_ids[idx]
        mask_tensor = self.attn_masks[idx]
        return ids_tensor, mask_tensor

# Instantiate the dataset
dataset = EmotionDataset(df['processed_content'].tolist(), tokenizer, max_length=256)

# Train/test split: 90% of the rows for training, the remainder held out
train_size = int(0.9 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same rows land in train/test on every run; without a
# generator the partition changes each time the cell is executed.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Shuffling only matters for training; keep the held-out order stable
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Setting up the optimizer and scheduler
epochs = 4
optimizer = AdamW(model.parameters(), lr=5e-5)
# One scheduler step per optimizer step, so the schedule spans every batch of every epoch
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Fine-tuning the model
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        b_input_ids, b_attn_mask = batch
        b_input_ids = b_input_ids.to(device)
        b_attn_mask = b_attn_mask.to(device)

        # Every row is padded to max_length with the EOS token. Using the raw
        # input ids as labels would make the loss cover those padded positions
        # too, so mark them with -100, which the LM loss ignores. The genuine
        # end-of-text token keeps attention_mask == 1 and is still learned.
        b_labels = b_input_ids.clone()
        b_labels[b_attn_mask == 0] = -100

        outputs = model(b_input_ids,
                        attention_mask=b_attn_mask,
                        labels=b_labels,
                        return_dict=True)

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()
        # Clear gradients so the next batch does not accumulate onto this one
        optimizer.zero_grad()

    print(f'Epoch {epoch} finished')

model.save_pretrained('your_model_directory')



KeyboardInterrupt: 

In [None]:
# Evaluate the Model
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import numpy as np

# NOTE(review): true_labels and predictions are not defined in any cell visible
# here — confirm an earlier cell produces them before running this one.
# Convert true labels and predictions to numpy arrays
true_labels = np.array(true_labels)
predictions = np.array(predictions)

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(true_labels, predictions)

# Compute Pearson Correlation Coefficient
pearson_corr, _ = pearsonr(true_labels, predictions)

# Compute Concordance Correlation Coefficient (CCC):
#   CCC = 2 * cov(t, p) / (var(t) + var(p) + (mean(t) - mean(p))^2)
mean_true = np.mean(true_labels)
mean_pred = np.mean(predictions)
var_true = np.var(true_labels)
var_pred = np.var(predictions)
sd_true = np.std(true_labels)
sd_pred = np.std(predictions)

# np.var normalises by N, whereas np.cov normalises by N - 1 by default. Mixing
# the two inflates CCC by N / (N - 1) (identical arrays would score above 1), so
# compute the covariance with the same population normalisation as the variances.
covariance = np.mean((true_labels - mean_true) * (predictions - mean_pred))
numerator = 2 * covariance
denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
ccc = numerator / denominator

print(f"Mean Squared Error (MSE): {mse}")
print(f"Pearson Correlation Coefficient: {pearson_corr}")
print(f"Concordance Correlation Coefficient (CCC): {ccc}")

In [None]:
# Predict Emotion Intensity
def predict_emotion_intensity(model, tokenizer, text, max_length=256):
    """Run ``text`` through ``model`` and convert the output to an emotion intensity.

    Args:
        model: Model to run; its ``device`` attribute decides where the inputs go.
        tokenizer: Tokenizer used to encode the cleaned text as PyTorch tensors.
        text: Raw input text.
        max_length: Length the encoded input is truncated/padded to.

    Returns:
        Whatever ``process_model_output`` returns for the model outputs.
    """
    # Same cleaning steps as the training data: normalize, then drop stopwords and lemmatize
    cleaned = remove_stopwords_and_lemmatize(preprocess_text(text))

    # Wrap the text in the same start/end markers used when building the dataset
    prompt = '<|startoftext|>' + cleaned + '<|endoftext|>'
    encoded = tokenizer(prompt,
                        truncation=True,
                        max_length=max_length,
                        padding="max_length",
                        return_tensors='pt')

    # Inputs must live on the same device as the model
    target_device = model.device
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(target_device)

    # Forward pass only; no gradients are needed for prediction
    with torch.no_grad():
        model_outputs = model(**model_inputs)

    # NOTE(review): process_model_output is not defined in the visible cells.
    # Turning the model outputs into an intensity score might need a separate
    # regression head or some other post-processing — confirm it exists.
    return process_model_output(model_outputs)

# Load the fine-tuned weights from the directory the training cell saves to
# ('your_model_directory'); the previous path did not match that save location.
model = GPT2LMHeadModel.from_pretrained('your_model_directory').to(device)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# A freshly loaded tokenizer does not carry the pad token assigned during
# training setup; predict_emotion_intensity pads to max_length, so set it again.
tokenizer.pad_token = tokenizer.eos_token
text = "I'm feeling great today!"
emotion_intensity = predict_emotion_intensity(model, tokenizer, text)
print(f"Predicted emotion intensity: {emotion_intensity}")
