In [3]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize

In [4]:
# Load the dataset
df = pd.read_csv('text_emotion.csv')

# Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs, mentions, hashtags, and special characters
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

# Apply preprocessing to the content column
df['content'] = df['content'].apply(preprocess_text)

# Stopword removal and lemmatization
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply stopword removal and lemmatization to the content column
df['processed_content'] = df['content'].apply(remove_stopwords_and_lemmatize)

df

Unnamed: 0,tweet_id,sentiment,author,content,processed_content
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,know listenin bad habit earlier started freaki...
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhhwaitin on y...,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday,funeral ceremonygloomy friday
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon,want hang friend soon
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...,want trade someone houston ticket one
...,...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,,
39996,1753919001,love,drapeaux,happy mothers day all my love,happy mother day love
39997,1753919005,love,JenniRox,happy mothers day to all the mommies out there...,happy mother day mommy woman man long youre mo...
39998,1753919043,happiness,ipdaman1,wassup beautiful follow me peep out my new h...,wassup beautiful follow peep new hit single de...


# VADER with Linear Regression


The VADER (Valence Aware Dictionary and sEntiment Reasoner) score is a sentiment analysis tool that is specifically attuned to sentiments expressed in social media. It provides a compound score that aggregates the cumulative sentiment of a text, ranging from -1 (most extreme negative) to +1 (most extreme positive). 

In [5]:
# Feature Engineering
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features
tfidf_features = tfidf_vectorizer.fit_transform(df['content'])

# Sentiment Intensity Scores using VADER
analyzer = SentimentIntensityAnalyzer()
df['vader_score'] = df['content'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Combine TF-IDF features with VADER scores
features = np.hstack((tfidf_features.toarray(), df['vader_score'].values.reshape(-1, 1)))

# Now 'features' contains the TF-IDF vectors with VADER sentiment scores appended to each vector
df

Unnamed: 0,tweet_id,sentiment,author,content,processed_content,vader_score
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,know listenin bad habit earlier started freaki...,-0.5423
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhhwaitin on y...,layin n bed headache ughhhhwaitin call,0.0000
2,1956967696,sadness,coolfunky,funeral ceremonygloomy friday,funeral ceremonygloomy friday,-0.3612
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon,want hang friend soon,0.4767
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...,want trade someone houston ticket one,-0.3919
...,...,...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,,,0.0000
39996,1753919001,love,drapeaux,happy mothers day all my love,happy mother day love,0.8360
39997,1753919005,love,JenniRox,happy mothers day to all the mommies out there...,happy mother day mommy woman man long youre mo...,0.5719
39998,1753919043,happiness,ipdaman1,wassup beautiful follow me peep out my new h...,wassup beautiful follow peep new hit single de...,0.5994


In [10]:
# Function to convert categorical to numerical for regression
emotion_intensity_map = {
    'empty': 0,
    'boredom': 1,
    'neutral': 2,
    'relief': 3,
    'surprise': 4,
    'fun': 5,
    'worry': 6,
    'sadness': 7,
    'happiness': 8,
    'love': 9,
    'enthusiasm': 10,
    'anger': 11,
    'hate': 12
}

def convert_categorical_to_numerical(sentiment):
    return emotion_intensity_map.get(sentiment, -1)

df['sentiment_intensity'] = df['sentiment'].apply(convert_categorical_to_numerical)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(features, df['sentiment_intensity'], test_size=0.2, random_state=42)
model = LinearRegression()


In [11]:
model.fit(X_train, y_train)


LinearRegression()

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 7.283080635248068
R^2 Score: 0.08335281671616335


In [15]:
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

# Function to calculate Concordance Correlation Coefficient
def concordance_correlation_coefficient(y_true, y_pred):
    pearson_corr = pearsonr(y_true, y_pred)[0]
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)
    var_true = np.var(y_true)
    var_pred = np.var(y_pred)
    sd_true = np.sqrt(var_true)
    sd_pred = np.sqrt(var_pred)
    numerator = 2 * pearson_corr * sd_true * sd_pred
    denominator = var_true + var_pred + (mean_true - mean_pred) ** 2
    return numerator / denominator

# Make predictions
predictions = model.predict(X_test)
true_labels = y_test

# Calculate evaluation metrics
mse = mean_squared_error(true_labels, predictions)
pearson_corr, _ = pearsonr(true_labels, predictions)
ccc = concordance_correlation_coefficient(true_labels, predictions)

print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Pearson Correlation Coefficient: {pearson_corr:.4f}')
print(f'Concordance Correlation Coefficient (CCC): {ccc:.4f}')

Mean Squared Error (MSE): 7.2831
Pearson Correlation Coefficient: 0.3030
Concordance Correlation Coefficient (CCC): 0.2068


In [13]:
# Prediction
new_data = ["I'm so happy to see you!"]
new_data_processed = tfidf_vectorizer.transform(new_data)
new_data_vader_score = analyzer.polarity_scores(new_data[0])['compound']
new_data_features = np.hstack((new_data_processed.toarray(), [[new_data_vader_score]]))

predicted_intensity = model.predict(new_data_features)
print(f'Predicted Emotion Intensity: {predicted_intensity[0]}')


Predicted Emotion Intensity: 6.736018505782546
