# Loading Libraries and Data

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import os

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the climate-sentiment corpus through `datasets.load_dataset`
dataset = load_dataset("climatebert/climate_sentiment")

# Turn the 'train' and 'test' splits into pandas DataFrames
train_df, test_df = (
    dataset[split_name].to_pandas() for split_name in ('train', 'test')
)

# Pool both splits into a single frame with a fresh 0..n-1 index; the
# notebook draws its own train/validation split from this pooled frame later.
dataframe = pd.concat((train_df, test_df), ignore_index=True)

# Rename the two columns positionally to the names used in the rest of the notebook
dataframe.columns = ['message', 'sentiment']

print(dataframe.head())
print(dataframe.keys())

                                             message  sentiment
0  − Scope 3: Optional scope that includes indire...          1
1  The Group is not aware of any noise pollution ...          0
2  Global climate change could exacerbate certain...          0
3  Setting an investment horizon is part and parc...          0
4  Climate change the physical impacts of climate...          0
Index(['message', 'sentiment'], dtype='object')


# Data Preprocessing

In [2]:
import re

def clean_message(text):
    """Reduce a message to ASCII letters separated by single spaces.

    Digits, punctuation and every other character that is neither an ASCII
    letter nor whitespace are deleted outright (they are not replaced by a
    space), so "low-carbon" becomes "lowcarbon". Runs of whitespace are then
    collapsed to one space and leading/trailing whitespace is dropped.

    Parameters
    ----------
    text : str
        Raw message.

    Returns
    -------
    str
        The cleaned message; may be empty.
    """
    # Digits first, then anything that is not an ASCII letter or whitespace
    for pattern in (r'\d+', r'[^A-Za-z\s]'):
        text = re.sub(pattern, '', text)

    # split() without arguments discards leading, trailing and repeated whitespace
    words = text.split()
    return ' '.join(words)

# Apply the cleaning function to the "message" column.
# The column is overwritten, so the raw text is no longer available in
# `dataframe` after this line.
dataframe['message'] = dataframe['message'].apply(clean_message)

def tokenize_message(text):
    """Split a message into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every cleaned message and store the result in a new column
dataframe['tokenized_message'] = dataframe['message'].apply(tokenize_message)

# Remove stop words
import nltk
from nltk.corpus import stopwords

# The stop-word corpus has to be downloaded separately from the nltk package.
# Try to use the local copy first and download it only when it is missing, so
# the notebook also runs on a fresh environment instead of failing with a
# LookupError.
try:
    stop_words = set(stopwords.words('english'))  # Change 'english' to your language if needed
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))  # Change 'english' to your language if needed

def remove_stopwords(tokenized_text, stopword_set=None):
    """Drop stop words from a sequence of tokens.

    The comparison is case-insensitive (each token is lower-cased before the
    lookup), but the surviving tokens keep their original casing and order.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of one message.
    stopword_set : collection of str, optional
        Lower-case words to remove. When omitted, the notebook-level
        ``stop_words`` set is used, which keeps existing one-argument calls
        such as ``Series.apply(remove_stopwords)`` working unchanged.

    Returns
    -------
    list of str
        The tokens that are not stop words.
    """
    if stopword_set is None:
        # Resolved at call time so the notebook-level set is only required
        # when the caller does not supply their own.
        stopword_set = stop_words
    return [word for word in tokenized_text if word.lower() not in stopword_set]

# Filter the stop words out of every row's tokens (overwrites the column)
dataframe['tokenized_message'] = dataframe['tokenized_message'].apply(remove_stopwords)

# Normalise the case of every token
def convert_to_lowercase(tokenized_text):
    """Return a new list holding the lower-cased form of each token, in order.

    The input sequence itself is left unmodified.
    """
    lowered = []
    for token in tokenized_text:
        lowered.append(token.lower())
    return lowered

dataframe['tokenized_message'] = dataframe['tokenized_message'].apply(convert_to_lowercase)

# Keep only the label and the cleaned tokens. The 'message' column is dropped
# here, so re-running this cell from the top requires re-running the loading
# cell first (the cleaning step reads dataframe['message']).
dataframe = dataframe[['sentiment', 'tokenized_message']]

# The train/validation split and the TF-IDF vectorization are done once, in
# the "Data Splitting" cells below. They are intentionally not repeated here:
# doing the same work in this cell would only produce results that those
# cells immediately recompute and overwrite.

# Data Splitting

In [3]:
# Features are the per-message tokens, targets are the sentiment labels
X, y = dataframe['tokenized_message'], dataframe['sentiment']

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
# Naming note: the held-out features are called X_test while the matching
# labels are called y_valid -- both refer to the same held-out rows.
X_train, X_test, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Convert lists of tokens back into strings
def _join_tokens(tokens):
    """Return `tokens` as one space-separated string.

    A value that is already a string is returned unchanged. Without this
    guard, running the cell a second time would call ' '.join on a string and
    insert a space between every character.
    """
    return tokens if isinstance(tokens, str) else ' '.join(tokens)

X_train = X_train.apply(_join_tokens)
X_test = X_test.apply(_join_tokens)

# TF-IDF Vectorization
# The vectorizer is fitted on the training texts only and then reused, without
# refitting, to transform the held-out texts.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_test)

# Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression with scikit-learn's default hyper-parameters,
# fitted on the TF-IDF features of the training rows
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train_tfidf, y_train)

# Predict labels for the rows the model was fitted on and for the held-out rows
y_train_pred_lr, y_valid_pred_lr = (
    logistic_regression.predict(features)
    for features in (X_train_tfidf, X_valid_tfidf)
)

# Fraction of correctly predicted labels on each set; a large difference
# between the two numbers points to overfitting
training_accuracy_lr = accuracy_score(y_train, y_train_pred_lr)
validation_accuracy_lr = accuracy_score(y_valid, y_valid_pred_lr)

print("Logistic Regression Training Accuracy:", training_accuracy_lr)
print("Logistic Regression Validation Accuracy:", validation_accuracy_lr)

Logistic Regression Training Accuracy: 0.9545454545454546
Logistic Regression Validation Accuracy: 0.7727272727272727


In [7]:
# Regularization background:
# - An l1 penalty favours sparse models: it drives many coefficients to
#   exactly zero, which helps when many features are irrelevant or redundant.
# - An l2 penalty (used here) is based on the squared magnitude of the
#   coefficients; it keeps them from growing too large, which helps prevent
#   overfitting.
# - A smaller C, such as 0.1, increases the strength of the regularization,
#   i.e. large coefficient values are penalized more heavily.
logistic_regression = LogisticRegression(C=0.1, penalty='l2', max_iter=1000)
logistic_regression.fit(X_train_tfidf, y_train)

# Predict labels for the rows the model was fitted on and for the held-out rows
y_train_pred_lr, y_valid_pred_lr = (
    logistic_regression.predict(features)
    for features in (X_train_tfidf, X_valid_tfidf)
)

# Fraction of correctly predicted labels on each set
training_accuracy_lr = accuracy_score(y_train, y_train_pred_lr)
validation_accuracy_lr = accuracy_score(y_valid, y_valid_pred_lr)

print("Logistic Regression Training Accuracy:", training_accuracy_lr)
print("Logistic Regression Validation Accuracy:", validation_accuracy_lr)

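# The gap between training and validation accuracy narrowed (from about 18 to about 6 percentage points),
# but both accuracies dropped sharply, so C=0.1 most likely over-regularizes and the model now underfits.
# The three sentiment levels (negative, neutral, positive) are not the cause: scikit-learn's
# LogisticRegression supports multi-class targets directly.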

Logistic Regression Training Accuracy: 0.6732954545454546
Logistic Regression Validation Accuracy: 0.6136363636363636


# Limitations

Initially, the Logistic Regression model with default settings reached a training accuracy of 95.45% and a validation accuracy of 77.27%; the gap of roughly 18 percentage points indicates overfitting. After the regularization strength was increased (C=0.1), training accuracy fell to 67.33% and validation accuracy to 61.36%. The gap narrowed to about 6 points, but validation accuracy dropped by about 16 points, which suggests that the tuned model underfits rather than generalizes better. This drop should not be attributed to the task having three sentiment levels (negative, neutral, positive): although logistic regression is often introduced as a binary classifier, scikit-learn's `LogisticRegression` handles multi-class targets directly. The more likely limitations are the regularization strength, which was set by hand rather than selected by cross-validation, and the bag-of-words TF-IDF representation, which ignores word order and context and may therefore miss some of the nuances of multi-class sentiment analysis.