# Reading Library

In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import os

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
dataset = load_dataset("climatebert/climate_sentiment")

# Convert the train split to a Pandas DataFrame
train_df = dataset['train'].to_pandas()

# Convert the test split to a Pandas DataFrame
test_df = dataset['test'].to_pandas()

# Concatenate train and test dataframes
dataframe = pd.concat([train_df, test_df], ignore_index=True)
dataframe.columns = ['message', 'sentiment']
print(dataframe.head())
print(dataframe.keys())

                                             message  sentiment
0  − Scope 3: Optional scope that includes indire...          1
1  The Group is not aware of any noise pollution ...          0
2  Global climate change could exacerbate certain...          0
3  Setting an investment horizon is part and parc...          0
4  Climate change the physical impacts of climate...          0
Index(['message', 'sentiment'], dtype='object')


# Data Preprocessing

In [4]:
import re

def clean_message(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters and punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Apply the cleaning function to the "message" column
dataframe['message'] = dataframe['message'].apply(clean_message)

def tokenize_message(text):
    return word_tokenize(text)

dataframe['tokenized_message'] = dataframe['message'].apply(tokenize_message)

# Remove stop words
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # Change 'english' to your language if needed

def remove_stopwords(tokenized_text):
    return [word for word in tokenized_text if word.lower() not in stop_words]

dataframe['tokenized_message'] = dataframe['tokenized_message'].apply(remove_stopwords)

# Convert all words to lowercase
def convert_to_lowercase(tokenized_text):
    return [word.lower() for word in tokenized_text]

dataframe['tokenized_message'] = dataframe['tokenized_message'].apply(convert_to_lowercase)

dataframe = dataframe[['sentiment', 'tokenized_message']]

X = dataframe['tokenized_message']
y = dataframe['sentiment']
X_train, X_test, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert lists of tokens back into strings
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_test)

# Data Splitting

In [5]:
X = dataframe['tokenized_message']
y = dataframe['sentiment']
X_train, X_test, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Convert lists of tokens back into strings
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_test)

# Naïve Bayes

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Initialize the Naive Bayes model
naive_bayes = MultinomialNB()

# Train the model on the training data
naive_bayes.fit(X_train_tfidf, y_train)

# Make predictions on the training set
y_train_pred_nb = naive_bayes.predict(X_train_tfidf)

# Make predictions on the validation set
y_valid_pred_nb = naive_bayes.predict(X_valid_tfidf)

# Calculate accuracy on the training and validation sets
training_accuracy_nb = accuracy_score(y_train, y_train_pred_nb)
validation_accuracy_nb = accuracy_score(y_valid, y_valid_pred_nb)

print("Naive Bayes Training Accuracy:", training_accuracy_nb)
print("Naive Bayes Validation Accuracy:", validation_accuracy_nb)

Naive Bayes Training Accuracy: 0.8787878787878788
Naive Bayes Validation Accuracy: 0.7083333333333334


In [8]:
from sklearn.model_selection import GridSearchCV

# Initialize the Naive Bayes model
naive_bayes = MultinomialNB()

# Define a range of alpha values to try
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    'fit_prior': [True, False],
    'class_prior': [None, [0.3, 0.4, 0.3]]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(naive_bayes, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters from grid search
best_alpha = grid_search.best_params_['alpha']
best_fit_prior = grid_search.best_params_['fit_prior']
best_class_prior = grid_search.best_params_['class_prior']

# Initialize a new Naive Bayes model with the best hyperparameters
best_naive_bayes = MultinomialNB(
    alpha=best_alpha,
    fit_prior=best_fit_prior,
    class_prior=best_class_prior
)

# Train the final model on the entire training dataset
best_naive_bayes.fit(X_train_tfidf, y_train)

# Train the model on the training data
naive_bayes.fit(X_train_tfidf, y_train)

# Make predictions on the training set
y_train_pred_nb = naive_bayes.predict(X_train_tfidf)

# Make predictions on the validation set
y_valid_pred_nb = naive_bayes.predict(X_valid_tfidf)

# Calculate accuracy on the training and validation sets
training_accuracy_nb = accuracy_score(y_train, y_train_pred_nb)
validation_accuracy_nb = accuracy_score(y_valid, y_valid_pred_nb)

print("Naive Bayes Training Accuracy:", training_accuracy_nb)
print("Naive Bayes Validation Accuracy:", validation_accuracy_nb)
print("Best Alpha:", best_alpha)


Naive Bayes Training Accuracy: 0.8787878787878788
Naive Bayes Validation Accuracy: 0.7083333333333334
Best Alpha: 0.5


# Limitation

Despite our best efforts, we encountered challenges with the Naive Bayes model. Even after extensive hyperparameter tuning, we faced overfitting issues, where the model performed exceptionally well on the training data but struggled to generalize to unseen validation data. Specifically, the Naive Bayes model achieved a high training accuracy of 87.88%, but its validation accuracy remained at a comparatively lower 70.83%. This suggests that the model may have become overly complex or that we need to explore further techniques to control overfitting.