In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import os

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset
dataset = load_dataset("climatebert/climate_sentiment")

# Convert the train split to a Pandas DataFrame
train_df = dataset['train'].to_pandas()

# Convert the test split to a Pandas DataFrame
test_df = dataset['test'].to_pandas()

# Concatenate train and test dataframes
dataframe = pd.concat([train_df, test_df], ignore_index=True)
dataframe.columns = ['message', 'sentiment']
print(dataframe.head())
print(dataframe.keys())

                                             message  sentiment
0  − Scope 3: Optional scope that includes indire...          1
1  The Group is not aware of any noise pollution ...          0
2  Global climate change could exacerbate certain...          0
3  Setting an investment horizon is part and parc...          0
4  Climate change the physical impacts of climate...          0
Index(['message', 'sentiment'], dtype='object')


In [2]:
pd.set_option('display.max_colwidth', None)
dataframe

Unnamed: 0,message,sentiment
0,"− Scope 3: Optional scope that includes indirect emissions associated with the goods and services supply chain produced outside the organization. Included are emissions from the transport of products from our logistics centres to stores (downstream) performed by external logistics operators (air, land and sea transport) as well as the emissions associated with electricity consumption in franchise stores.",1
1,"The Group is not aware of any noise pollution that could negatively impact the environment, nor is it aware of any impact on biodiversity. With regards to land use, the Group is only a commercial user, and the Group is not aware of any local constraints with regards to water supply. The Group does not believe that it is at risk with regards to climate change in the near-or mid-term.",0
2,"Global climate change could exacerbate certain of the threats facing our business, including the frequency and severity of weather-related events referred to in Performance of critical infrastructure in this section 9. In addition, increases in energy prices are partly influenced by government policies to address climate change which, combined with a growing data demand that increases our energy requirements, could increase our energy costs beyond our current expectations.",0
3,"Setting an investment horizon is part and parcel of our policy of focusing on the long term and helping clients to build capital. Both financial and non-financial aspects play a role in measuring investment returns. Even if we make a successful investment in a mining company today, the same company may nonetheless cause damage to the environment tomorrow, and thus be compelled to make substantial provisions for improving its waste-processing activities and paying fines. As an asset manager that focuses on the long-term prospects, we can’t ignore the non-financial aspects.",0
4,"Climate change the physical impacts of climate change on our operations are uncertain and particular to geographic circumstances. in addition, a number of national governments have already introduced or are contemplating the introduction of regulatory responses to greenhouse gas emissions from the combustion of fossil fuels to address the impacts of climate change. these physical effects and regulatory responses may adversely impact the productivity and financial performance of our operations.",0
...,...,...
1315,"Indirect emissions result from operational activities we do not own or control. These include indirect energy emissions produced as a consequence of electricity we purchase to power our treatment plants and other indirect emissions as a consequence of our activities, e.g. from travel on company business and sludge and process waste disposal emissions.",1
1316,"All data in this TCFD report is as of, or for the year-ended December 31, 2020 unless otherwise noted. References to Daimler’s Sustainability Report 2020 will be available with its publication by March 29, 2021. References to the CDP Climate Change Questionnaire are related to the reporting year 2019.",1
1317,"Outcome: The bank explained that it would be winding down its fossil fuel-related merger and acquisition advice, investing substantially in clean tech and banking services, and that it was preparing its first TCFD report.",1
1318,"In 2020, Banco do Brasil Foundation celebrated its 35th anniversary. Along its journey, it has contributed to the societal transformation of Brazilians and the sustainable development of the country, focused on serving the society’s most vulnerable segments, from north to south, from east to west, in cities and the countryside.",2


In [3]:
import re

def clean_message(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters and punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Apply the cleaning function to the "message" column
dataframe['message'] = dataframe['message'].apply(clean_message)

In [4]:
dataframe

Unnamed: 0,message,sentiment
0,Scope Optional scope that includes indirect emissions associated with the goods and services supply chain produced outside the organization Included are emissions from the transport of products from our logistics centres to stores downstream performed by external logistics operators air land and sea transport as well as the emissions associated with electricity consumption in franchise stores,1
1,The Group is not aware of any noise pollution that could negatively impact the environment nor is it aware of any impact on biodiversity With regards to land use the Group is only a commercial user and the Group is not aware of any local constraints with regards to water supply The Group does not believe that it is at risk with regards to climate change in the nearor midterm,0
2,Global climate change could exacerbate certain of the threats facing our business including the frequency and severity of weatherrelated events referred to in Performance of critical infrastructure in this section In addition increases in energy prices are partly influenced by government policies to address climate change which combined with a growing data demand that increases our energy requirements could increase our energy costs beyond our current expectations,0
3,Setting an investment horizon is part and parcel of our policy of focusing on the long term and helping clients to build capital Both financial and nonfinancial aspects play a role in measuring investment returns Even if we make a successful investment in a mining company today the same company may nonetheless cause damage to the environment tomorrow and thus be compelled to make substantial provisions for improving its wasteprocessing activities and paying fines As an asset manager that focuses on the longterm prospects we cant ignore the nonfinancial aspects,0
4,Climate change the physical impacts of climate change on our operations are uncertain and particular to geographic circumstances in addition a number of national governments have already introduced or are contemplating the introduction of regulatory responses to greenhouse gas emissions from the combustion of fossil fuels to address the impacts of climate change these physical effects and regulatory responses may adversely impact the productivity and financial performance of our operations,0
...,...,...
1315,Indirect emissions result from operational activities we do not own or control These include indirect energy emissions produced as a consequence of electricity we purchase to power our treatment plants and other indirect emissions as a consequence of our activities eg from travel on company business and sludge and process waste disposal emissions,1
1316,All data in this TCFD report is as of or for the yearended December unless otherwise noted References to Daimlers Sustainability Report will be available with its publication by March References to the CDP Climate Change Questionnaire are related to the reporting year,1
1317,Outcome The bank explained that it would be winding down its fossil fuelrelated merger and acquisition advice investing substantially in clean tech and banking services and that it was preparing its first TCFD report,1
1318,In Banco do Brasil Foundation celebrated its th anniversary Along its journey it has contributed to the societal transformation of Brazilians and the sustainable development of the country focused on serving the societys most vulnerable segments from north to south from east to west in cities and the countryside,2


In [6]:
def tokenize_message(text):
    return word_tokenize(text)

dataframe['tokenized_message'] = dataframe['message'].apply(tokenize_message)

In [7]:
dataframe = dataframe[['sentiment', 'tokenized_message']]

In [8]:
X = dataframe['tokenized_message']
y = dataframe['sentiment']
X_train, X_test, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Convert lists of tokens back into strings
X_train = X_train.apply(lambda x: ' '.join(x))
X_test = X_test.apply(lambda x: ' '.join(x))

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_test)

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Initialize the SVM model
svm = SVC()

# Train the model on the training data
svm.fit(X_train_tfidf, y_train)

# Make predictions on the training set
y_train_pred_svm = svm.predict(X_train_tfidf)

# Make predictions on the validation set
y_valid_pred_svm = svm.predict(X_valid_tfidf)

# Calculate accuracy on the training and validation sets
training_accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
validation_accuracy_svm = accuracy_score(y_valid, y_valid_pred_svm)

print("SVM Training Accuracy:", training_accuracy_svm)
print("SVM Validation Accuracy:", validation_accuracy_svm)

SVM Training Accuracy: 0.9962121212121212
SVM Validation Accuracy: 0.7613636363636364


In [11]:
from sklearn.model_selection import GridSearchCV

# Define a range of hyperparameters to search over
param_grid = {
    'C': [10, 20 ,30],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel function
    'gamma': ['scale', 'auto', 0.1, 1],  # Kernel coefficient (only for 'rbf' and 'poly' kernels)
}

# Initialize the SVM model
svm = SVC()

# Initialize the GridSearchCV object with cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1)

# Perform hyperparameter tuning on the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Use the best hyperparameters to create a new SVM model
best_svm = SVC(**best_params)

# Train the new SVM model on the training data
best_svm.fit(X_train_tfidf, y_train)

# Make predictions on the training set with the best model
y_train_pred_best_svm = best_svm.predict(X_train_tfidf)

# Make predictions on the validation set with the best model
y_valid_pred_best_svm = best_svm.predict(X_valid_tfidf)

# Calculate accuracy on the training and validation sets with the best model
training_accuracy_best_svm = accuracy_score(y_train, y_train_pred_best_svm)
validation_accuracy_best_svm = accuracy_score(y_valid, y_valid_pred_best_svm)

print("Best SVM Hyperparameters:", best_params)
print("Best SVM Training Accuracy:", training_accuracy_best_svm)
print("Best SVM Validation Accuracy:", validation_accuracy_best_svm)

Best SVM Hyperparameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best SVM Training Accuracy: 0.9962121212121212
Best SVM Validation Accuracy: 0.803030303030303


In [12]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Naive Bayes model
naive_bayes = MultinomialNB()

# Train the model on the training data
naive_bayes.fit(X_train_tfidf, y_train)

# Make predictions on the training set
y_train_pred_nb = naive_bayes.predict(X_train_tfidf)

# Make predictions on the validation set
y_valid_pred_nb = naive_bayes.predict(X_valid_tfidf)

# Calculate accuracy on the training and validation sets
training_accuracy_nb = accuracy_score(y_train, y_train_pred_nb)
validation_accuracy_nb = accuracy_score(y_valid, y_valid_pred_nb)

print("Naive Bayes Training Accuracy:", training_accuracy_nb)
print("Naive Bayes Validation Accuracy:", validation_accuracy_nb)

Naive Bayes Training Accuracy: 0.8200757575757576
Naive Bayes Validation Accuracy: 0.6931818181818182


In [13]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
logistic_regression = LogisticRegression(max_iter=1000)

# Train the model on the training data
logistic_regression.fit(X_train_tfidf, y_train)

# Make predictions on the training set
y_train_pred_lr = logistic_regression.predict(X_train_tfidf)

# Make predictions on the validation set
y_valid_pred_lr = logistic_regression.predict(X_valid_tfidf)

# Calculate accuracy on the training and validation sets
training_accuracy_lr = accuracy_score(y_train, y_train_pred_lr)
validation_accuracy_lr = accuracy_score(y_valid, y_valid_pred_lr)

print("Logistic Regression Training Accuracy:", training_accuracy_lr)
print("Logistic Regression Validation Accuracy:", validation_accuracy_lr)

Logistic Regression Training Accuracy: 0.9526515151515151
Logistic Regression Validation Accuracy: 0.7727272727272727
