# Loading Libraries and Data

In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import os

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the "climatebert/climate_sentiment" dataset via datasets.load_dataset
dataset = load_dataset("climatebert/climate_sentiment")

# Materialise the train and test splits as Pandas DataFrames
train_df, test_df = (dataset[split].to_pandas() for split in ('train', 'test'))

# Pool both splits into a single frame with a fresh 0..n-1 index
dataframe = pd.concat([train_df, test_df], ignore_index=True)
# Positional rename: first column -> 'message', second column -> 'sentiment'
dataframe.columns = ['message', 'sentiment']
print(dataframe.head())
print(dataframe.keys())

                                             message  sentiment
0  − Scope 3: Optional scope that includes indire...          1
1  The Group is not aware of any noise pollution ...          0
2  Global climate change could exacerbate certain...          0
3  Setting an investment horizon is part and parc...          0
4  Climate change the physical impacts of climate...          0
Index(['message', 'sentiment'], dtype='object')


In [2]:
import re

def clean_message(text):
    """Reduce a message to ASCII letters separated by single spaces.

    Digits and every other character that is neither an ASCII letter nor
    whitespace are deleted outright (not replaced by a space), so a
    hyphenated term such as "low-carbon" becomes "lowcarbon". Whitespace
    runs are then collapsed to one space and the ends are trimmed.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned message; empty if nothing but non-letters was present.
    """
    # Applied in order: digit runs first, then anything that is not a letter/whitespace
    removal_patterns = (r'\d+', r'[^A-Za-z\s]')
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # split() with no argument drops leading/trailing whitespace and collapses runs
    return ' '.join(text.split())

# Clean every message element-wise, overwriting the "message" column
dataframe['message'] = dataframe['message'].map(clean_message)

def tokenize_message(text):
    """Tokenize a message with NLTK's ``word_tokenize`` (default arguments).

    Parameters
    ----------
    text : str
        Message text to tokenize.

    Returns
    -------
    The tokens exactly as produced by ``word_tokenize``.
    """
    tokens = word_tokenize(text)
    return tokens

# Tokenize each cleaned message into a new "tokenized_message" column
dataframe['tokenized_message'] = dataframe['message'].map(tokenize_message)

# Remove stop words
import nltk
from nltk.corpus import stopwords

# Build the stop-word set. If the NLTK "stopwords" corpus is not installed,
# NLTK raises LookupError; download it once and retry so this cell also runs
# on a fresh environment (Restart Kernel -> Run All).
try:
    stop_words = set(stopwords.words('english'))  # Change 'english' to your language if needed
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_stopwords(tokenized_text):
    """Return the tokens that are not stop words.

    Membership is tested on the lower-cased token against the module-level
    ``stop_words`` set; surviving tokens keep their original casing and order.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        The tokens with stop words filtered out.
    """
    kept_tokens = []
    for token in tokenized_text:
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Filter stop words out of every token list, overwriting the column
dataframe['tokenized_message'] = dataframe['tokenized_message'].map(remove_stopwords)

# Convert all words to lowercase
def convert_to_lowercase(tokenized_text):
    """Return a new list holding the lower-cased form of every token.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens of one message.

    Returns
    -------
    list of str
        Lower-cased tokens, in the original order.
    """
    lowered = []
    for token in tokenized_text:
        lowered.append(token.lower())
    return lowered

dataframe['tokenized_message'] = dataframe['tokenized_message'].apply(convert_to_lowercase)

# Keep only the label and the processed tokens.
# NOTE: this rebinding drops the 'message' column, so re-running the cleaning
# steps above requires re-running the data-loading cell first.
dataframe = dataframe[['sentiment', 'tokenized_message']]

X = dataframe['tokenized_message']
y = dataframe['sentiment']

# Hold out 20% of the pooled data for validation; fixed seed for reproducibility.
# The held-out features are named X_valid so they match y_valid and X_valid_tfidf.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert lists of tokens back into strings
X_train = X_train.apply(lambda x: ' '.join(x))
X_valid = X_valid.apply(lambda x: ' '.join(x))
X_test = X_valid  # alias kept for code that still uses the earlier name

# TF-IDF Vectorization: fit on the training split only, then reuse the fitted
# vectorizer to transform the validation split
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)



# Support Vector Machine (SVM) Model

In [3]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Baseline: an SVC with its default hyperparameters, fitted on the TF-IDF training features
svm = SVC()
svm.fit(X_train_tfidf, y_train)

# Predict on the split the model was fitted on and on the held-out validation split
y_train_pred_svm, y_valid_pred_svm = (
    svm.predict(features) for features in (X_train_tfidf, X_valid_tfidf)
)

# Accuracy per split
training_accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
validation_accuracy_svm = accuracy_score(y_valid, y_valid_pred_svm)

print("SVM Training Accuracy:", training_accuracy_svm)
print("SVM Validation Accuracy:", validation_accuracy_svm)

SVM Training Accuracy: 0.9962121212121212
SVM Validation Accuracy: 0.7613636363636364


In [None]:
from sklearn.model_selection import GridSearchCV

# Define a range of hyperparameters to search over
param_grid = {
    'C': [10, 20, 30],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel function
    'gamma': ['scale', 'auto', 0.1, 1],  # Kernel coefficient (only for 'rbf' and 'poly' kernels)
}

# Initialize the SVM model
svm = SVC()

# Exhaustive search with 5-fold cross-validation on all available cores.
# refit=True is GridSearchCV's default (spelled out here): once the search is
# done, the best configuration is retrained on the full training data.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, refit=True)

# Perform hyperparameter tuning on the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Reuse the estimator GridSearchCV already refitted with the best
# hyperparameters instead of building and training an identical SVC again
best_svm = grid_search.best_estimator_

# Make predictions on the training set with the best model
y_train_pred_best_svm = best_svm.predict(X_train_tfidf)

# Make predictions on the validation set with the best model
y_valid_pred_best_svm = best_svm.predict(X_valid_tfidf)

# Calculate accuracy on the training and validation sets with the best model
training_accuracy_best_svm = accuracy_score(y_train, y_train_pred_best_svm)
validation_accuracy_best_svm = accuracy_score(y_valid, y_valid_pred_best_svm)

print("Best SVM Hyperparameters:", best_params)
print("Best SVM Training Accuracy:", training_accuracy_best_svm)
print("Best SVM Validation Accuracy:", validation_accuracy_best_svm)

Best SVM Hyperparameters: {'C': 20, 'gamma': 0.1, 'kernel': 'rbf'}
Best SVM Training Accuracy: 1.0
Best SVM Validation Accuracy: 0.7916666666666666


# Limitations

SVM Model Limitations:

With the SVM model, we encountered certain limitations. Although we performed hyperparameter tuning and obtained the best hyperparameters (C=20, gamma=0.1, kernel='rbf'), we still observed a substantial performance gap between the training accuracy (100%) and the validation accuracy (79.17%). This gap of roughly 21 percentage points indicates that the model overfits the training data: despite optimization efforts, there are limits to how well the SVM generalizes to unseen data. The SVM model excels at finding complex decision boundaries, but it may not fully capture the intricacies of sentiment analysis, especially in cases where the data exhibits high nonlinearity or complex relationships. Further strategies may be necessary to mitigate this performance gap and enhance model generalization.