# Corporate Sustainability

## 0 - Imports and dependencies

### Dependencies

To run this notebook, please make sure you have the following packages installed:

- `pandas`: For handling data in dataframes.
- `datasets`: For loading datasets.
- `spacy`: For natural language processing.
- `scikit-learn`: For machine learning algorithms, vectorization, model evaluation, and pipelines.
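- `wordcloud`: For creating a visual interpretation of text analytics.
- `matplotlib`: For rendering the word cloud figures.

You can install these packages using the following command:

```bash
pip install pandas datasets spacy scikit-learn wordcloud matplotlib
```

The spaCy tokenizer also requires the small English language model, which is downloaded separately:

```bash
python -m spacy download en_core_web_sm
```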


In [None]:
# General imports
import pandas as pd
import spacy
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt

## 1 - Data loading

In [None]:
# Load the dataset through the `datasets` library, identified by its repository id
DATASET_ID = 'climatebert/environmental_claims'
dataset = load_dataset(DATASET_ID)

In [None]:
# Display the loaded dataset object to see how it is structured
dataset

In [None]:
# Materialize each split of the loaded dataset as its own pandas DataFrame
env_claim_train, env_claim_test, env_claim_val = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'test', 'validation')
)


In [None]:
# ANSI escape sequences used to colorize printed headings
class Colors:
    """Terminal color codes; use as ``<color> + text + Colors.ENDC``."""

    ENDC = '\033[0m'      # reset to the default terminal style
    OKGREEN = '\033[92m'  # bright green
    OKCYAN = '\033[96m'   # bright cyan
    OKBLUE = '\033[94m'   # bright blue

# For every split: colored heading, shape, first rows, and describe() summary.
# Each entry is (heading color, heading text, word used in the shape line, frame).
split_summaries = (
    (Colors.OKGREEN, "Training data", "training", env_claim_train),
    (Colors.OKBLUE, "\nTest data", "test", env_claim_test),
    (Colors.OKCYAN, "\nValidation data", "validation", env_claim_val),
)
for heading_color, heading, split_word, split_frame in split_summaries:
    print(heading_color + heading + Colors.ENDC)
    print(f"The shape of the {split_word} data is: {split_frame.shape}")
    display(split_frame.head())
    display(split_frame.describe())


### Declaring the train and test datasets

In [None]:
# Model inputs come from the 'text' column, targets from the 'label' column
X_train = env_claim_train['text']
y_train = env_claim_train['label']
X_test = env_claim_test['text']
y_test = env_claim_test['label']

### Text Classification using TF-IDF Vectorization and Logistic Regression

In [None]:

# spaCy pipeline used for tokenization; the parser and NER components are disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def spacy_tokenizer(text):
    """Return the lowercased, stripped lemmas of the alphabetic, non-stop-word tokens in ``text``."""
    lemmas = []
    for token in nlp(text):
        # Skip anything that is not purely alphabetic, and skip stop words
        if not token.is_alpha or token.is_stop:
            continue
        lemmas.append(token.lemma_.lower().strip())
    return lemmas

# Create a TfidfVectorizer with the spaCy tokenizer.
# token_pattern=None: the default regex is unused once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a UserWarning on fit.
vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    token_pattern=None,
    ngram_range=(1, 2),  # unigrams and bigrams
    max_df=0.85,         # ignore terms found in more than 85% of the documents
    min_df=2,            # ignore terms found in fewer than 2 documents
)

# Create a logistic regression classifier; random_state is pinned so that
# repeated runs of the notebook are reproducible
clf = LogisticRegression(solver='liblinear', random_state=42)

# Create a pipeline with the vectorizer and classifier.
# The word-cloud cell further down reads `vectorizer` and `clf` directly
# after this pipeline has been fitted.
pipe = Pipeline([('vectorizer', vectorizer), ('classifier', clf)])

# Train the model
pipe.fit(X_train, y_train)

# Test the model
y_pred = pipe.predict(X_test)

# Calculate accuracy (it is printed together with the classification report
# in the next cell)
accuracy = accuracy_score(y_test, y_pred)



In [None]:
# Report the overall accuracy, then the per-class classification report
accuracy_line = f"Accuracy: {accuracy}"
report_text = classification_report(y_test, y_pred)
print(accuracy_line)
print(report_text)

### Visualizing Word Importance with Word Clouds for Each Class

In [None]:
def plot_word_cloud(classifier, vectorizer, class_index, class_label):
    """Plot a word cloud of the features that push predictions toward one class.

    Args:
        classifier: Fitted linear model exposing ``coef_`` as a 2-D array
            (one row for a binary model, one row per class otherwise).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``;
            its features must be in the same order as the columns of ``coef_``.
        class_index: Index of the class to visualize. For a single-row
            ``coef_``, ``0`` selects the features with negative coefficients
            and any other value selects those with positive coefficients.
            For a multi-row ``coef_``, it selects the row.
        class_label: Human-readable class name used in the figure title.

    Raises:
        ValueError: If no feature has a weight pointing toward the requested
            class, since an empty word cloud cannot be drawn.
    """
    # Get the feature names (words) from the vectorizer
    words = vectorizer.get_feature_names_out()

    coefficients = classifier.coef_
    if len(coefficients) == 1:
        # Binary model: a single coefficient row. Negative weights favour the
        # first class, positive weights favour the second, so flip the sign
        # when the first class is requested.
        coef_row = coefficients[0]
        sign = -1.0 if class_index == 0 else 1.0
    else:
        # One coefficient row per class: positive weights favour that class.
        coef_row = coefficients[class_index]
        sign = 1.0

    # Keep only the words whose (sign-adjusted) weight points toward the class
    word_importance_dict = {
        word: float(sign * weight)
        for word, weight in zip(words, coef_row)
        if sign * weight > 0
    }
    if not word_importance_dict:
        raise ValueError(
            f"No features with a positive weight toward {class_label!r}; "
            "cannot draw an empty word cloud."
        )

    colormap = "Reds" if class_index == 0 else "Greens"

    # Create a WordCloud object and size each word by its importance
    wc = WordCloud(width=800, height=400, background_color="white", colormap=colormap)
    wc.generate_from_frequencies(word_importance_dict)

    # Plot the word cloud
    plt.figure(figsize=(12, 6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for {class_label}", fontsize=20)
    plt.show()

# Create the word clouds for each class.
# NOTE(review): the titles describe sentence-level labels (0 = no environmental
# claim, 1 = environmental claim) as understood from the dataset card; the model
# classifies sentences, not companies, and does not detect greenwashing.
# Confirm the label meaning against the dataset card.
plot_word_cloud(clf, vectorizer, 0, "Sentences Without an Environmental Claim")
plot_word_cloud(clf, vectorizer, 1, "Environmental Claims")


## Environment description

In [None]:
%load_ext watermark
%watermark -v -p pandas,numpy,sklearn,datasets,spacy,wordcloud