# Corporate Sustainability

## 0 - Imports and Dependencies

### Dependencies

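To run this notebook, please make sure you have the following packages installed:

- `pandas`: For handling data in dataframes.
- `numpy`: For numerical operations.
- `datasets`: For loading datasets.
- `spacy`: For natural language processing.
- `scikit-learn`: For machine learning algorithms, vectorization, model evaluation, and pipelines.
- `wordcloud`: For creating a visual interpretation of text analytics.
- `matplotlib`, `seaborn`, `plotly`: For plotting.
- `textblob`: For text processing utilities.
- `watermark`: For the environment description at the end of the notebook.

You can install these packages, and download the spaCy English model, using the following commands:

```bash
pip install pandas numpy datasets spacy scikit-learn wordcloud matplotlib seaborn plotly textblob watermark
python -m spacy download en_core_web_sm
```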

In [None]:
# General imports
import pandas as pd
import numpy as np
import spacy
from textblob import TextBlob
import plotly.express as px
import seaborn as sns
from collections import Counter
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt

## 1 - Data Loading

In [None]:
# Load the 'climatebert/environmental_claims' dataset via `datasets.load_dataset`
dataset = load_dataset('climatebert/environmental_claims')

In [None]:
# Display the loaded dataset object to inspect its structure
dataset

In [None]:
# Build one pandas DataFrame per dataset split (train / test / validation)
env_claim_train, env_claim_test, env_claim_val = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'test', 'validation')
)


In [None]:
# ANSI escape sequences used to colour the section banners
class Colors:
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    ENDC = '\033[0m'


def _describe_split(banner, shape_line, frame):
    """Print a banner and a shape line, then display the head and summary statistics of `frame`."""
    print(banner)
    print(shape_line)
    display(frame.head())
    display(frame.describe())


# Displaying the characteristics of each split
_describe_split(
    Colors.OKGREEN + "Training data" + Colors.ENDC,
    f"The shape of the training data is: {env_claim_train.shape}",
    env_claim_train,
)
_describe_split(
    Colors.OKBLUE + "\nTest data" + Colors.ENDC,
    f"The shape of the test data is: {env_claim_test.shape}",
    env_claim_test,
)
_describe_split(
    Colors.OKCYAN + "\nValidation data" + Colors.ENDC,
    f"The shape of the validation data is: {env_claim_val.shape}",
    env_claim_val,
)

## 2 - Exploratory Data Analysis (EDA)

In [None]:
# Stack the three splits into a single frame with a fresh, continuous index
all_splits = [env_claim_train, env_claim_test, env_claim_val]
claim_dataset = pd.concat(all_splits, ignore_index=True)

n_claims, n_variables = claim_dataset.shape
print("Number of claims in the dataset:", n_claims)        # observations
print("Number of variables in the dataset:", n_variables)  # variables

In [None]:
# Missing values per column
missing_per_column = claim_dataset.isna().sum()
print("Number of NaNs:")
display(missing_per_column)

# Fully duplicated rows
duplicate_rows = claim_dataset.duplicated().sum()
print("Number of duplicates:")
display(duplicate_rows)

# Column dtypes (kept as the last expression so the cell renders them)
print("Variable types:")
claim_dataset.dtypes

### Word Count by Claim

In [None]:
# Number of whitespace-separated words in every claim
claim_dataset["word count"] = claim_dataset["text"].map(lambda claim: len(claim.split()))
mean_words = round(claim_dataset["word count"].mean(), 0)
print("The average number of words in each claim is equal to:", mean_words, "words.")

# Overlaid histograms, one per class.
# The bin edges run from 11 to 38 words; counts outside that range are not drawn.
word_bins = range(11, 39)
histogram_specs = [
    (1, "Environmental Claim", "#4958B5"),
    (0, "Non-environmental Claim", "#8DB8B7"),
]
for label_value, legend_label, colour in histogram_specs:
    counts_for_class = claim_dataset.loc[claim_dataset["label"] == label_value, "word count"]
    plt.hist(counts_for_class, bins=word_bins, alpha=0.5, label=legend_label, color=colour)

plt.title("Number of Words in Each Claim by Class")
plt.xlabel("Word count")
plt.ylabel("Frequency")
plt.legend(loc="upper right")
plt.show()

### Claim Cleaning

In [None]:
# Load English language model
sp = spacy.load('en_core_web_sm')

# Parse every claim with spaCy. `pipe` processes the texts as a stream instead of
# issuing one pipeline call per row, and yields one Doc per text in order.
claim_dataset["spacy object"] = list(sp.pipe(claim_dataset["text"]))

# Filter stopwords, punctuation and spaces
def filter_tokens(token):
    """Return True when `token` is not a stop word, punctuation or whitespace."""
    return not token.is_stop and not token.is_punct and not token.is_space

# Keep only the text of the tokens that pass the filter
claim_dataset["filtered tokens"] = claim_dataset["spacy object"].apply(
    lambda doc: [token.text for token in doc if filter_tokens(token)]
)

# Look the columns up by name rather than by position, so the printout does not
# depend on the column order or on which other cells have already added columns.
print("This is the first sentence before filtering:", claim_dataset["text"].iloc[0])
print("\nThis is the first sentence after filtering:", claim_dataset["filtered tokens"].iloc[0])

# Calculating new average value of words per claim
number_words = [len(x) for x in claim_dataset["filtered tokens"]]
print("\nThe average number of words per claim is now:", round(np.mean(number_words),0))

### Environmental Claims versus Non-Environmental Claims

In [None]:
# Aggregate only the numeric "word count" column (the raw, pre-filtering count).
# The frame also holds text, spaCy Doc and token-list columns; averaging those
# raises a TypeError on recent pandas versions.
words_by_class = claim_dataset.groupby("label")[["word count"]]

# Mean
print("Average number of words per claim by class:")
display(words_by_class.mean().round())

# Median
print("\nMedian number of words per claim by class:")
display(words_by_class.median())

In [None]:
# One word cloud per class label, drawn side by side

# Collapse each token list into a single space-separated string
claim_dataset["joined tokens"] = claim_dataset["filtered tokens"].apply(" ".join)

fig, (ax_env, ax_non_env) = plt.subplots(1, 2, figsize=(24, 6))

# (axis, label value, colormap, title) for each panel
panels = [
    (ax_env, 1, "Greens", "WordCloud of Environmental Claims"),
    (ax_non_env, 0, "Reds", "WordCloud of Non-Environmental Claims"),
]
for axis, label_value, cmap_name, panel_title in panels:
    # All claims of the class merged into one text
    corpus = " ".join(claim_dataset.loc[claim_dataset["label"] == label_value, "joined tokens"])
    cloud = WordCloud(background_color="white", colormap=cmap_name).generate(corpus)

    axis.imshow(cloud, interpolation="bilinear")
    axis.set(title=panel_title)
    axis.axis("off")

plt.show()

In [None]:
# Ten most frequent tokens per class, rendered as colour-graded tables
frequency_tables = [
    (1, "Most frequent words for environmental claims:", "Greens"),
    (0, "\nMost frequent words for non-environmental claims:", "Reds"),
]
for label_value, heading, cmap_name in frequency_tables:
    class_tokens = claim_dataset.loc[claim_dataset["label"] == label_value, "joined tokens"]
    # Count every whitespace-separated token across the claims of this class
    token_counts = Counter(token for joined in class_tokens for token in str(joined).split())
    table = pd.DataFrame(token_counts.most_common(10))
    table.columns = ["Common Words", "Count"]
    print(heading)
    display(table.style.background_gradient(cmap=cmap_name))

### Studying Energy Claims using N-Grams

In [None]:
# TODO: finish the n-gram analysis of energy claims in this section

In [None]:
# Claims whose filtered text contains the substring "energy" (case-sensitive match)
mentions_energy = claim_dataset["joined tokens"].str.contains("energy")
energy_df = claim_dataset[mentions_energy]
print("In the original dataset, there are", len(energy_df), "claims containing the word 'energy'.")

In [None]:
# Most frequent n-grams of a corpus, given the number to keep and the n-gram size
def top_n_ngram(energy_corpus, n = None, ngram = 3):
    """Return the most frequent n-grams of a corpus together with their counts.

    Parameters
    ----------
    energy_corpus : iterable of str
        Documents in which the n-grams are counted.
    n : int or None
        Number of (n-gram, count) pairs to return; None returns all of them.
    ngram : int
        Size of the n-grams (used as both bounds of `ngram_range`).

    Returns
    -------
    list of (str, count) tuples
        Sorted by decreasing count over the whole corpus.
    """
    vectorizer = CountVectorizer(ngram_range=(ngram, ngram))
    counts_per_claim = vectorizer.fit_transform(energy_corpus)  # n-gram counts for each document
    corpus_totals = counts_per_claim.sum(axis=0)                # n-gram counts over the whole corpus
    ranked = sorted(
        ((term, corpus_totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


# Plotting the results
pop_words = top_n_ngram(energy_df["joined tokens"], 29,3)     # Top 29 3-grams of the energy claims
pop_energy = [t for t in pop_words if "energy" in t[0]]       # Keep the 3-grams that contain 'energy'
df_n = pd.DataFrame(pop_energy, columns = ["Text", "count"])

# Each bar is an n-gram counted over all energy claims, not a single claim, so it
# has no class label of its own. A per-claim colour array taken from
# energy_df["label"] would colour the bars arbitrarily; use one colour instead.
fig, ax = plt.subplots(figsize = (20,5))
df_n.groupby("Text").sum()['count'].sort_values(ascending=False).plot(
    kind = "bar", ax = ax, ylabel = "Count", color = "#4958B5",
    # The number of bars depends on how many of the top 3-grams contain 'energy'
    title = f"{len(df_n)} Most Frequent 3-grams Containing Word 'Energy'")
plt.show()

In [None]:
# Spot-check: entries of the top 3-gram list whose text contains "renewable energy sources"
[t for t in pop_words if "renewable energy sources" in t[0]]

In [None]:
# Spot-check: filtered energy claims that contain the phrase "renewable energy sources"
[t for t in energy_df["joined tokens"] if "renewable energy sources" in t]

### Declaring the train and test datasets for X and y

In [None]:
# Features (claim text) and targets (label) for each split
X_train, y_train = env_claim_train['text'], env_claim_train['label']
X_test, y_test = env_claim_test['text'], env_claim_test['label']
# The validation labels must come from the validation frame so they stay aligned with X_val
X_val, y_val = env_claim_val['text'], env_claim_val['label']

### Exploring Labelled Data and Defining Base Rate

In [None]:
# Class counts for each split: (banner colour, heading, labels)
split_overview = [
    (Colors.OKGREEN, "Train set per class:", y_train),
    (Colors.OKBLUE, "\nTest set per class:", y_test),
    (Colors.OKCYAN, "\nValidation set per class:", y_val),
]
for colour, heading, labels in split_overview:
    print(colour + heading + Colors.ENDC)
    display(labels.value_counts())

# Graphical representation of the class distribution over all splits combined
outcome_variable = pd.concat([y_train, y_test, y_val])
class_totals = outcome_variable.value_counts()
class_totals.plot.bar(color=["#4958B5", "#8DB8B7"], grid=False)
plt.title("Number of Observations per Class")
plt.xlabel("Class")
plt.ylabel("Number of observations")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Base rate: share of label-0 observations across all splits
n_label_zero = int((outcome_variable == 0).sum())
base_rate = round(n_label_zero / len(outcome_variable), 4)
print(f'The base rate is: {base_rate*100:0.2f}%')

### Balancing Labelled Data

In [None]:
def _undersample_zeros(features, labels, seed=7):
    """Keep every label-1 row plus an equally sized random sample of label-0 rows.

    Returns the balanced (features, labels) pair: the label-1 rows come first,
    followed by the sampled label-0 rows.
    """
    is_one = labels == 1
    # Index labels of the "0" outcomes
    zero_index = pd.Series(labels[labels == 0].index)
    # Randomly keep as many "0" outcomes as there are "1" outcomes
    kept_zeros = zero_index.sample(n=sum(is_one), random_state=seed)
    balanced_features = pd.concat([features[is_one], features[kept_zeros]])
    balanced_labels = pd.concat([labels[is_one], labels[kept_zeros]])
    return balanced_features, balanced_labels


# Balanced training, test and validation sets
X_train_bal, y_train_bal = _undersample_zeros(X_train, y_train)
X_test_bal, y_test_bal = _undersample_zeros(X_test, y_test)
X_val_bal, y_val_bal = _undersample_zeros(X_val, y_val)

In [None]:
print("Number of observations per class after balancing the classes:\n")

# Train Set
print(Colors.OKGREEN + "Train set per class" + Colors.ENDC)
display(y_train_bal.value_counts())

# Test Set
print(Colors.OKBLUE + "\nTest set per class" + Colors.ENDC)
display(y_test_bal.value_counts())

# Validation Set
print(Colors.OKCYAN + "\nValidation set per class" + Colors.ENDC)
display(y_val_bal.value_counts())

# Add the lengths of the three splits. Adding the Series themselves performs an
# element-wise sum aligned on index labels, so its length is the number of
# distinct index labels rather than the number of observations.
n_balanced = len(y_train_bal) + len(y_test_bal) + len(y_val_bal)
print("\nThe new balanced dataset contains", n_balanced, "observations.")

## 3 - Modelling

### Text Classification using TF-IDF Vectorization and Logistic Regression

In [None]:
# Create a spaCy tokenizer (parser and NER are disabled in this pipeline)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def spacy_tokenizer(text):
    """Return the lower-cased, stripped lemmas of the alphabetic, non-stop-word tokens in `text`."""
    return [tok.lemma_.lower().strip() for tok in nlp(text) if tok.is_alpha and not tok.is_stop]

# Create a TfidfVectorizer with the spaCy tokenizer.
# `token_pattern=None` states explicitly that the default regex pattern is unused
# when a custom tokenizer is given, which avoids scikit-learn's warning about it.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None,
                             ngram_range=(1, 2), max_df=0.85, min_df=2)

# Create a logistic regression classifier; the seed is pinned (same value as the
# balancing step) so repeated runs fit the same model
clf = LogisticRegression(solver='liblinear', random_state=7)

# Create a pipeline with the vectorizer and classifier
pipe = Pipeline([('vectorizer', vectorizer), ('classifier', clf)])

# Train the model on the training split
pipe.fit(X_train, y_train)

# Predict on the test split
y_pred = pipe.predict(X_test)

# Accuracy on the test split (reported in the next cell)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Report the test-set accuracy and the classification report for the test predictions
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

### Visualizing Word Importance with Word Clouds for Each Class

In [None]:
def plot_word_cloud(classifier, vectorizer, class_index, class_label):
    """Draw a word cloud of the features whose coefficients point towards one class.

    classifier  : fitted linear model exposing `coef_`; only its first row is used.
    vectorizer  : fitted vectorizer providing `get_feature_names_out()`.
    class_index : 0 selects the negative coefficients (plotted with flipped sign, "Reds");
                  any other value selects the positive coefficients ("Greens").
    class_label : text inserted into the figure title.
    """
    feature_names = vectorizer.get_feature_names_out()
    weights = classifier.coef_[0]

    # Word -> magnitude mapping for the requested side of the decision boundary
    wants_negative = class_index == 0
    colormap = "Reds" if wants_negative else "Greens"
    if wants_negative:
        frequencies = {name: -weight for name, weight in zip(feature_names, weights) if weight < 0}
    else:
        frequencies = {name: weight for name, weight in zip(feature_names, weights) if weight > 0}

    # Build the cloud with word sizes driven by the coefficient magnitudes
    cloud = WordCloud(width=800, height=400, background_color="white", colormap=colormap)
    cloud.generate_from_frequencies(frequencies)

    # Render it
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for {class_label}", fontsize=20)
    plt.show()

# Create the word clouds for each class.
# The titles follow the class labels used in the rest of the notebook
# (0 = non-environmental claim, 1 = environmental claim).
plot_word_cloud(clf, vectorizer, 0, "Non-Environmental Claims")
plot_word_cloud(clf, vectorizer, 1, "Environmental Claims")


## Environment description

In [None]:
# Load the watermark extension and report version information for the listed packages
%load_ext watermark
%watermark -v -p pandas,numpy,sklearn,datasets,spacy,wordcloud