In [27]:
import pandas as pd
from importlib import reload
import sys
sys.path.append('../') 
import text_cleaner  # Import the module first
text_cleaner = reload(text_cleaner)  # Reload the module
from text_cleaner import TextCleaner 
from joblib import dump

from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

## Data loading

In [16]:
df = pd.read_csv('../data/labeled_esg_text.csv')
df = df[['Text','ESG_Category']]
df

Unnamed: 0,Text,ESG_Category
0,"Historically, this category has included emiss...",Environmental
1,We benefited from the sponsorship of Rich Less...,General
2,Direct Air Capture (DAC) technology startup He...,Environmental
3,IT teams implementing HCI in a Windows Server ...,General
4,helps broaden students access to technology by...,General
...,...,...
19208,These governmental standards dictate ES Grelat...,Environmental
19209,Microsoft’s Cloud for Sustainability’s latest ...,Environmental
19210,Read more about what’s upcoming on our Microso...,General
19211,Thinking outside the factory That means manufa...,Environmental


## Text Preprocessing

In [17]:
cleaner = TextCleaner() 
# Apply the text-cleaning methods
df['cleaned_text'] = df['Text'].apply(cleaner.remove_special_characters)
df['cleaned_text'] = df['cleaned_text'].apply(cleaner.lowercase_text)
df['cleaned_text'] = df['cleaned_text'].apply(cleaner.remove_entities)
df['cleaned_text'] = df['cleaned_text'].apply(cleaner.remove_punctuation)
df['cleaned_text'] = df['cleaned_text'].apply(cleaner.remove_stopwords)
df

Unnamed: 0,Text,ESG_Category,cleaned_text
0,"Historically, this category has included emiss...",Environmental,historically category included emissions comme...
1,We benefited from the sponsorship of Rich Less...,General,benefited sponsorship rich lesser global chair...
2,Direct Air Capture (DAC) technology startup He...,Environmental,direct air capture dac technology startup heir...
3,IT teams implementing HCI in a Windows Server ...,General,teams implementing hci windows server deployme...
4,helps broaden students access to technology by...,General,helps broaden students access technology makin...
...,...,...,...
19208,These governmental standards dictate ES Grelat...,Environmental,governmental standards dictate es grelated act...
19209,Microsoft’s Cloud for Sustainability’s latest ...,Environmental,’s cloud sustainability ’s latest release also...
19210,Read more about what’s upcoming on our Microso...,General,read ’s upcoming microsoft industry clouds doc...
19211,Thinking outside the factory That means manufa...,Environmental,thinking outside factory means manufacturers s...


## Train-test-split

In [20]:
X = df['cleaned_text']  # Make sure it's a list of lists
y = df['ESG_Category']

# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Print the number of elements in training and testing sets
print('Training Data Length:', len(X_train))
print('Testing Data Length:', len(X_test))

Training Data Length: 13449
Testing Data Length: 5764


## Feature Extraction

In [23]:
tfidf = TfidfVectorizer(lowercase=False)

# Fit on the training data and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the testing data using the same vectorizer (do not fit again)
X_test_tfidf = tfidf.transform(X_test)
# Assuming 'tfidf' is your fitted vectorizer
dump(tfidf, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']

## 1. Decision Tree

In [37]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=123)
dt.fit(X_train_tfidf, y_train)

y_pred_train = dt.predict(X_train_tfidf)
y_pred_test = dt.predict(X_test_tfidf)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 0.9997025801174808
Testing Accuracy score: 0.8648507980569049


In [38]:
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Create a DataFrame with correct ordering of categories
df_conf_matrix = pd.DataFrame(conf_matrix,
                              index=[f"Actual {cls}" for cls in dt.classes_],
                              columns=[f"Predicted {cls}" for cls in dt.classes_])
df_conf_matrix

Unnamed: 0,Predicted Environmental,Predicted General,Predicted Governance,Predicted Social
Actual Environmental,1693,89,0,68
Actual General,90,2332,18,187
Actual Governance,2,22,56,11
Actual Social,104,183,5,904


## 2. Logistic Regression

In [39]:
lr = LogisticRegression(random_state=123)
lr.fit(X_train_tfidf, y_train)

y_pred_train = lr.predict(X_train_tfidf)
y_pred_test = lr.predict(X_test_tfidf)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 0.9307755223436687
Testing Accuracy score: 0.8721374045801527


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Create a DataFrame with correct ordering of categories
df_conf_matrix = pd.DataFrame(conf_matrix,
                              index=[f"Actual {cls}" for cls in lr.classes_],
                              columns=[f"Predicted {cls}" for cls in lr.classes_])
df_conf_matrix

Unnamed: 0,Predicted Environmental,Predicted General,Predicted Governance,Predicted Social
Actual Environmental,1584,203,1,62
Actual General,35,2538,0,54
Actual Governance,0,50,31,10
Actual Social,85,235,2,874


## 3. Support Vector Machine Classifier

In [41]:
svc =  LinearSVC(class_weight='balanced')
svc.fit(X_train_tfidf, y_train)

y_pred_train = svc.predict(X_train_tfidf)
y_pred_test = svc.predict(X_test_tfidf)
print("\nTraining Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))


Training Accuracy score: 0.9861699754628597
Testing Accuracy score: 0.8839347675225537


In [42]:
conf_matrix = confusion_matrix(y_test, y_pred_test)

# Create a DataFrame with correct ordering of categories
df_conf_matrix = pd.DataFrame(conf_matrix,
                              index=[f"Actual {cls}" for cls in svc.classes_],
                              columns=[f"Predicted {cls}" for cls in svc.classes_])
df_conf_matrix

Unnamed: 0,Predicted Environmental,Predicted General,Predicted Governance,Predicted Social
Actual Environmental,1658,123,1,68
Actual General,67,2409,9,142
Actual Governance,0,19,64,8
Actual Social,76,146,10,964


## 4. Ensembling

In [43]:
from sklearn.ensemble import VotingClassifier

classifiers = [('Decision Tree', dt),
               ('Logistic Regression', lr),
                ('Support Vector machine', svc)
              ]
vc = VotingClassifier(estimators=classifiers)
# Fit 'vc' to the traing set and predict test set labels
vc.fit(X_train_tfidf, y_train)
y_pred_train=vc.predict(X_train_tfidf)
y_pred_test = vc.predict(X_test_tfidf)
print("Training Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy score: 0.988177559669864
Testing Accuracy score: 0.8941707147814018


In [45]:
# Generate a classification report to evaluate the model
print(classification_report(y_test, y_pred_test))

# Calculate and print the accuracy of the model
accuracy = accuracy_score(y_test, y_pred_test)
print(f"Accuracy: {accuracy:.2f}")

               precision    recall  f1-score   support

Environmental       0.93      0.90      0.91      1850
      General       0.88      0.95      0.92      2627
   Governance       0.90      0.59      0.72        91
       Social       0.87      0.78      0.82      1196

     accuracy                           0.89      5764
    macro avg       0.89      0.81      0.84      5764
 weighted avg       0.89      0.89      0.89      5764

Accuracy: 0.89


## Model Saving

In [44]:
from joblib import dump, load

# Save the model to disk
model_filename = 'voting_classifier_model.joblib'
dump(vc, model_filename)
print("Model saved to", model_filename)

Model saved to voting_classifier_model.joblib
