In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline, BertTokenizer
from importlib import reload
import text_cleaner  # Import the module first
text_cleaner = reload(text_cleaner)  # Reload the module
from text_cleaner import TextCleaner  

In [None]:
# Load both sentence sources, keep only the text and date-like columns, and
# rename them to the shared 'Text' / 'Date' schema.
df_news = (
    pd.read_csv('data/google-news-sentences.csv')[['content', 'date']]
    .rename(columns={'content': 'Text', 'date': 'Date'})
)
df_reports = (
    pd.read_csv('data/reports_sentences.csv')[['Sentence', 'Year']]
    .rename(columns={'Sentence': 'Text', 'Year': 'Date'})
)

# Stack the news rows on top of the report rows under a fresh 0..n-1 index.
df_combined = pd.concat([df_news, df_reports], ignore_index=True)

df_combined

In [None]:
# Set aside the rows that have no text (kept in `nan_rows` for inspection),
# then remove them from the combined frame in place.
text_is_missing = df_combined['Text'].isna()
nan_rows = df_combined[text_is_missing]
df_combined.dropna(subset=['Text'], inplace=True)

## Pre-trained transformer: ESGBERT
### Baseline 

In [None]:
def _load_esg_pipeline(model_name):
    """Load one checkpoint and wrap it in a text-classification pipeline.

    Parameters
    ----------
    model_name : str
        Identifier passed to ``from_pretrained`` for both the tokenizer and
        the sequence-classification model.

    Returns
    -------
    tuple
        ``(tokenizer, model, pipe)`` where ``pipe`` is the
        ``"text-classification"`` pipeline built from the other two.
    """
    esg_tokenizer = AutoTokenizer.from_pretrained(model_name)
    esg_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    esg_pipe = pipeline("text-classification", model=esg_model, tokenizer=esg_tokenizer)
    return esg_tokenizer, esg_model, esg_pipe


# Environmental model
env_model_name = "ESGBERT/EnvironmentalBERT-environmental"
env_tokenizer, env_model, pipe_env = _load_esg_pipeline(env_model_name)

# Social model
soc_model_name = "ESGBERT/SocialBERT-social"
soc_tokenizer, soc_model, pipe_soc = _load_esg_pipeline(soc_model_name)

# Governance model
gov_model_name = "ESGBERT/GovernanceBERT-governance"
gov_tokenizer, gov_model, pipe_gov = _load_esg_pipeline(gov_model_name)

# Generic BERT tokenizer, loaded separately from the three checkpoint-specific
# tokenizers above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def is_long_sentence(sentence, max_length=512, tokenizer=None):
    """Report whether a sentence would exceed a classifier's input-length limit.

    Parameters
    ----------
    sentence : str
        Text to measure.
    max_length : int, default 512
        Largest token count, special tokens included, that is still accepted.
    tokenizer : optional
        Object exposing ``tokenize(text)`` and
        ``num_special_tokens_to_add(pair=False)``. Defaults to
        ``env_tokenizer``, the tokenizer behind ``pipe_env``, so the count is
        taken with a classifier's own vocabulary rather than the unrelated
        ``bert-base-uncased`` one.

    Returns
    -------
    bool
        ``True`` when the token count plus special tokens is greater than
        ``max_length``.
    """
    if tokenizer is None:
        # NOTE(review): assumes the social and governance checkpoints tokenize
        # the same way as the environmental one - confirm.
        tokenizer = env_tokenizer
    # Special tokens added around a single sequence also count toward the limit.
    n_special = tokenizer.num_special_tokens_to_add(pair=False)
    return len(tokenizer.tokenize(sentence)) + n_special > max_length
# Flag every row of the 'Text' column, then keep only the sentences that fit.
df_combined['is_long'] = df_combined['Text'].apply(is_long_sentence)
# .copy() makes the filtered result an independent frame, so assigning columns
# to it in later cells does not trigger SettingWithCopyWarning.
df_combined = df_combined.loc[~df_combined['is_long'].astype(bool)].copy()
df_combined

In [None]:
# Instantiate the cleaner from the local text_cleaner module and run its
# remove_special_characters method over every sentence in 'Text'.
cleaner = TextCleaner()
cleaned_text = df_combined['Text'].apply(cleaner.remove_special_characters)
df_combined['Text'] = cleaned_text

In [None]:
def _positive_score(result):
    """Return the top prediction's score, or 0 if there is none or its label is 'none'."""
    if result and result[0]['label'] != 'none':
        return result[0]['score']
    return 0


def classify_text(text, pipes=None, max_length=512):
    """Assign a text to 'Environmental', 'Social', 'Governance' or 'General'.

    Each of the three classifiers scores the text. A classifier whose top
    label is 'none' contributes a score of 0. The category with the highest
    score wins; ties go to the earlier category in the order Environmental,
    Social, Governance. If every score is 0 the text is labeled 'General'.

    Parameters
    ----------
    text : str
        Text to classify.
    pipes : sequence of callables, optional
        Three classifiers in (environmental, social, governance) order. Each
        is called as ``pipe(text, truncation=True, max_length=max_length)``
        and must return a list whose first item is a dict with 'label' and
        'score'. Defaults to ``(pipe_env, pipe_soc, pipe_gov)``.
    max_length : int, default 512
        Token limit forwarded to each classifier together with
        ``truncation=True``, so an over-length text is cut down instead of
        raising inside the model.

    Returns
    -------
    str
        One of 'Environmental', 'Social', 'Governance', 'General'.
    """
    if pipes is None:
        pipes = (pipe_env, pipe_soc, pipe_gov)

    # Insertion order matters: max() returns the first key among equal scores.
    categories = ('Environmental', 'Social', 'Governance')
    scores = {
        category: _positive_score(pipe(text, truncation=True, max_length=max_length))
        for category, pipe in zip(categories, pipes)
    }

    # No classifier produced a positive (non-'none') prediction.
    if all(score == 0 for score in scores.values()):
        return 'General'

    return max(scores, key=scores.get)

# Build `test_df` from the prepared sentences so this cell runs on a fresh
# kernel; no earlier cell defines it.
# NOTE(review): confirm that labeling the full combined frame (rather than a
# sample) is the intended input.
test_df = df_combined.copy()
test_df['ESG_Category'] = test_df['Text'].apply(classify_text)
test_df

In [None]:
# Write the labeled sentences to disk (row index included, as by default).
labeled_output_path = 'data/labeled_esg_text.csv'
test_df.to_csv(labeled_output_path)