In [1]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset


2024-04-29 13:16:36.157224: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def asignar_pilar(texto, diccionario_pilares):
    for pilar, palabras in diccionario_pilares.items():
        if any(palabra in texto.lower() for palabra in palabras):
            return pilar
    return "Other"

In [3]:
def label_to_num(label):
    label_dict = {"Environmental": 0, "Social Internal": 1, "Social External": 2, "Governance": 3, "Other": 4}
    return label_dict.get(label, 4)  # Default to 4 si no se encuentra

In [4]:
class ReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [19]:
df=pd.read_csv("data/data_processed_webster.csv")
df.head()

Unnamed: 0,Review,Bank Name,data_source,Bank ID,state,Sentiment_gen,POSITIVE_Count,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds
0,Mere words cannot express the deep felt gratit...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.870884,0.027722,0.098281,0.003113,0.920025,2024-04-18
1,i came across Mrs. Jennifer when she appeared ...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.672443,0.000645,0.32682,9.2e-05,0.835853,2024-03-22
2,"Excellent Service""! Accountant is kind and ver...",Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.990623,5.8e-05,0.009292,2.6e-05,0.99527,2024-01-07
3,Good brokerage and trading platform. I love th...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.992917,5.9e-05,0.006926,9.8e-05,0.99638,2023-10-29
4,REGINA NATHAN is a TOP NOTCH investing compan...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.947814,0.000749,0.05127,0.000167,0.973449,2023-09-20


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11395 entries, 0 to 15966
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Review           11395 non-null  object 
 1   Bank Name        11395 non-null  object 
 2   data_source      11395 non-null  object 
 3   Bank ID          11395 non-null  int64  
 4   state            11395 non-null  object 
 5   Sentiment_gen    11395 non-null  object 
 6   POSITIVE_Count   11395 non-null  int64  
 7   Positive_gen     11395 non-null  float64
 8   Negative_gen     11395 non-null  float64
 9   Neutral_gen      11395 non-null  float64
 10  Mixed_gen        11395 non-null  float64
 11  Sentiment_Score  11395 non-null  float64
 12  ds               11395 non-null  object 
dtypes: float64(5), int64(2), object(6)
memory usage: 1.2+ MB


In [21]:
df.dropna(inplace=True)
df.reset_index(drop=True)

Unnamed: 0,Review,Bank Name,data_source,Bank ID,state,Sentiment_gen,POSITIVE_Count,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds
0,Mere words cannot express the deep felt gratit...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.870884,0.027722,0.098281,0.003113,0.920025,2024-04-18
1,i came across Mrs. Jennifer when she appeared ...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.672443,0.000645,0.326820,0.000092,0.835853,2024-03-22
2,"Excellent Service""! Accountant is kind and ver...",Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.990623,0.000058,0.009292,0.000026,0.995270,2024-01-07
3,Good brokerage and trading platform. I love th...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.992917,0.000059,0.006926,0.000098,0.996380,2023-10-29
4,REGINA NATHAN is a TOP NOTCH investing compan...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.947814,0.000749,0.051270,0.000167,0.973449,2023-09-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11390,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.004506,0.008736,0.986738,0.000020,0.497875,2024-04-24
11391,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.003693,0.008020,0.988272,0.000016,0.497829,2024-04-24
11392,"April 24, 2024 at 16:41 PM EDT Webster Financ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.005492,0.005953,0.988536,0.000018,0.499761,2024-04-24
11393,(RTTNews) - Webster Financial Corporation (WBS...,Webster Bank,news,24,EEUU,NEUTRAL,0,0.001876,0.012448,0.985664,0.000012,0.494708,2024-04-24


In [23]:
# Extended Keywords for Each Pillar

# Environmental Pillar Keywords
environmental_keywords = [
    "eco", "green", "sustainable", "conservation", "energy", "emissions", "climate", 
    "biodiversity", "organic", "renewable", "carbon", "recycle", "waste", "pollution", 
    "natural", "resources", "efficiency", "environmental", "protection", "wildlife", 
    "solar", "wind", "geothermal", "water", "forest", "air quality", "clean tech", "habitat", 
    "earth", "ozone", "footprint", "eco-friendly", "afforestation", "air quality", 
    "alternative energy", "animal welfare", "aquaculture", "biophilic", "carbon-neutral", 
    "clean energy", "climate policy", "composting", "deforestation", "eco-design", 
    "eco-innovation", "ecological", "emissions reduction", "endangered species", 
    "energy conservation", "energy-saving", "environmental audit", "environmental footprint", 
    "environmental justice", "green building", "green technology", "habitat conservation", 
    "land preservation", "marine conservation", "natural resource management", "ocean protection", 
    "organic farming", "ozone layer", "plant-based", "pollution prevention", "renewable resources", 
    "soil health", "solar panels", "sustainability plan", "sustainable agriculture", "toxic waste", 
    "urban greening", "vegetation", "waste reduction", "water quality", "zero emissions"
]

# Social Internal Pillar Keywords
social_internal_keywords = [
    "employee", "staff", "workplace", "benefits", "training", "compensation", "salary", 
    "corporate culture", "management", "leadership", "objectives", "career", "growth", 
    "performance", "diversity", "equality", "inclusion", "balance", "skills", 
    "loyalty", "retention", "recommendation", "supplies", "mission", "values", 
    "recognition", "feedback", "health", "safety", "protocols", "job security", 
    "accessibility", "career ladder", "collaborative work", "company retreats", "diversity training", 
    "employee advocacy", "employee engagement", "employee equity", "employee feedback", 
    "employee mentorship", "employee recognition", "employee survey", "employer branding", 
    "empowerment", "ergonomic", "fair labor", "gender balance", "health benefits", 
    "inclusive culture", "job enrichment", "labor practices", "leadership development", 
    "mental health", "organizational culture", "parental leave", "participative management", 
    "pay equity", "peer recognition", "personal development", "professional development", 
    "promotion from within", "talent development", "team building", "training opportunities", 
    "work environment", "workforce development"
]

# Social External Pillar Keywords
social_external_keywords = [
    "community", "partnership", "development", "social responsibility", "supplier", 
    "customer", "service", "brand", "reputation", "quality", "satisfaction", 
    "complaints", "public", "market", "payment", "contract", "business", "trade", 
    "commerce", "local", "employment", "CSR", "outreach", "engagement", "donation", 
    "philanthropy", "stakeholder", "accessible services", "charitable giving", "civil society", 
    "community support", "corporate philanthropy", "customer care", "economic development", 
    "ethical trade", "fair trade", "global health", "human rights", "local communities", 
    "public welfare", "social equity", "social impact", "social innovation", "stakeholder dialog", 
    "volunteerism"
]

# Governance Pillar Keywords
governance_keywords = [
    "compliance", "regulation", "policy", "standards", "integrity", "ethics", 
    "corporate governance", "audit", "law", "board", "risk management", "transparency", 
    "accountability", "best practices", "due diligence", "control", "oversight", 
    "legal", "reporting", "conduct", "trust", "corruption", "bribery", "whistleblower", 
    "data protection", "privacy", "security", "corporate law", "corporate policy", 
    "corporate responsibility", "financial regulations", "governance structure", 
    "internal controls", "risk assessment", "shareholder engagement", "sustainability reporting"
]



In [24]:
diccionario_pilares = {
    'Environmental': environmental_keywords,
    'Social Internal': social_internal_keywords,
    'Social External': social_external_keywords,
    'Governance': governance_keywords
}

df['Pilar'] = df['Review'].apply(asignar_pilar, args=(diccionario_pilares,))


In [25]:
df['Pilar'].value_counts()

Pilar
Other              4558
Social External    3208
Social Internal    1967
Environmental      1490
Governance          172
Name: count, dtype: int64

In [18]:
sentiment_mapping = {'POSITIVE': 1, 'NEGATIVE': -1, 'NEUTRAL': 0, 'MIXED': 0}
df['Sentiment_Value'] = df['MajoritySentiment'].map(sentiment_mapping)

aggregated_sentiments = df.groupby('Pilar')['Sentiment_Value'].mean().reset_index()

aggregated_sentiments_by_bank = df.groupby(['Bank Name', 'Pilar'])['Sentiment_Value'].mean().unstack(fill_value=0)

df_train=df[df['Pilar']!='Other'].reset_index(drop=True)

In [19]:
df_train['pilar_num'] = df_train['Pilar'].apply(label_to_num)

train_texts, val_texts, train_labels, val_labels = train_test_split(df_train['Review'], df_train['pilar_num'], test_size=.3)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts.to_list(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts.to_list(), truncation=True, padding=True, max_length=512)

train_dataset = ReviewsDataset(train_encodings, train_labels.to_list())
val_dataset = ReviewsDataset(val_encodings, val_labels.to_list())

In [20]:
train_labels.value_counts()

pilar_num
2    534
1    349
0    346
3     14
Name: count, dtype: int64

In [33]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
)

# Inicializar el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Entrenar el modelo
trainer.train()




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=234, training_loss=0.9757162240835336, metrics={'train_runtime': 3085.5436, 'train_samples_per_second': 1.209, 'train_steps_per_second': 0.076, 'total_flos': 981158743977984.0, 'train_loss': 0.9757162240835336, 'epoch': 3.0})

In [35]:
torch.save(model, "./complete_model.pth")

In [36]:
trainer.save_model("./model")


In [34]:
tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json')

In [26]:
# Si guardaste el modelo completo
model = torch.load("./complete_model.pth")
tokenizer = BertTokenizer.from_pretrained("./tokenizer")


In [28]:
df=pd.read_csv("data/data_processed_webster.csv")
df.head()

Unnamed: 0,Review,Bank Name,data_source,Bank ID,state,Sentiment_gen,POSITIVE_Count,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds
0,Mere words cannot express the deep felt gratit...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.870884,0.027722,0.098281,0.003113,0.920025,2024-04-18
1,i came across Mrs. Jennifer when she appeared ...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.672443,0.000645,0.32682,9.2e-05,0.835853,2024-03-22
2,"Excellent Service""! Accountant is kind and ver...",Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.990623,5.8e-05,0.009292,2.6e-05,0.99527,2024-01-07
3,Good brokerage and trading platform. I love th...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.992917,5.9e-05,0.006926,9.8e-05,0.99638,2023-10-29
4,REGINA NATHAN is a TOP NOTCH investing compan...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.947814,0.000749,0.05127,0.000167,0.973449,2023-09-20


In [29]:
df.dropna(inplace=True)
df.reset_index(drop=True)

Unnamed: 0,Review,Bank Name,data_source,Bank ID,state,Sentiment_gen,POSITIVE_Count,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds
0,Mere words cannot express the deep felt gratit...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.870884,0.027722,0.098281,0.003113,0.920025,2024-04-18
1,i came across Mrs. Jennifer when she appeared ...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.672443,0.000645,0.326820,0.000092,0.835853,2024-03-22
2,"Excellent Service""! Accountant is kind and ver...",Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.990623,0.000058,0.009292,0.000026,0.995270,2024-01-07
3,Good brokerage and trading platform. I love th...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.992917,0.000059,0.006926,0.000098,0.996380,2023-10-29
4,REGINA NATHAN is a TOP NOTCH investing compan...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.947814,0.000749,0.051270,0.000167,0.973449,2023-09-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11390,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.004506,0.008736,0.986738,0.000020,0.497875,2024-04-24
11391,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.003693,0.008020,0.988272,0.000016,0.497829,2024-04-24
11392,"April 24, 2024 at 16:41 PM EDT Webster Financ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.005492,0.005953,0.988536,0.000018,0.499761,2024-04-24
11393,(RTTNews) - Webster Financial Corporation (WBS...,Webster Bank,news,24,EEUU,NEUTRAL,0,0.001876,0.012448,0.985664,0.000012,0.494708,2024-04-24


In [30]:
diccionario_pilares = {
    'Environmental': environmental_keywords,
    'Social Internal': social_internal_keywords,
    'Social External': social_external_keywords,
    'Governance': governance_keywords
}

df['Pilar'] = df['Review'].apply(asignar_pilar, args=(diccionario_pilares,))

In [31]:
# Asumiendo que df['Review'] contiene los textos a evaluar
texts = df['Review'].tolist()
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")


In [32]:
from torch.utils.data import DataLoader, TensorDataset

# Convertir las entradas tokenizadas en un TensorDataset
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])

# Crear DataLoader
data_loader = DataLoader(dataset, batch_size=16)  # Ajusta el tamaño del lote según tu configuración y recursos


In [34]:
from tqdm import tqdm  # Importa tqdm para la barra de progreso

model.eval()  # Poner el modelo en modo de evaluación
pilares_predicciones = []

# Envolver data_loader con tqdm para visualizar la barra de progreso
for batch in tqdm(data_loader, desc="Evaluating", unit="batch"):
    input_ids, attention_mask = batch
    with torch.no_grad():  # Desactivar el cálculo del gradiente para la inferencia
        outputs = model(input_ids.to(model.device), attention_mask=attention_mask.to(model.device))
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        pilares_predicciones.extend(predictions.cpu().tolist())

# Convertir las predicciones numéricas a etiquetas de pilares
pilar_labels = {0: "Environmental", 1: "Social Internal", 2: "Social External", 3: "Governance", 4: "Other"}  # Ajustar según tus etiquetas
df['Predicted_Pilar'] = [pilar_labels[pred] for pred in pilares_predicciones]


Evaluating:   7%|▋         | 47/713 [04:50<1:08:42,  6.19s/batch]


KeyboardInterrupt: 

In [None]:
df

In [10]:
df['Predicted_Pilar'].value_counts()

Predicted_Pilar
Social External    8469
Social Internal    1836
Environmental      1099
Name: count, dtype: int64

In [23]:
df['Bank Name'].value_counts()

Bank Name
Fairfield County Bank    11238
Chelsea Gronto Bank         50
M&T Bank                    34
Webster Bank                28
Citizens Bank               22
Fifth Third Bank            12
First Horizon Bank          10
Dime Bank                   10
Name: count, dtype: int64

In [20]:
filtro=df['Pilar']=='Governance'
df['Predicted_Pilar'][filtro]='Governance'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Predicted_Pilar'][filtro]='Governance'


In [21]:
df[filtro]

Unnamed: 0,Review,Bank Name,data_source,Bank ID,Sentiment_gen,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds,Predicted_Pilar,Pilar
11,"Invest with Mathew navas now if you,re into cr...",Chelsea Gronto Bank,facebook,2,POSITIVE,0.607544,0.000665,0.391603,0.000188,0.803345,2023-10-20,Governance,Governance
97,The ladies here are great! I just recently ope...,Fairfield County Bank,maps,5,POSITIVE,0.989453,0.000139,0.009808,0.000600,0.994357,2023-03-23,Governance,Governance
105,Predatory bank! Beware! The take between 12-15...,Fairfield County Bank,maps,5,NEGATIVE,0.000668,0.967228,0.031462,0.000642,0.016399,2022-04-25,Governance,Governance
110,Bank personnel are outstanding people to deal ...,Fairfield County Bank,maps,5,POSITIVE,0.607394,0.027369,0.348821,0.016416,0.781805,2022-07-24,Governance,Governance
436,Was great help for me. They always verify it's...,Fairfield County Bank,maps,5,POSITIVE,0.998170,0.000220,0.001548,0.000061,0.998944,2021-09-19,Governance,Governance
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11153,This is the only Webster location that follows...,Fairfield County Bank,maps,5,NEUTRAL,0.032341,0.269307,0.470611,0.227742,0.267646,2015-11-25,Governance,Governance
11164,Went to cash check from selling my daughters c...,Fairfield County Bank,maps,5,NEGATIVE,0.018235,0.773483,0.204261,0.004021,0.120365,2019-04-12,Governance,Governance
11288,Opened both a checking and savings account to ...,Citizens Bank,yelp,21,NEUTRAL,0.065115,0.163499,0.770744,0.000642,0.450488,2022-05-09,Governance,Governance
11320,I usually only go to the ATM at this location....,M&T Bank,yelp,20,POSITIVE,0.982948,0.002925,0.004317,0.009809,0.985107,2019-10-11,Governance,Governance


In [22]:
df.to_csv('data_procesa_inferencia_webster.csv',index=False)

In [4]:
import pandas as pd
df=pd.read_csv('data_procesa_inferencia.csv')

In [8]:
df['Positive_gen'][df['Bank ID']==19].mean()

0.6095868099158451