In [1]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import gc
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm  # Importa tqdm para la barra de progreso
# Run a garbage-collection pass and release PyTorch's cached CUDA memory
# before any model or data is loaded in this session.
gc.collect()
torch.cuda.empty_cache()


2024-04-29 13:30:20.092504: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def asignar_pilar(texto, diccionario_pilares, palabra_completa=False):
    """Assign a pillar to a text using keyword lookup.

    Pillars are tried in the iteration order of ``diccionario_pilares`` and
    the first pillar with at least one matching keyword wins.

    Args:
        texto: Text to classify. Anything that is not a ``str`` (for example
            ``None`` or a NaN float) is classified as ``"Other"``.
        diccionario_pilares: Mapping ``pillar name -> iterable of keywords``.
        palabra_completa: When ``False`` (default) a keyword matches anywhere
            inside the text, so ``"eco"`` also matches ``"recommend"``. When
            ``True`` a keyword only matches if it is not directly preceded or
            followed by a word character.

    Returns:
        The name of the first matching pillar, or ``"Other"`` if none match.
    """
    import re  # local import so the cell does not depend on another cell's imports

    if not isinstance(texto, str):
        return "Other"

    # Lower-case the text once instead of once per keyword.
    texto_normalizado = texto.lower()

    for pilar, palabras in diccionario_pilares.items():
        for palabra in palabras:
            # Keywords are normalised too; otherwise an upper-case entry such
            # as "CSR" could never match the lower-cased text.
            clave = palabra.lower()
            if palabra_completa:
                patron = r"(?<!\w)" + re.escape(clave) + r"(?!\w)"
                coincide = re.search(patron, texto_normalizado) is not None
            else:
                coincide = clave in texto_normalizado
            if coincide:
                return pilar
    return "Other"

In [3]:
def label_to_num(label):
    """Translate a pillar name into its integer class id.

    Known names map to 0-4 in the order listed below; any other value
    falls back to 4, the id shared with "Other".
    """
    pilares = ["Environmental", "Social Internal", "Social External", "Governance", "Other"]
    codigo_por_pilar = {nombre: codigo for codigo, nombre in enumerate(pilares)}
    if label in codigo_por_pilar:
        return codigo_por_pilar[label]
    return 4

In [4]:
class ReviewsDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    ``encodings`` is a mapping whose values are indexable by example
    position; ``labels`` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field plus its label, as tensors.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [5]:
# Extended Keywords for Each Pillar

# Environmental Pillar Keywords
# Each entry appears exactly once (the original list repeated "air quality").
# NOTE(review): asignar_pilar matches these as substrings of the lower-cased
# text, so short entries such as "eco" or "wind" also hit longer words
# ("recommend", "window") — confirm this is intended.
environmental_keywords = [
    "eco", "green", "sustainable", "conservation", "energy", "emissions", "climate",
    "biodiversity", "organic", "renewable", "carbon", "recycle", "waste", "pollution",
    "natural", "resources", "efficiency", "environmental", "protection", "wildlife",
    "solar", "wind", "geothermal", "water", "forest", "air quality", "clean tech", "habitat",
    "earth", "ozone", "footprint", "eco-friendly", "afforestation",
    "alternative energy", "animal welfare", "aquaculture", "biophilic", "carbon-neutral",
    "clean energy", "climate policy", "composting", "deforestation", "eco-design",
    "eco-innovation", "ecological", "emissions reduction", "endangered species",
    "energy conservation", "energy-saving", "environmental audit", "environmental footprint",
    "environmental justice", "green building", "green technology", "habitat conservation",
    "land preservation", "marine conservation", "natural resource management", "ocean protection",
    "organic farming", "ozone layer", "plant-based", "pollution prevention", "renewable resources",
    "soil health", "solar panels", "sustainability plan", "sustainable agriculture", "toxic waste",
    "urban greening", "vegetation", "waste reduction", "water quality", "zero emissions"
]

# Social Internal Pillar Keywords
# Employee- and workplace-related terms. asignar_pilar looks these up as
# substrings of the lower-cased review text (via diccionario_pilares).
social_internal_keywords = [
    "employee", "staff", "workplace", "benefits", "training", "compensation", "salary", 
    "corporate culture", "management", "leadership", "objectives", "career", "growth", 
    "performance", "diversity", "equality", "inclusion", "balance", "skills", 
    "loyalty", "retention", "recommendation", "supplies", "mission", "values", 
    "recognition", "feedback", "health", "safety", "protocols", "job security", 
    "accessibility", "career ladder", "collaborative work", "company retreats", "diversity training", 
    "employee advocacy", "employee engagement", "employee equity", "employee feedback", 
    "employee mentorship", "employee recognition", "employee survey", "employer branding", 
    "empowerment", "ergonomic", "fair labor", "gender balance", "health benefits", 
    "inclusive culture", "job enrichment", "labor practices", "leadership development", 
    "mental health", "organizational culture", "parental leave", "participative management", 
    "pay equity", "peer recognition", "personal development", "professional development", 
    "promotion from within", "talent development", "team building", "training opportunities", 
    "work environment", "workforce development"
]

# Social External Pillar Keywords
# All entries are lower-case because asignar_pilar compares them against the
# lower-cased review text; the original upper-case "CSR" could never match.
social_external_keywords = [
    "community", "partnership", "development", "social responsibility", "supplier",
    "customer", "service", "brand", "reputation", "quality", "satisfaction",
    "complaints", "public", "market", "payment", "contract", "business", "trade",
    "commerce", "local", "employment", "csr", "outreach", "engagement", "donation",
    "philanthropy", "stakeholder", "accessible services", "charitable giving", "civil society",
    "community support", "corporate philanthropy", "customer care", "economic development",
    "ethical trade", "fair trade", "global health", "human rights", "local communities",
    "public welfare", "social equity", "social impact", "social innovation", "stakeholder dialog",
    "volunteerism"
]

# Governance Pillar Keywords
# Compliance-, ethics- and oversight-related terms. asignar_pilar looks these
# up as substrings of the lower-cased review text (via diccionario_pilares).
governance_keywords = [
    "compliance", "regulation", "policy", "standards", "integrity", "ethics", 
    "corporate governance", "audit", "law", "board", "risk management", "transparency", 
    "accountability", "best practices", "due diligence", "control", "oversight", 
    "legal", "reporting", "conduct", "trust", "corruption", "bribery", "whistleblower", 
    "data protection", "privacy", "security", "corporate law", "corporate policy", 
    "corporate responsibility", "financial regulations", "governance structure", 
    "internal controls", "risk assessment", "shareholder engagement", "sustainability reporting"
]


In [6]:
# Load the model that was saved as a complete pickled object, plus its tokenizer.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects; only load checkpoints that come from a trusted source.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.load("./complete_model.pth", map_location=device, weights_only=False)
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained("./tokenizer")

In [7]:
# Read the pre-processed reviews and preview the first rows.
df = pd.read_csv("data/data_processed_webster.csv")
df.head()

Unnamed: 0,Review,Bank Name,data_source,Bank ID,state,Sentiment_gen,POSITIVE_Count,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds
0,Mere words cannot express the deep felt gratit...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.870884,0.027722,0.098281,0.003113,0.920025,2024-04-18
1,i came across Mrs. Jennifer when she appeared ...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.672443,0.000645,0.32682,9.2e-05,0.835853,2024-03-22
2,"Excellent Service""! Accountant is kind and ver...",Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.990623,5.8e-05,0.009292,2.6e-05,0.99527,2024-01-07
3,Good brokerage and trading platform. I love th...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.992917,5.9e-05,0.006926,9.8e-05,0.99638,2023-10-29
4,REGINA NATHAN is a TOP NOTCH investing compan...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.947814,0.000749,0.05127,0.000167,0.973449,2023-09-20


In [8]:
# Drop rows with any missing value and renumber the index from 0.
# reset_index returns a new frame, so its result has to be assigned; the
# original call discarded it and left the index unchanged.
df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,Review,Bank Name,data_source,Bank ID,state,Sentiment_gen,POSITIVE_Count,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds
0,Mere words cannot express the deep felt gratit...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.870884,0.027722,0.098281,0.003113,0.920025,2024-04-18
1,i came across Mrs. Jennifer when she appeared ...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.672443,0.000645,0.326820,0.000092,0.835853,2024-03-22
2,"Excellent Service""! Accountant is kind and ver...",Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.990623,0.000058,0.009292,0.000026,0.995270,2024-01-07
3,Good brokerage and trading platform. I love th...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.992917,0.000059,0.006926,0.000098,0.996380,2023-10-29
4,REGINA NATHAN is a TOP NOTCH investing compan...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.947814,0.000749,0.051270,0.000167,0.973449,2023-09-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11390,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.004506,0.008736,0.986738,0.000020,0.497875,2024-04-24
11391,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.003693,0.008020,0.988272,0.000016,0.497829,2024-04-24
11392,"April 24, 2024 at 16:41 PM EDT Webster Financ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.005492,0.005953,0.988536,0.000018,0.499761,2024-04-24
11393,(RTTNews) - Webster Financial Corporation (WBS...,Webster Bank,news,24,EEUU,NEUTRAL,0,0.001876,0.012448,0.985664,0.000012,0.494708,2024-04-24


In [9]:
# Pillar name -> keyword list. The insertion order is the priority order:
# asignar_pilar returns the first pillar that has a matching keyword.
diccionario_pilares = {
    "Environmental": environmental_keywords,
    "Social Internal": social_internal_keywords,
    "Social External": social_external_keywords,
    "Governance": governance_keywords,
}

# Rule-based pillar for every review.
df["Pilar"] = df["Review"].apply(lambda texto: asignar_pilar(texto, diccionario_pilares))

In [10]:
# df['Review'] holds the texts to classify.
texts = df["Review"].to_list()

# Tokenize everything in one call: truncate to 512 tokens, pad to a common
# length and return PyTorch tensors.
encodings = tokenizer(
    texts,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

In [11]:
# Wrap the token ids and attention masks in a TensorDataset.
ids_tensor = encodings['input_ids']
mask_tensor = encodings['attention_mask']
dataset = TensorDataset(ids_tensor, mask_tensor)

# Serve batches in row order (shuffle=False is the DataLoader default, stated
# explicitly here). Tune batch_size to the available memory.
data_loader = DataLoader(dataset, batch_size=16, shuffle=False)

In [None]:
model.eval()  # switch the model to evaluation mode
pilares_predicciones = []

# No gradients are needed for inference; tqdm renders a progress bar per batch.
with torch.no_grad():
    for input_ids, attention_mask in tqdm(data_loader, desc="Evaluating", unit="batch"):
        salida = model(
            input_ids.to(model.device),
            attention_mask=attention_mask.to(model.device),
        )
        # The predicted class is the index of the largest logit.
        clases = salida.logits.argmax(dim=-1)
        pilares_predicciones.extend(clases.cpu().tolist())

# Map numeric class ids back to pillar names (adjust if the label set changes).
pilar_labels = {0: "Environmental", 1: "Social Internal", 2: "Social External", 3: "Governance", 4: "Other"}
df['Predicted_Pilar'] = [pilar_labels[clase] for clase in pilares_predicciones]


Evaluating:  25%|██▍       | 176/713 [17:36<53:17,  5.95s/batch]  

In [29]:
# Number of reviews per predicted pillar.
df["Predicted_Pilar"].value_counts()

Predicted_Pilar
Social External    8307
Social Internal    1835
Environmental      1081
Governance          172
Name: count, dtype: int64

In [None]:
# Number of reviews per bank.
df["Bank Name"].value_counts()

In [None]:
# Wherever the keyword rule assigned "Governance", overwrite the model's
# prediction with "Governance".
# .loc writes directly into df in a single step; the chained form
# df[col][mask] = ... may write to a temporary copy and leave df unchanged.
filtro = df['Pilar'] == 'Governance'
df.loc[filtro, 'Predicted_Pilar'] = 'Governance'

In [None]:
# Persist the reviews with both pillar columns; the index is not written.
ruta_csv_inferencia = 'data_procesa_inferencia_webster.csv'
df.to_csv(ruta_csv_inferencia, index=False)

In [13]:
# Checkpoint: reload the saved inference results. The import is repeated here
# so the cells below can run without the earlier cells.
import pandas as pd

df = pd.read_csv("data_procesa_inferencia_webster.csv")

In [14]:
# Display the reloaded frame (pandas truncates the rendering of long frames).
df

Unnamed: 0,Review,Bank Name,data_source,Bank ID,state,Sentiment_gen,POSITIVE_Count,Positive_gen,Negative_gen,Neutral_gen,Mixed_gen,Sentiment_Score,ds,Pilar,Predicted_Pilar
0,Mere words cannot express the deep felt gratit...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.870884,0.027722,0.098281,0.003113,0.920025,2024-04-18,Other,Social External
1,i came across Mrs. Jennifer when she appeared ...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.672443,0.000645,0.326820,0.000092,0.835853,2024-03-22,Other,Social External
2,"Excellent Service""! Accountant is kind and ver...",Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.990623,0.000058,0.009292,0.000026,0.995270,2024-01-07,Environmental,Environmental
3,Good brokerage and trading platform. I love th...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.992917,0.000059,0.006926,0.000098,0.996380,2023-10-29,Environmental,Environmental
4,REGINA NATHAN is a TOP NOTCH investing compan...,Chelsea Gronto Bank,facebook,2,EEUU,POSITIVE,0,0.947814,0.000749,0.051270,0.000167,0.973449,2023-09-20,Environmental,Environmental
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11390,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.004506,0.008736,0.986738,0.000020,0.497875,2024-04-24,Environmental,Environmental
11391,"Webster Financial Corporation (NYSE:WBS), the ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.003693,0.008020,0.988272,0.000016,0.497829,2024-04-24,Environmental,Environmental
11392,"April 24, 2024 at 16:41 PM EDT Webster Financ...",Webster Bank,news,24,EEUU,NEUTRAL,0,0.005492,0.005953,0.988536,0.000018,0.499761,2024-04-24,Environmental,Environmental
11393,(RTTNews) - Webster Financial Corporation (WBS...,Webster Bank,news,24,EEUU,NEUTRAL,0,0.001876,0.012448,0.985664,0.000012,0.494708,2024-04-24,Other,Environmental


In [15]:
# Print column names, non-null counts and dtypes of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11395 entries, 0 to 11394
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Review           11395 non-null  object 
 1   Bank Name        11395 non-null  object 
 2   data_source      11395 non-null  object 
 3   Bank ID          11395 non-null  int64  
 4   state            11395 non-null  object 
 5   Sentiment_gen    11395 non-null  object 
 6   POSITIVE_Count   11395 non-null  int64  
 7   Positive_gen     11395 non-null  float64
 8   Negative_gen     11395 non-null  float64
 9   Neutral_gen      11395 non-null  float64
 10  Mixed_gen        11395 non-null  float64
 11  Sentiment_Score  11395 non-null  float64
 12  ds               11395 non-null  object 
 13  Pilar            11395 non-null  object 
 14  Predicted_Pilar  11395 non-null  object 
dtypes: float64(5), int64(2), object(8)
memory usage: 1.3+ MB


In [16]:
# Convert the 'ds' column to datetime values, then print the frame summary
# to confirm the new dtype.
df["ds"] = pd.to_datetime(df["ds"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11395 entries, 0 to 11394
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Review           11395 non-null  object        
 1   Bank Name        11395 non-null  object        
 2   data_source      11395 non-null  object        
 3   Bank ID          11395 non-null  int64         
 4   state            11395 non-null  object        
 5   Sentiment_gen    11395 non-null  object        
 6   POSITIVE_Count   11395 non-null  int64         
 7   Positive_gen     11395 non-null  float64       
 8   Negative_gen     11395 non-null  float64       
 9   Neutral_gen      11395 non-null  float64       
 10  Mixed_gen        11395 non-null  float64       
 11  Sentiment_Score  11395 non-null  float64       
 12  ds               11395 non-null  datetime64[ns]
 13  Pilar            11395 non-null  object        
 14  Predicted_Pilar  11395 non-null  objec

In [17]:
# Derive calendar year and month columns from the parsed date.
for parte in ("year", "month"):
    df[parte] = getattr(df["ds"].dt, parte)

In [26]:
# Mean sentiment metrics per (predicted pillar, date, bank).
claves_grupo = ["Predicted_Pilar", "ds", "Bank Name"]
metricas = ["Sentiment_Score", "Positive_gen", "Negative_gen", "Neutral_gen"]
df_agru = (
    df.groupby(claves_grupo)[metricas]
    .mean()
    .reset_index()
)

In [27]:
# Multiply each averaged metric by 10.
# NOTE: the columns are overwritten in place, so running this cell again
# multiplies them by 10 once more.
for columna in ("Sentiment_Score", "Positive_gen", "Negative_gen", "Neutral_gen"):
    df_agru[columna] = df_agru[columna] * 10
df_agru

Unnamed: 0,Predicted_Pilar,ds,Bank Name,Sentiment_Score,Positive_gen,Negative_gen,Neutral_gen
0,Environmental,2013-01-31,Dime Bank,0.294088,0.119529,9.475985,0.349118
1,Environmental,2013-02-06,Citizens Bank,9.991358,9.984297,0.000754,0.014123
2,Environmental,2013-02-13,Dime Bank,0.648620,0.127206,8.783980,1.042828
3,Environmental,2013-11-04,Fifth Third Bank,9.992477,9.985669,0.000387,0.013616
4,Environmental,2014-01-15,Fifth Third Bank,9.997910,9.996380,0.000442,0.003060
...,...,...,...,...,...,...,...
7432,Social Internal,2024-04-16,First Horizon Bank,1.033090,0.340573,8.022445,1.385034
7433,Social Internal,2024-04-18,Fifth Third Bank,0.205589,0.188133,8.419058,0.034911
7434,Social Internal,2024-04-19,Fifth Third Bank,4.886815,4.813117,5.013260,0.147397
7435,Social Internal,2024-04-20,Dime Bank,6.381322,6.269472,2.988410,0.223701


In [30]:
# Count how many times each distinct row of df_agru occurs.
# NOTE(review): df_agru has one row per (pillar, date, bank) group, so every
# count is presumably 1 — confirm whether this cell is still needed.
df_agru.value_counts()

Predicted_Pilar  ds          Bank Name           Sentiment_Score  Positive_gen  Negative_gen  Neutral_gen
Social Internal  2024-04-22  Dime Bank           9.996792         9.994143      0.000407      0.005297       1
Environmental    2013-01-31  Dime Bank           0.294088         0.119529      9.475985      0.349118       1
                 2013-02-06  Citizens Bank       9.991358         9.984297      0.000754      0.014123       1
                 2013-02-13  Dime Bank           0.648620         0.127206      8.783980      1.042828       1
                 2013-11-04  Fifth Third Bank    9.992477         9.985669      0.000387      0.013616       1
                                                                                                            ..
                 2016-05-06  First Horizon Bank  9.690293         9.384655      0.003963      0.611277       1
                 2016-05-20  Citizens Bank       9.950758         9.904508      0.001632      0.092499       1
      

In [28]:
# Export the aggregated table to Excel without the index, formatting floats
# with four decimals.
ruta_excel_agrupado = "df_agrupado_webster.xlsx"
df_agru.to_excel(ruta_excel_agrupado, index=False, float_format='%.4f')

In [24]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
