In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install openpyxl

In [1]:
import pandas as pd

file_path = '/content/FR.xlsx'
df = pd.read_excel(file_path)

In [2]:
# Calculate the total number of news content
total_news_content = len(df)

# Calculate unique news titles


# Impact Duration Distribution based on the exact values of 0, 1, and 2
impact_duration_distribution = df['impact_duration'].value_counts().sort_index()

# Impact Level Distribution
impact_level_distribution = df['impact_level'].value_counts()

# Output the calculated values
print(f"Total News Content: {total_news_content}")

print("Impact Duration Distribution:")
print(impact_duration_distribution)
print("Impact Level Distribution:")
print(impact_level_distribution)

Total News Content: 10104
Impact Duration Distribution:
0    3192
1    3348
2    3564
Name: impact_duration, dtype: int64
Impact Level Distribution:
low       4773
medium    4726
high       605
Name: impact_level, dtype: int64


In [3]:
!pip install transformers
!pip install sklearn
!pip install scikit-learn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.utils.data import Dataset
import torch

# Encode the 'impact_duration' labels to numeric values
label_encoder = LabelEncoder()
df['impact_duration_encoded'] = label_encoder.fit_transform(df['impact_duration'])
# Check for NaN values in 'news_content'
if df['news_content'].isnull().any():
    # Handle NaN values, e.g., by replacing them with a placeholder string
    df['news_content'].fillna('No content', inplace=True)

# Split the data into training and validation sets (90% training, 10% validation)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize_data(df):
    # Ensure all entries are strings and handle missing values if necessary
    texts = df['news_content'].astype(str).tolist()  # Convert to string to avoid issues
    return tokenizer(texts, truncation=True, padding=True, max_length=512)


train_encodings = tokenize_data(train_df)
val_encodings = tokenize_data(val_df)

train_labels = train_df['impact_duration_encoded'].tolist()
val_labels = val_df['impact_duration_encoded'].tolist()

# Create a custom dataset for PyTorch
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)

# Load the RoBERTa model
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_encoder.classes_))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

!pip install transformers -U
!pip install accelerate -U
!pip install scikit-learn



In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
)

# Compute metrics function for evaluation
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
evaluation_results = trainer.evaluate()

print(evaluation_results)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4085,0.524287,0.858556,0.859857,0.877763,0.858556
2,0.2843,0.288305,0.939664,0.939606,0.939592,0.939664
3,0.1361,0.226402,0.952522,0.95254,0.952584,0.952522


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.22640159726142883, 'eval_accuracy': 0.9525222551928784, 'eval_f1': 0.9525400828331279, 'eval_precision': 0.9525842160589271, 'eval_recall': 0.9525222551928784, 'eval_runtime': 6.8511, 'eval_samples_per_second': 147.568, 'eval_steps_per_second': 9.342, 'epoch': 3.0}


In [8]:

# Generate predictions for the training dataset
train_predictions = trainer.predict(train_dataset)

# Generate predictions for the validation dataset
val_predictions = trainer.predict(val_dataset)

# The predictions are in logits, so you need to apply softmax to convert to probabilities and then take the argmax to get the predicted labels
train_pred_labels = np.argmax(train_predictions.predictions, axis=1)
val_pred_labels = np.argmax(val_predictions.predictions, axis=1)

# The true labels are already provided as part of the datasets
train_true_labels = train_predictions.label_ids
val_true_labels = val_predictions.label_ids

# Calculate accuracy for the training dataset
train_accuracy = accuracy_score(train_true_labels, train_pred_labels)

# Calculate accuracy for the validation dataset
val_accuracy = accuracy_score(val_true_labels, val_pred_labels)

# To print classification reports, you'll need to import it
from sklearn.metrics import classification_report

# Convert the label encoder's classes to strings for use as target names
target_names = [str(label) for label in label_encoder.inverse_transform(range(len(label_encoder.classes_)))]

# Then use these target names in your classification report
print("Training Dataset Metrics:")
print(f"Accuracy: {train_accuracy}")
print(classification_report(train_true_labels, train_pred_labels, target_names=target_names))

print("Validation Dataset Metrics:")
print(f"Accuracy: {val_accuracy}")
print(classification_report(val_true_labels, val_pred_labels, target_names=target_names))



Training Dataset Metrics:
Accuracy: 0.9861431870669746
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2862
           1       0.99      0.98      0.98      3037
           2       0.99      0.99      0.99      3194

    accuracy                           0.99      9093
   macro avg       0.99      0.99      0.99      9093
weighted avg       0.99      0.99      0.99      9093

Validation Dataset Metrics:
Accuracy: 0.9525222551928784
              precision    recall  f1-score   support

           0       0.96      0.95      0.95       330
           1       0.94      0.95      0.94       311
           2       0.96      0.96      0.96       370

    accuracy                           0.95      1011
   macro avg       0.95      0.95      0.95      1011
weighted avg       0.95      0.95      0.95      1011



In [9]:
!pip install openpyxl



In [10]:
import pandas as pd

# Load the test dataset
test_dataset_path = '/content/FR.xlsx'
test_df = pd.read_excel(test_dataset_path)

# Display the first few rows of the test dataset to understand its structure
test_df.head()


Unnamed: 0,impact_level,news_content,impact_duration
0,high,Des protestataires ont occupé temporairement d...,0
1,high,Les protestataires ont désigné le ministère co...,0
2,high,Sous les directives de l'ancien général Sami R...,0
3,high,"Lors d'opérations apparemment coordonnées, des...",0
4,high,Le ministère des Affaires étrangères a été pri...,0


In [11]:
from transformers import BertTokenizer
from torch.utils.data import Dataset
import torch
import numpy as np


# Assuming you have already loaded your model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the test dataset
def tokenize_data(df):
    texts = df['news_content'].astype(str).tolist()  # Ensure all entries are strings
    return tokenizer(texts, truncation=True, padding=True, max_length=512)

test_encodings = tokenize_data(test_df)

# Dummy labels for creating the Dataset object, since labels are not needed for prediction
test_labels = [0] * len(test_df)

class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_encodings)

# Predict using the trained model
predictions = trainer.predict(test_dataset)

# Convert predictions to labels
predicted_labels = np.argmax(predictions.predictions, axis=1)
original_labels = label_encoder.inverse_transform(predicted_labels)

# Add predictions to the test DataFrame
test_df['predicted_impact_duration'] = original_labels
import os
print(os.getcwd())

# Save the updated DataFrame to a new JSON file in the current working directory
output_filename = 'updated_test_dataset_with_predictions.json'
test_df.to_json(output_filename, orient='records', lines=True)

print(f"Updated test dataset saved to {output_filename}")


/content
Updated test dataset saved to updated_test_dataset_with_predictions.json


In [15]:
import pandas as pd

# Replace 'your_predictions_file.json' with the path to your JSON prediction file
file_path = '/content/updated_test_dataset_with_predictions.json'
df = pd.read_json(file_path, lines=True)  # Use lines=True if your JSON is in newline-delimited format

# View the first few rows of the DataFrame
print(df.head())

  impact_level                                       news_content  \
0         high  Des protestataires ont occupé temporairement d...   
1         high  Les protestataires ont désigné le ministère co...   
2         high  Sous les directives de l'ancien général Sami R...   
3         high  Lors d'opérations apparemment coordonnées, des...   
4         high  Le ministère des Affaires étrangères a été pri...   

   impact_duration  predicted_impact_duration  
0                0                          0  
1                0                          0  
2                0                          0  
3                0                          0  
4                0                          0  


In [16]:
import pandas as pd

# Replace 'your_predictions_file.json' with the path to your JSON prediction file
file_path = '/content/updated_test_dataset_with_predictions.json'
df = pd.read_json(file_path, lines=True)  # Use lines=True if your JSON is in newline-delimited format

# View the first few rows of the DataFrame
print(df.head())

  impact_level                                       news_content  \
0         high  Des protestataires ont occupé temporairement d...   
1         high  Les protestataires ont désigné le ministère co...   
2         high  Sous les directives de l'ancien général Sami R...   
3         high  Lors d'opérations apparemment coordonnées, des...   
4         high  Le ministère des Affaires étrangères a été pri...   

   impact_duration  predicted_impact_duration  
0                0                          0  
1                0                          0  
2                0                          0  
3                0                          0  
4                0                          0  


In [18]:

# Assuming df is your DataFrame after loading the dataset
# Replace '/path/to/your/dataset.json' with the path to your dataset

file_path = '/content/FR.xlsx'
df = pd.read_excel(file_path)


# Encode the 'impact_duration' labels to numeric values
label_encoder = LabelEncoder()
df['impact_level_encoded'] = label_encoder.fit_transform(df['impact_level'])
# Check for NaN values in 'news_content'
if df['news_content'].isnull().any():
    # Handle NaN values, e.g., by replacing them with a placeholder string
    df['news_content'].fillna('No content', inplace=True)

# Split the data into training and validation sets (90% training, 10% validation)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize_data(df):
    # Ensure all entries are strings and handle missing values if necessary
    texts = df['news_content'].astype(str).tolist()  # Convert to string to avoid issues
    return tokenizer(texts, truncation=True, padding=True, max_length=512)


train_encodings = tokenize_data(train_df)
val_encodings = tokenize_data(val_df)

train_labels = train_df['impact_level_encoded'].tolist()
val_labels = val_df['impact_level_encoded'].tolist()

# Create a custom dataset for PyTorch
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)

model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(label_encoder.classes_))


import os
from transformers import TrainingArguments, Trainer
import numpy as np

# Specify the directory to save checkpoints
checkpoint_dir = '/content/drive/MyDrive/CHECKPOINTDIR_ROBERTABASE_IMPACT_LEVEL'
# Specify the directory to save predicted results
predicted_results_dir = '/content/drive/MyDrive/PREDICTED RESULTS_ROBERTA_BASE_IMPACT_LEVEL'

# Create the directories if they do not exist
os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(predicted_results_dir, exist_ok=True)

# Define training arguments, including the checkpoint directory
training_args = TrainingArguments(
    output_dir=checkpoint_dir,  # Checkpoints directory
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_steps=500,  # Save a checkpoint every 500 steps
    save_total_limit=3  # Keep only the last 3 checkpoints
)

# Initialize the Trainer with the training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4414,0.245676,0.923838,0.921833,0.925363,0.923838


Checkpoint destination directory /content/drive/MyDrive/CHECKPOINTDIR_ROBERTABASE_IMPACT_LEVEL/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1137, training_loss=0.5205163074986602, metrics={'train_runtime': 247.4284, 'train_samples_per_second': 36.75, 'train_steps_per_second': 4.595, 'total_flos': 1742966571602466.0, 'train_loss': 0.5205163074986602, 'epoch': 1.0})

In [19]:
!ls "/content/drive/My Drive/CHECKPOINTDIR_ROBERTABASE_IMPACT_LEVEL"

ls: cannot access '/content/drive/My Drive/CHECKPOINTDIR_ROBERTABASE_IMPACT_LEVEL': No such file or directory


In [20]:
# Load the best model from checkpoint for evaluation
best_model_path = '/content/drive/My Drive/CHECKPOINTDIR_ROBERTABASE_IMPACT_LEVEL/checkpoint-1000'
model = BertForSequenceClassification.from_pretrained(best_model_path, num_labels=len(label_encoder.classes_))


# Evaluate the model on training and validation datasets
train_results = trainer.evaluate(train_dataset)
val_results = trainer.evaluate(val_dataset)

# Print the evaluation results
print("Training dataset performance:", train_results)
print("Validation dataset performance:", val_results)

OSError: Incorrect path_or_model_id: '/content/drive/My Drive/CHECKPOINTDIR_ROBERTABASE_IMPACT_LEVEL/checkpoint-1000'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [21]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming the label encoder has been fitted with your class names
target_names = label_encoder.classes_

# Training Dataset Metrics
train_accuracy = accuracy_score(train_true_labels, train_pred_labels)
print("Training Dataset Metrics:")
print(f"Accuracy: {train_accuracy}")
print(classification_report(train_true_labels, train_pred_labels, target_names=target_names))

# Validation Dataset Metrics
val_accuracy = accuracy_score(val_true_labels, val_pred_labels)
print("Validation Dataset Metrics:")
print(f"Accuracy: {val_accuracy}")
print(classification_report(val_true_labels, val_pred_labels, target_names=target_names))


Training Dataset Metrics:
Accuracy: 0.9861431870669746
              precision    recall  f1-score   support

        high       0.99      0.99      0.99      2862
         low       0.99      0.98      0.98      3037
      medium       0.99      0.99      0.99      3194

    accuracy                           0.99      9093
   macro avg       0.99      0.99      0.99      9093
weighted avg       0.99      0.99      0.99      9093

Validation Dataset Metrics:
Accuracy: 0.9525222551928784
              precision    recall  f1-score   support

        high       0.96      0.95      0.95       330
         low       0.94      0.95      0.94       311
      medium       0.96      0.96      0.96       370

    accuracy                           0.95      1011
   macro avg       0.95      0.95      0.95      1011
weighted avg       0.95      0.95      0.95      1011



In [22]:
import pandas as pd

# Load the test dataset
test_dataset_path = '/content/ML-ESG3_Testset_French.json'
test_df = pd.read_json(test_dataset_path)

# Display the first few rows of the test dataset to understand its structure
test_df.head()

from transformers import BertTokenizer
from torch.utils.data import Dataset
import torch
import numpy as np


# Assuming you have already loaded your model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the test dataset
def tokenize_data(df):
    texts = df['news_content'].astype(str).tolist()  # Ensure all entries are strings
    return tokenizer(texts, truncation=True, padding=True, max_length=512)

test_encodings = tokenize_data(test_df)

# Dummy labels for creating the Dataset object, since labels are not needed for prediction
test_labels = [0] * len(test_df)

class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_encodings)

# Predict using the trained model
predictions = trainer.predict(test_dataset)

# Convert predictions to labels
predicted_labels = np.argmax(predictions.predictions, axis=1)
original_labels = label_encoder.inverse_transform(predicted_labels)

# Add predictions to the test DataFrame
test_df['predicted_impact_level'] = original_labels
import os
print(os.getcwd())

# Save the updated DataFrame to a new JSON file in the current working directory
output_filename = 'updated_test_dataset_with_predictions_impact_level.json'
test_df.to_json(output_filename, orient='records', lines=True)

print(f"Updated test dataset saved to {output_filename}")


/content
Updated test dataset saved to updated_test_dataset_with_predictions_impact_level.json


In [23]:
import pandas as pd

# Replace 'your_predictions_file.json' with the path to your JSON prediction file
file_path = 'updated_test_dataset_with_predictions_impact_level.json'
df = pd.read_json(file_path, lines=True)  # Use lines=True if your JSON is in newline-delimited format

# View the first few rows of the DataFrame
print(df.head())

                                                 URL  \
0  https://www.novethic.fr/actualite/energie/tran...   
1  https://www.novethic.fr/actualite/energie/ener...   
2  https://www.novethic.fr/actualite/energie/ener...   
3  https://www.novethic.fr/actualite/energie/mobi...   
4  https://www.novethic.fr/actualite/energie/mobi...   

                                          news_title  \
0  La France porte un projet de pipeline qui tran...   
1  Accélération des énergies renouvelables : les ...   
2  Accélération des énergies renouvelables : les ...   
3  L’industrie automobile en route vers l’électri...   
4  Le secteur ferroviaire s'organise pour rendre ...   

                                        news_content predicted_impact_level  
0  L’industrie constitue le fer de lance de la ma...                 medium  
1  L’examen du projet de loi sur l’accélération d...                   high  
2  3/ Le dernier mot aux maires C’était un autre ...                 medium  
3  L’industrie