# Sentiment analysis using `camemBERT`

`camemBERT` is a version of `roBERTa` pre-trained on French-language data. The objective is to use pre-trained `camemBERT` to predict the polarity (positive or negative) of tweets. We only focus on model evaluation since we do not have labelled data. 

## Setup

In [4]:
# # google colab setup 

# from google.colab import drive
# drive.mount("/content/drive")

# DRIVE_PATH = "/content/drive/MyDrive/twitter-inflation-perception/"

# import os
# os.chdir(DRIVE_PATH+"notebooks/")

import sys 
sys.path.append("../")

In [2]:
# # libraries to install in google colab

# !pip install transformers==4.25.1
# !pip install sentencepiece
# !pip install rich

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from lib.sentiment.preprocessing import (
    load_tokenizer, 
    preprocess, 
    train_val_split
)
from lib.sentiment.model import load_model, backup_model 

from lib.sentiment.training import (
    train, 
    init_scheduler, 
    check_convergence
)
from lib.sentiment.validation import (
    evaluate, 
    predict
) 

from lib.sentiment.utils import (
    results_to_dict, 
    get_avg_training_losses, 
    tensor_to_numpy 
)

In [6]:
import json
import pickle as pkl

import numpy as np
import pandas as pd
from itertools import chain

from tqdm import tqdm

from rich.table import Table 
from rich.console import Console

In [7]:
import torch
from torch.utils.data import (
    TensorDataset, 
    random_split, 
    DataLoader, 
    RandomSampler, 
    SequentialSampler
)

In [6]:
from transformers import AdamW

## Data

In [10]:
# Read the full labelled corpus of French tweets.
# The path is relative to the notebooks/ directory, like every other path in
# this notebook. DRIVE_PATH is only defined in the commented-out Colab setup
# cell, so it must not be referenced here (it would raise NameError).
file_path = "../backup/data/french_tweets.csv"
french_tweets = pd.read_csv(file_path)

In [None]:
# Preview the first rows of the corpus.
french_tweets.head()

Unnamed: 0,label,text
0,0,"- Awww, c'est un bummer. Tu devrais avoir davi..."
1,0,Est contrarié qu'il ne puisse pas mettre à jou...
2,0,J'ai plongé plusieurs fois pour la balle. A ré...
3,0,Tout mon corps a des démangeaisons et comme si...
4,0,"Non, il ne se comporte pas du tout. je suis en..."


In [None]:
# Row count of the full corpus (first component of the frame's shape).
n_tweets = french_tweets.shape[0]
print(f"{n_tweets} tweets in the dataset")

1526724 tweets in the dataset


In [None]:
# Share of each label in the full corpus (class-balance check).
label_counts = french_tweets["label"].value_counts()
label_counts / n_tweets

0    0.505398
1    0.494602
Name: label, dtype: float64

In [None]:
# # extract 10% samples to reduce computation time
# NOTE(review): np.random.randint draws the indices with replacement, so the
# sample can contain duplicated rows, and no seed is set, so the draw is not
# reproducible.

# prop = .1
# size = int(n_tweets * prop) 
# idxs = np.random.randint(low=0, high=n_tweets, size=size).tolist()

# tweets_sample = french_tweets.iloc[idxs, :]

# tweets_sample.to_csv("../backup/data/french_tweets_sample.csv", index=False)

In [11]:
# Load the previously saved subsample of the corpus.
sample_path = "../backup/data/french_tweets_sample.csv"
tweets_sample = pd.read_csv(sample_path)

In [None]:
# Share of each label in the subsample (should stay close to the full corpus).
sample_label_counts = tweets_sample["label"].value_counts()
sample_label_counts / len(tweets_sample)

0    0.506006
1    0.493994
Name: label, dtype: float64

In [None]:
# Plain Python lists of the tweet texts and of their labels, in row order.
tweets = tweets_sample["text"].tolist()
sentiments = tweets_sample["label"].tolist()

## Preprocessing

In [None]:
# Load the tokenizer through the project helper
# lib.sentiment.preprocessing.load_tokenizer.
tokenizer = load_tokenizer()

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

In [None]:
# Inspect the concrete tokenizer class returned by load_tokenizer().
type(tokenizer)

transformers.models.camembert.tokenization_camembert.CamembertTokenizer

In [None]:
# # train / validation split and save as pickle file

# tweets_train, tweets_validation, sentiments_train, sentiments_validation = train_val_split(tweets, sentiments, train_prop=.8)

In [None]:
# # create train and validation datasets from tokenized tweets

# input_ids, attention_mask, sentiments_train = preprocess(tweets_train, tokenizer, sentiments=sentiments_train)

# train_dataset = TensorDataset(
#     input_ids,
#     attention_mask,
#     sentiments_train)

# torch.save(train_dataset, "../backup/models/train_dataset.pt")


# input_ids, attention_mask, sentiments_validation = preprocess(tweets_validation, tokenizer, sentiments=sentiments_validation)

# validation_dataset = TensorDataset(
#     input_ids,
#     attention_mask,
#     sentiments_validation)

# torch.save(validation_dataset, "../backup/models/validation_dataset.pt")



In [12]:
# Reload the datasets serialized with torch.save instead of re-tokenizing.
train_dataset_path = "../backup/models/train_dataset.pt"
validation_dataset_path = "../backup/models/validation_dataset.pt"

train_dataset = torch.load(train_dataset_path)
validation_dataset = torch.load(validation_dataset_path)

In [13]:
batch_size = 64

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Validation batches are read in dataset order.
validation_sampler = SequentialSampler(validation_dataset)
validation_dataloader = DataLoader(
    validation_dataset,
    sampler=validation_sampler,
    batch_size=batch_size,
)

## Model

### Load `camemBERT`

In [None]:
# Instantiate the classifier through the project helper.
model = load_model()

# Device used for training: the first CUDA GPU when available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"{device=}")

# Move the model to the selected device.
model = model.to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 

device=device(type='cuda', index=0)


In [None]:
# Total number of scalar values across all parameter tensors of the model.
param_sizes = [param.numel() for param in model.parameters()]
n_params = sum(param_sizes)
print("{:,} parameters in camemBERT".format(n_params) )

110,623,490 parameters in camemBERT


### Training & validation

In [None]:
# train for only 3 epochs to keep the total training time manageable

num_epochs = 3

In [None]:
# Per-epoch training / validation statistics, filled in by the training loop.
statistics = []

# Use the PyTorch implementation of AdamW: transformers.AdamW is deprecated in
# favour of torch.optim.AdamW. weight_decay is set to 0.0 explicitly because
# that was the transformers default, whereas the PyTorch default is 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 2e-5,
    eps = 1e-8,
    weight_decay = 0.0)
scheduler = init_scheduler(num_epochs, train_dataloader, optimizer)

# File the model is backed up to after each epoch.
model_path = "../backup/models/twitter-camembert.pt"

# Number of consecutive epochs without improvement of the training loss; only
# used by the (currently commented-out) convergence check of the training loop.
consecutive_epochs_with_no_improve = 0



In [None]:
# Fine-tuning loop: for each epoch, train on the training set, back the model
# up to disk, then evaluate on the validation set and record the statistics.
for epoch in range(num_epochs):
    
    # One pass over the training data; returns the per-batch losses and the
    # training times reported by the project helper.
    batch_losses, training_times = train(
        model, 
        train_dataloader, 
        device, 
        optimizer, 
        scheduler, 
        epoch, 
        num_epochs)
    
    # The same file is overwritten after every epoch, so it always holds the
    # most recent model, not necessarily the best one.
    backup_model(model, model_path)

    # Early stopping is disabled (it only makes sense with more than 3 epochs).
    # if num_epochs > 3 and epoch > 1: 
    #     curr_loss =  np.mean(batch_losses)
    #     avg_train_losses = get_avg_training_losses(statistics)

    #     consecutive_epochs_with_no_improve = check_convergence(
    #         model, 
    #         model_path, 
    #         avg_train_losses, 
    #         curr_loss, 
    #         consecutive_epochs_with_no_improve)
        
    #     if consecutive_epochs_with_no_improve == 2:
    #       print("Stop training: The loss has not changed since 2 epochs!")
    #       break

    # Validation scores for this epoch, stored together with the training
    # losses and times as one entry of `statistics`.
    accuracy_scores = evaluate(model, validation_dataloader, device)
    statistics.append(results_to_dict(epoch, batch_losses, training_times, accuracy_scores))

Training Epoch [1/3]: 100%|██████████| 1909/1909 [35:51<00:00,  1.13s/it, loss_train=0.63, training_time=1077.41]


Model saved at ../backup/models/twitter-camembert.pt


Validation in progress: 100%|██████████| 478/478 [02:46<00:00,  2.87it/s, balanced_accuracy_score=0.81]
Training Epoch [2/3]: 100%|██████████| 1909/1909 [35:46<00:00,  1.12s/it, loss_train=0.25, training_time=1073.51]


Model saved at ../backup/models/twitter-camembert.pt


Validation in progress: 100%|██████████| 478/478 [02:45<00:00,  2.89it/s, balanced_accuracy_score=0.82]
Training Epoch [3/3]: 100%|██████████| 1909/1909 [35:47<00:00,  1.12s/it, loss_train=0.56, training_time=1074.02]


Model saved at ../backup/models/twitter-camembert.pt


Validation in progress: 100%|██████████| 478/478 [02:45<00:00,  2.89it/s, balanced_accuracy_score=0.83]


In [None]:
training_stats_path = "../backup/models/training-stats-camembert.json"

# with open(training_stats_path, "w") as f:
#     json.dump(statistics, f) 

# Reload the statistics saved by a previous training run.
with open(training_stats_path) as stats_file:
    statistics = json.load(stats_file)

In [29]:
# Summary table: one row per epoch with the mean of the per-batch training
# losses and the mean of the per-batch validation scores.
table = Table(title="Training / validation average statistics")

for column_name in ("Epoch", "Training loss", "Validation balanced accuracy"):
    table.add_column(column_name)

for stat in statistics:
    avg_train_loss = np.mean(stat["train_batch_losses"])
    avg_val_score = np.mean(stat["val_batch_scores"])

    # Epochs are stored 0-based; display them 1-based.
    row_cells = [
        stat["epoch"]+1,
        round(avg_train_loss, 3),
        round(avg_val_score, 3),
    ]
    table.add_row(*map(str, row_cells))

console = Console()
console.print(table)

## Evaluation on unseen data

In this section, the model is used to predict the sentiment of the tweets related to inflation.

### Load model and tweets to label

In [7]:
# Reload the model from the backup file written during training; load_model is
# given the checkpoint path here (it prints "Loading trained model...").
model_path = "../backup/models/twitter-camembert.pt"
model = load_model(model_path)

Loading trained model...


In [8]:
# Run inference on the first CUDA GPU when available, on the CPU otherwise.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"{device=}")

model = model.to(device)

device=device(type='cuda', index=0)


In [9]:
# Load the preprocessed inflation tweets and keep only their "cleaned" entry.
path = "../backup/data/tweets_preprocessed.pkl"
with open(path, "rb") as f:
    tweets_preprocessed = pkl.load(f)

new_tweets = tweets_preprocessed["cleaned"]

In [10]:
# Number of tweets to label (rebinds the n_tweets computed on the training corpus).
n_tweets = len(new_tweets)
print(f"{n_tweets} new tweets to label")

92961 new tweets to label


In [11]:
# Display one tweet picked uniformly at random among the n_tweets new tweets.
ix = np.random.randint(n_tweets)
print(new_tweets[ix])

#COVID-19  Pourquoi l’inflation a-t-elle baissé alors que les ménages s’attendaient à des hausses de prix? Banque de France  [Blog] During the lockdown, #inflation in France fell sharply while households expected sharp price increases. The sudden change in #household structure of #consumption and the strong dispersion of price changes can explain this divergence   #BdfEco


### Preprocessing

In [12]:
# Tokenize the unlabelled tweets. No `sentiments` are passed, and only the token
# ids and the attention mask are unpacked from the result.
tokenizer = load_tokenizer()

input_ids, attention_mask = preprocess(new_tweets, tokenizer)

In [15]:
batch_size = 32
dataset = TensorDataset(
    input_ids,
    attention_mask)

# Batches must be read in the original tweet order: predictions are stored
# positionally and later matched to the tweets by index, so shuffling here
# (RandomSampler) would attach each label and score to the wrong tweet.
dataloader = DataLoader(
    dataset,
    sampler = SequentialSampler(dataset),
    batch_size = batch_size)

### Run model

In [21]:
# Predict the sentiment of every batch of new tweets and collect, per batch,
# the predicted labels and the softmax scores.
results = []
backup_path = "../backup/models/predictions.json"

# Number of batches between two intermediate backups of `results`. Rewriting
# the whole file after every single batch makes the total I/O grow
# quadratically with the number of batches.
checkpoint_every = 100

loop = tqdm(dataloader) 
loop.set_description("Predicting unseen tweets in progress")

for batch_ix, batch in enumerate(loop, start=1):
    # Batch-local names: the full `input_ids` / `attention_mask` tensors built
    # during preprocessing must not be overwritten, otherwise re-running the
    # dataset cell would pair all the token ids with a single batch's mask.
    batch_input_ids = batch[0].to(device)
    batch_attention_mask = batch[1].to(device)

    # NOTE(review): eval_mode=False presumably leaves the model in training
    # mode (dropout active) during inference -- confirm against
    # lib.sentiment.validation.predict.
    batch_labels, batch_scores = predict(batch_input_ids, batch_attention_mask, model, eval_mode=False)
    batch_labels = tensor_to_numpy(batch_labels)
    batch_scores = tensor_to_numpy(batch_scores)

    results.append({
        "predicted_labels": batch_labels.tolist(), 
        "softmax_scores": batch_scores.tolist()
    })

    # Periodic backup so that an interrupted session does not lose everything.
    if batch_ix % checkpoint_every == 0:
        with open(backup_path, "w") as f: 
            json.dump(results, f)

# Final backup containing the predictions of every batch.
with open(backup_path, "w") as f: 
    json.dump(results, f)

Predicting unseen tweets in progress: 100%|██████████| 2906/2906 [31:17<00:00,  1.55it/s]


### Save objects

In [2]:
# Reload the per-batch predictions written by the prediction loop.
backup_path = "../backup/models/predictions.json"
with open(backup_path) as predictions_file:
    results = json.load(predictions_file)

In [9]:
# flatten predictions and store in numpy array

predicted_labels = list(chain(
    *[item["predicted_labels"] for item in results]
))

predicted_labels = np.array(predicted_labels)

In [17]:
# flatten softmax scores and store in numpy array

scores = list(chain(
    *[np.max(item["softmax_scores"], axis=1) for item in results]
))

scores = np.array(scores)

In [20]:
# # save predicted sentiments and scores as numpy arrays

# with open("../backup/models/sentiments.npy", "wb") as f:
#     np.save(f, predicted_labels)

# with open("../backup/models/sentiment_scores.npy", "wb") as f:
#     np.save(f, scores)  

### Quick check 

In [8]:
# Reload the saved labels and scores; np.load opens and closes the .npy files
# itself when given a path.
predicted_labels = np.load("../backup/models/sentiments.npy")
scores = np.load("../backup/models/sentiment_scores.npy")

In [9]:
# Reload the preprocessed tweets and keep their "cleaned" entry.
path = "../backup/data/tweets_preprocessed.pkl"

with open(path, "rb") as f:
    tweets_preprocessed = pkl.load(f)

tweets = tweets_preprocessed["cleaned"]

In [11]:
# Show one random tweet next to the label and score stored at the same position.
n_tweets = len(tweets)
ix = np.random.randint(n_tweets)

print(
    tweets[ix],
    f"Sentiment={predicted_labels[ix]}",
    f"Score={scores[ix]}",
    sep="\n",
)

La livre turque a progressé de +4,6 % à 0,1179 euro cette semaine. Et ce en lien avec des spéculations concernant un prochain tour de vis monétaire de la part de la Banque Centrale Turque (TCMB).  Ces spéculations sont en lien direct avec l'accélération de l'inflation,
Sentiment=1
Score=0.9700846076011658
