# Sentiment analysis using `camemBERT`

`camemBERT` is a pre-trained version of `roBERTa` on french language data. The objective is to use pre-trained `camemBERT` to predict the polarity (positive or negative) of tweets. We only focus on model evaluation since we do not have labelled data. 

## Setup

In [1]:
# configuration to use the notebook on google collab

# from google.colab import drive
# drive.mount("/content/drive")

# DRIVE_PATH = "/content/drive/MyDrive/twitter-inflation-perception/"

# import os
# os.chdir(DRIVE_PATH+"notebooks/")

# !pip install transformers==4.25.1
# !pip install sentencepiece

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys 
sys.path.append("../")

In [4]:
from lib.sentiment.preprocessing import (
    load_tokenizer, 
    preprocess, 
    train_val_split
)
from lib.sentiment.model import load_model, backup_model 

from lib.sentiment.training import (
    train, 
    init_scheduler, 
    check_convergence
)
from lib.sentiment.validation import evaluate 

from lib.sentiment.utils import results_to_dict, get_avg_training_losses

In [5]:
import os
import json

import time
import datetime

import numpy as np
import pandas as pd
import pickle as pkl

import matplotlib.pyplot as plt
from sklearn import metrics

In [6]:
import torch
from torch.utils.data import (
    TensorDataset, 
    random_split, 
    DataLoader, 
    RandomSampler, 
    SequentialSampler
)

In [7]:
from transformers import AdamW

## Data

In [8]:
file_path = DRIVE_PATH + "backup/data/french_tweets.csv"
french_tweets = pd.read_csv(file_path)

In [9]:
french_tweets.head()

Unnamed: 0,label,text
0,0,"- Awww, c'est un bummer. Tu devrais avoir davi..."
1,0,Est contrarié qu'il ne puisse pas mettre à jou...
2,0,J'ai plongé plusieurs fois pour la balle. A ré...
3,0,Tout mon corps a des démangeaisons et comme si...
4,0,"Non, il ne se comporte pas du tout. je suis en..."


In [10]:
n_tweets, _ = french_tweets.shape
print(f"{n_tweets} tweets in the dataset")

1526724 tweets in the dataset


In [None]:
french_tweets["label"].value_counts() / n_tweets

In [11]:
# extract sample to reduce computation time 

prop = .1
size = int(n_tweets * prop) 
idxs = np.random.randint(low=0, high=n_tweets, size=size).tolist()

tweets_sample = french_tweets.iloc[idxs, :]

print(len(tweets_sample))

152672


In [12]:
tweets_sample["label"].value_counts() / len(tweets_sample)

0    0.507212
1    0.492788
Name: label, dtype: float64

In [13]:
tweets = tweets_sample["text"].values.tolist()
sentiments = tweets_sample["label"].values.tolist()

## Preprocessing

In [14]:
tokenizer = load_tokenizer()

In [19]:
type(tokenizer)

transformers.models.camembert.tokenization_camembert.CamembertTokenizer

In [20]:
tweets_train, tweets_validation, sentiments_train, sentiments_validation = train_val_split(tweets, sentiments, train_prop=.8)

In [21]:
input_ids, attention_mask, sentiments_train = preprocess(tweets_train, tokenizer, sentiments=sentiments_train)

train_dataset = TensorDataset(
    input_ids,
    attention_mask,
    sentiments_train)



In [23]:
input_ids, attention_mask, sentiments_validation = preprocess(tweets_validation, tokenizer, sentiments=sentiments_validation)

validation_dataset = TensorDataset(
    input_ids,
    attention_mask,
    sentiments_validation)

In [24]:
batch_size = 64

train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = batch_size)

validation_dataloader = DataLoader(
    validation_dataset,
    sampler = SequentialSampler(validation_dataset),
    batch_size = batch_size)

## Model

### Load `camemBERT`

In [25]:
model = load_model()

# initialize a variable holding the device used for training ('cpu' or 'cuda')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"{device=}")
model = model.to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias

device=device(type='cuda', index=0)


In [26]:
n_params = sum(p.numel() for p in model.parameters())
print("{:,} parameters in camemBERT".format(n_params) )

110,623,490 parameters in camemBERT


### Training & validation

In [None]:
# list to store training results 
statistics = []

num_epochs = 4

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
scheduler = init_scheduler(num_epochs, train_dataloader, optimizer)

model_path = "../backup/models/twitter-camembert.pt"

# to evaluate the convergence on the training
consecutive_epochs_with_no_improve = 0

In [45]:
for epoch in (2, 3):
    
    batch_losses, training_times = train(
        model, 
        train_dataloader, 
        device, 
        optimizer, 
        scheduler, 
        epoch, 
        num_epochs)

    if num_epochs > 3 and epoch > 1: 
        curr_loss =  np.mean(batch_losses)
        avg_train_losses = get_avg_training_losses(statistics)

        consecutive_epochs_with_no_improve = check_convergence(
            model, 
            model_path, 
            avg_train_losses, 
            curr_loss, 
            consecutive_epochs_with_no_improve)
        
        if consecutive_epochs_with_no_improve == 2:
          print("Stop training: The loss has not changed since 2 epochs!")
          break

    accuracy_scores = evaluate(model, validation_dataloader, device)
    statistics.append(results_to_dict(epoch, batch_losses, training_times, accuracy_scores))

Training Epoch [3/4]: 100%|██████████| 1909/1909 [36:22<00:00,  1.14s/it, loss_train=0.22, training_time=1091.37]


Model saved at ../backup/models/twitter-camembert.pt


Validation in progress: 100%|██████████| 478/478 [02:33<00:00,  3.11it/s, balanced_accuracy_score=1]
Training Epoch [4/4]: 100%|██████████| 1909/1909 [36:21<00:00,  1.14s/it, loss_train=0.38, training_time=1090.74]
Validation in progress: 100%|██████████| 478/478 [02:34<00:00,  3.10it/s, balanced_accuracy_score=1]


In [None]:
training_stats_path = "../backup/models/training-stats-camembert.json"

with open(training_stats_path, "w") as f:
    json.dump(statistics, f) 

In [56]:
for stat in statistics: 
    print(f"Epoch {stat['epoch']}")
    print(f"Average training loss = {round(np.mean(stat['train_batch_losses']), 3)}")
    print(f"Average validation accuracy = {round(np.mean(stat['val_batch_scores']), 3)}")
    print("-"*50)

Epoch 0
Average training loss = 0.444
Average validation accuracy = 0.818
--------------------------------------------------
Epoch 1
Average training loss = 0.371
Average validation accuracy = 0.823
--------------------------------------------------
Epoch 2
Average training loss = 0.346
Average validation accuracy = 0.823
--------------------------------------------------
Epoch 3
Average training loss = 0.347
Average validation accuracy = 0.823
--------------------------------------------------


### Train on whole data set

In [15]:
batch_size = 64

input_ids, attention_mask, sentiments = preprocess(tweets, tokenizer, sentiments=sentiments)

dataset = TensorDataset(
    input_ids,
    attention_mask,
    sentiments)

dataloader = DataLoader(
    dataset,
    sampler = RandomSampler(dataset),
    batch_size = batch_size)



In [16]:
model_path = "../backup/models/twitter-camembert.pt"
model = load_model(model_path)

Loading trained model...


In [17]:
# initialize a variable holding the device used for training ('cpu' or 'cuda')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"{device=}")
model = model.to(device)

device=device(type='cuda', index=0)


In [20]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
scheduler = init_scheduler(num_epochs=1, dataloader=dataloader, optimizer=optimizer)



In [None]:
batch_losses, training_times = train(
    model, 
    dataloader, 
    device, 
    optimizer, 
    scheduler, 
    epoch=0, 
    num_epochs=1)

Training Epoch [1/1]:   2%|▏         | 45/2386 [01:02<55:50,  1.43s/it, loss_train=0.3, training_time=1.68e+9]

In [48]:
backup_model(model, model_path)

Model saved at ../backup/models/twitter-camembert.pt


## Evaluation on unseen data