In [6]:
# Mount Google Drive at /content/drive (Colab only); the saved model loaded
# further down is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports des bibliothèques

In [1]:
import numpy as np
import pandas as pd
import torch

# Chargement des données

In [2]:
# Read the full reviews CSV into `data`.
data_url = '/content/data_fusion_sentiment.csv'
data = pd.read_csv(data_url)

# Draw 10 random rows. A fixed random_state keeps the sample -- and every
# table derived from it below -- identical across Restart & Run All.
data_test = data.sample(n=10, random_state=42)

In [3]:
# Shape of the sampled frame as (rows, columns).
data_test.shape

(10, 20)

In [4]:
# Column labels of the sampled frame.
data_test.columns

Index(['Titre', 'Description', 'Auteurs', 'Image', 'Lien Google', 'Editeur',
       'Date publication', 'infoLink', 'Genre', 'Nb scores', 'Id', 'Prix',
       'User_id', 'Nom lecteur', 'revue/utilité', 'revue/score', 'revue/heure',
       'revue/résumé', 'revue/texte', 'Sentiment'],
      dtype='object')

# Prédictions

## Prédictions de sentiments

## Prédictions de notes

### Prédictions DistilBERT

#### Fonctions et setup

In [5]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [7]:
# Score prediction helper for the DistilBERT classifier
def predict_review(review, model, tokenizer, device, max_length=128):
    """Predict the score class of a single review.

    Args:
        review: Text to score; handed to the tokenizer unchanged.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors when asked for ``return_tensors="pt"``.
        device: ``torch.device`` the input tensors are moved to before the
            forward pass.
        max_length: Token length the input is truncated and padded to.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        int: Index of the largest logit plus one (class 0 maps to score 1).
    """
    model.eval()  # evaluation mode for inference
    # Calling the tokenizer directly replaces the deprecated `encode_plus`;
    # `return_tensors="pt"` yields tensors with a leading batch axis of 1.
    encoded = tokenizer(
        review,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # Shift the 0-based class index back onto the original score scale.
    predicted_class = outputs.logits.argmax(dim=-1)[0].item()
    return int(predicted_class) + 1

In [8]:
# Load the saved classifier and its tokenizer from the same directory
model_path = "/content/drive/MyDrive/SAE S6/model_A100_sample"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Last expression of the cell: moves the model to `device` and displays it
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-11): 12 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False

In [9]:
# Predict a score for each of the 10 sampled reviews
data_test["predicted_score"] = [
    predict_review(review_text, model, tokenizer, device)
    for review_text in data_test["revue/texte"]
]

In [12]:
# Show each review text next to its actual and predicted score
data_test.loc[:, ["revue/texte", "revue/score", "predicted_score"]]

Unnamed: 0,revue/texte,revue/score,predicted_score
3216,"Two lifeships crash on an uninhabited planet, ...",3.0,3
176775,Not the strongest of Lucchesi's books - I expe...,3.0,3
44528,I found this at a used bookstore and it looked...,2.0,2
216013,I love this book! Can't say enough good things...,5.0,5
200297,"Laughed at my hat, worry about clothes, Wells,...",5.0,5
127395,As a latino who has enjoyed occasional plays o...,2.0,2
104625,"Very good read scientfic info, by krantz ,dmit...",5.0,4
183962,"Ever since I read him on Genesis, I have sough...",5.0,5
4208,O'Connor is at his best here -- one of those f...,5.0,5
118749,This is one of the most unique books I have ev...,5.0,5
