<a href="https://colab.research.google.com/github/ValentinCord/HandsOnAI_2/blob/main/NLP_Transformer_evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <span> NLP : Transformer prédiction du modèle </span>
<hr style="border-bottom: solid;background-color:light;color:black;">

<a id="section-1"></a>
# <span>1. Installation des packages</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [45]:
!/opt/bin/nvidia-smi
!rm -rf sample_data

!pip3 install transformers
!pip3 install datasets
!pip install sentencepiece

Wed Dec 28 11:18:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    33W /  70W |   2414MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# <span>2. Imports </span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [46]:
# basics 
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn import metrics

# transformers 
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from transformers import CamembertModel, CamembertTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# plot 
import matplotlib.pyplot as plt 
import seaborn as sns 

# torch 
import torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F

# nltk 
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


<a id="section-3"></a>
# <span>3. Choix des paramètres</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [48]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 10
EPOCHS = 1
LEARNING_RATE = 1e-05

LEN_TEXT = 150
OVERLAP = 50

TRANSFORMER_NAME = "cmarkea/distilcamembert-base"

model_path = "/content/drive/MyDrive/HandOnAI_2_NLP/transformer_model.pth"
test_path = '/content/drive/MyDrive/HandOnAI_2_NLP/fake_test.csv'

<a id="section-4"></a>
# <span>4. Lecture des données</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [49]:
df_test = pd.read_csv(test_path)
df_test = df_test.drop(['Unnamed: 0', 'target_name'], axis = 1)

<a id="section-5"></a>
# <span>5. Preprocessing</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

## <span>5.1 Nettoyage des données</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [50]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
STOPWORDS = set(stopwords.words('french'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

In [51]:
df_test['data'] = df_test['data'].apply(clean_text)

## <span>5.2 Découpage des données</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [52]:
def get_split(text1):
    l_total = []
    l_parcial = []
    if len(text1.split())//(LEN_TEXT - OVERLAP) >0:
        n = len(text1.split())//(LEN_TEXT - OVERLAP)
    else: 
        n = 1
    for w in range(n):
        if w == 0:
            l_parcial = text1.split()[:LEN_TEXT]
            l_total.append(" ".join(l_parcial))
        else:
            l_parcial = text1.split()[w*(LEN_TEXT - OVERLAP):w*(LEN_TEXT - OVERLAP) + LEN_TEXT]
            l_total.append(" ".join(l_parcial))
    return l_total

In [53]:
df_test['text_split'] = df_test['data'].apply(get_split)
df_test['len_split'] = df_test['text_split'].apply(lambda x: len(x))

## <span>5.3 Reformulation du labels</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [54]:
def create_df(df): 
  train_l = []
  label_l = []
  for idx,row in df.iterrows():
      for l in row['text_split']:
          train_l.append(l)
          label_l.append([1 if row['label'] == i else 0 for i in range(2)])

  return pd.DataFrame({'data':train_l, 'label':label_l})

In [55]:
cleaned_df_test = create_df(df_test)

## <span>5.4 Création du dataset</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [56]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, is_target = True):
        self.tokenizer = tokenizer
        self.df = dataframe
        self.text = dataframe.data
        self.max_len = max_len
        if is_target: 
          self.targets = self.df.label
        else: 
          self.targets = None

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        if self.targets is None: 
          return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long),
              'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
          }
        else: 
          return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long),
              'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
              'targets': torch.tensor(self.targets[index], dtype=torch.float)
          }

In [57]:
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_NAME)
testing_set = CustomDataset(cleaned_df_test, tokenizer, MAX_LEN)

## <span>5.5 Création du dataloader</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [58]:
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

testing_loader = DataLoader(testing_set, **test_params)

<a id="section-6"></a>
# <span>6. Chargement du modèle</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [59]:
class BERTClass(torch.nn.Module):
    def __init__(self):
      super(BERTClass, self).__init__()
      self.l1 = CamembertModel.from_pretrained(TRANSFORMER_NAME)
      self.l3 = torch.nn.Linear(768, 2) #2 = binary classification
    
    def forward(self, ids, mask, token_type_ids):
      output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
      output = self.l3(output_1['pooler_output'])

      return F.softmax(output, dim=1)

In [60]:
def load_checkpoint(filepath):
    #checkpoint = torch.load(filepath, map_location=torch.device('cpu'))
    checkpoint = torch.load(filepath)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False
    
    model.eval()
    model.to(device)
    return model

In [61]:
model = load_checkpoint(model_path)

<a id="section-7"></a>
# <span>7. Prédiction du modèle avec les données splité</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [62]:
def validation():
    ce_loss = torch.nn.CrossEntropyLoss()
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    correct_predictions = 0
    total_instances = 0
    total_loss = 0

    with torch.no_grad():
      for count, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids)
        fin_targets.extend(targets.cpu().detach().numpy().tolist())

        # accuracy 
        classifications = torch.argmax(outputs, dim=1)
        labels = torch.argmax(targets, dim=1)
        correct_predictions += sum(classifications==labels).item()
        total_instances += len(outputs)

        # loss 
        total_loss += ce_loss(outputs, labels)

        fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    accuracy = correct_predictions/total_instances
    loss = total_loss/total_instances

    print(f"Accuracy Score = {accuracy}")
    print(f"Loss Score = {loss}")

    return fin_outputs, fin_targets

In [63]:
outputs, targets = validation()



Accuracy Score = 0.96
Loss Score = 0.03531963378190994


<a id="section-8"></a>
# <span>8. Prédiction du modèle avec les données d'origine</span>
<hr style="border-bottom: solid;background-color:light;color:black;">

In [64]:
cleaned_df_test['pred'] = outputs
pos = 0
df_test['pred'] = [list() for x in range(len(df_test.index))]
for idx,row in df_test.iterrows():
  for i in range(row['len_split']): 
    row['pred'].append(cleaned_df_test.loc[pos]['pred'])
    pos += 1

In [65]:
df_test['prediction'] = df_test['pred'].apply(lambda x: [1, 0] if np.argmax(np.sum(x, axis = 0)) == 0 else [0, 1])
df_test['label_pred'] = df_test['pred'].apply(lambda x: np.argmax(np.sum(x, axis = 0)))

In [66]:
accuracy = metrics.accuracy_score(df_test['label_pred'], df_test['label'])
print(f"Accuracy Score = {accuracy}")

Accuracy Score = 0.948559670781893
