<a href="https://colab.research.google.com/github/ValentinCord/HandsOnAI_2/blob/main/NLP_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [143]:
!/opt/bin/nvidia-smi
!rm -rf sample_data

Fri Dec 16 18:00:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0    34W /  70W |   7700MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [144]:
!pip3 install transformers
!pip3 install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [145]:
# basics 
import os
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# transformers 
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer
from transformers import CamembertModel, CamembertTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# plot 
import matplotlib.pyplot as plt 
import seaborn as sns 

# torch 
import torch
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# nltk 
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [146]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Lecture et analyse pandas

In [147]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 10
EPOCHS = 10
LEARNING_RATE = 1e-05

LEN_TEXT = 300
OVERLAP = 50

In [148]:
train_path = '/content/drive/MyDrive/HandOnAI_2_NLP/fake_train.csv'
added_path = '/content/drive/MyDrive/HandOnAI_2_NLP/added_train.csv'
test_path = '/content/drive/MyDrive/HandOnAI_2_NLP/fake_test.csv'

df = pd.read_csv(train_path)
df_added = pd.read_csv(added_path)
df_test = pd.read_csv(test_path)

# suppression des colonnes inutiles 
df = df.drop(['Unnamed: 0', 'target_name'], axis = 1)
df_added.rename(columns = {'french':'data'}, inplace = True)
df_added = df_added.drop(['Unnamed: 0'], axis = 1)
df_test = df_test.drop(['Unnamed: 0', 'target_name'], axis = 1)

In [149]:
len(df)

1458

In [150]:
len(df_added)

11150

In [151]:
#df = df.append(df_added[:2000], ignore_index=True)

In [152]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('french'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    #text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

In [153]:
df['data'] = df['data'].apply(clean_text)
df_test['data'] = df_test['data'].apply(clean_text)

In [154]:
df.head()

Unnamed: 0,data,label
0,22e jour consécutif grève reconductible contre...,0
1,depuis plusieurs mois initiatives chercheurs m...,0
2,google vient d'introduire mise jour applicatio...,0
3,portrait. chacun s’empresse autour d’elle tand...,0
4,« n’y a risque pénurie » carburant a déclaré j...,0


In [155]:
def get_split(text1):
    l_total = []
    l_parcial = []
    if len(text1.split())//(LEN_TEXT - OVERLAP) >0:
        n = len(text1.split())//(LEN_TEXT - OVERLAP)
    else: 
        n = 1
    for w in range(n):
        if w == 0:
            l_parcial = text1.split()[:LEN_TEXT]
            l_total.append(" ".join(l_parcial))
        else:
            l_parcial = text1.split()[w*(LEN_TEXT - OVERLAP):w*(LEN_TEXT - OVERLAP) + LEN_TEXT]
            l_total.append(" ".join(l_parcial))
    return l_total

In [156]:
df['text_split'] = df['data'].apply(get_split)
df['len_split'] = df['text_split'].apply(lambda x: len(x))

df_test['text_split'] = df_test['data'].apply(get_split)
df_test['len_split'] = df_test['text_split'].apply(lambda x: len(x))

In [157]:
df['len_split'].value_counts()

1     1248
2      126
3       45
4       23
6        5
7        4
5        3
8        2
12       1
14       1
Name: len_split, dtype: int64

In [158]:
df_test['len_split'].value_counts()

1     418
2      43
3      11
5       5
4       5
6       2
7       1
11      1
Name: len_split, dtype: int64

In [159]:
# verification du fonctionnement

for index, row in df.iterrows():
  if len(row['text_split']) > 1: 
    print(index)
    break

print(df['text_split'][31][0].split()[(LEN_TEXT - OVERLAP):])
print(df['text_split'][31][1].split()[:OVERLAP])

12
['étudiants', "l'université", 'carleton', 'lors', "l'expédition", 'students', 'ice', 'antarctic', '2011.', "l'équipe", 'visite', 'îles', 'déception', 'seymour.', '©', 'musée', 'canadien', 'nature', 'mémoire', 'isotopes', 'calcium', 'menés', 'benjamin', 'linzmeier', 'andrew', 'd.', 'jacobson', "l'université", 'northwestern', 'chicago', 'chercheurs', 'basé', 'leurs', 'travaux', 'collecte', 'fossiles', 'retrouvés', 'formation', 'géologique', 'célèbre', 'trouve', "l'île", 'seymour', 'formation', 'lopez', 'bertodano', 'dont', 'strates', 'témoignent', 'période']
['étudiants', "l'université", 'carleton', 'lors', "l'expédition", 'students', 'ice', 'antarctic', '2011.', "l'équipe", 'visite', 'îles', 'déception', 'seymour.', '©', 'musée', 'canadien', 'nature', 'mémoire', 'isotopes', 'calcium', 'menés', 'benjamin', 'linzmeier', 'andrew', 'd.', 'jacobson', "l'université", 'northwestern', 'chicago', 'chercheurs', 'basé', 'leurs', 'travaux', 'collecte', 'fossiles', 'retrouvés', 'formation', 'géol

In [160]:
def create_df(df): 
  train_l = []
  label_l = []
  for idx,row in df.iterrows():
      for l in row['text_split']:
          train_l.append(l)
          label_l.append([1 if row['label'] == i else 0 for i in range(2)])

  return pd.DataFrame({'data':train_l, 'label':label_l})

In [161]:
cleaned_df = create_df(df)
cleaned_df_test = create_df(df_test)

In [162]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, is_target = True):
        self.tokenizer = tokenizer
        self.df = dataframe
        self.text = dataframe.data
        self.max_len = max_len
        if is_target: 
          self.targets = self.df.label
        else: 
          self.targets = None

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        if self.targets is None: 
          return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long),
              'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
          }
        else: 
          return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(mask, dtype=torch.long),
              'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
              'targets': torch.tensor(self.targets[index], dtype=torch.float)
          }

In [163]:
tranfo_name = "cmarkea/distilcamembert-base"
tokenizer = AutoTokenizer.from_pretrained(tranfo_name)

In [164]:
train_size = 0.8
train_dataset=cleaned_df.sample(frac=train_size,random_state=200)
test_dataset=cleaned_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(cleaned_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(cleaned_df, tokenizer, MAX_LEN)
testing_set = CustomDataset(cleaned_df_test, tokenizer, MAX_LEN)

FULL Dataset: (1842, 2)
TRAIN Dataset: (1474, 2)
TEST Dataset: (368, 2)


In [165]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [166]:
class BERTClass(torch.nn.Module):
    def __init__(self):
      super(BERTClass, self).__init__()
      self.l1 = CamembertModel.from_pretrained(tranfo_name)
      self.l3 = torch.nn.Linear(768, 2) #2 = binary classification
    
    def forward(self, ids, mask, token_type_ids):
      output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
      output = self.l3(output_1['pooler_output'])

      return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at cmarkea/distilcamembert-base were not used when initializing CamembertModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertModel were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream

BERTClass(
  (l1): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0): CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [167]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [168]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['targets'].to(device)
        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%10==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [169]:
for epoch in range(EPOCHS):
    train(epoch)



Epoch: 0, Loss:  0.6294869184494019
Epoch: 0, Loss:  0.9438809752464294
Epoch: 0, Loss:  0.6648010611534119
Epoch: 0, Loss:  0.4607350528240204
Epoch: 0, Loss:  0.2930905818939209
Epoch: 0, Loss:  0.38905957341194153
Epoch: 0, Loss:  0.10348131507635117
Epoch: 0, Loss:  0.3762352466583252
Epoch: 0, Loss:  0.45398011803627014
Epoch: 0, Loss:  0.08011548966169357
Epoch: 0, Loss:  0.5656986236572266
Epoch: 0, Loss:  0.28913888335227966
Epoch: 0, Loss:  0.10057342052459717
Epoch: 0, Loss:  0.11074121296405792
Epoch: 0, Loss:  0.0827561765909195
Epoch: 0, Loss:  0.0883408784866333
Epoch: 0, Loss:  0.08385002613067627
Epoch: 0, Loss:  0.04390465095639229
Epoch: 0, Loss:  0.06428507715463638
Epoch: 1, Loss:  0.03863358497619629
Epoch: 1, Loss:  0.13830403983592987
Epoch: 1, Loss:  0.04863704368472099
Epoch: 1, Loss:  0.033583205193281174
Epoch: 1, Loss:  0.050243403762578964
Epoch: 1, Loss:  0.03810238093137741
Epoch: 1, Loss:  0.02445991151034832
Epoch: 1, Loss:  0.3660668432712555
Epoch: 1,

In [170]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
      for _, data in enumerate(testing_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        outputs = model(ids, mask, token_type_ids)
        fin_targets.extend(targets.cpu().detach().numpy().tolist())

        m = torch.nn.Softmax(dim=1)
        fin_outputs.extend(torch.round(m(outputs)).cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets

In [171]:
from sklearn import metrics
outputs, targets = validation()
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.9722222222222222
F1 Score (Micro) = 0.9722222222222222
F1 Score (Macro) = 0.9690112737724694


In [172]:
cleaned_df_test['pred'] = outputs
cleaned_df_test.head()

Unnamed: 0,data,label,pred
0,président groupe lrem a pris toutes pincettes....,"[1, 0]","[1.0, 0.0]"
1,villes françaises qualité l'air meilleure moin...,"[1, 0]","[1.0, 0.0]"
2,cop25 vient s'achever laisse goût amer certain...,"[1, 0]","[1.0, 0.0]"
3,action network . cop25 occasion « ratée » répo...,"[1, 0]","[1.0, 0.0]"
4,2020. évidemment états-unis quitteront l'accor...,"[1, 0]","[1.0, 0.0]"


In [173]:
cleaned_df_test.loc[0]['pred']

[1.0, 0.0]

In [174]:
pos = 0

df_test['pred'] = [list() for x in range(len(df_test.index))]

for idx,row in df_test.iterrows():
  for i in range(row['len_split']): 
    row['pred'].append(cleaned_df_test.loc[pos]['pred'])
    pos += 1

In [175]:
for idx, row in df_test.iterrows(): 
  if row['len_split'] > 1: 
    print(row['pred'], row['label'])

[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] 0
[[1.0, 

In [176]:
checkpoint = {'model': BERTClass(),
              'state_dict': model.state_dict(),
              'optimizer' : optimizer.state_dict()}

torch.save(checkpoint, 'checkpoint.pth')

Some weights of the model checkpoint at cmarkea/distilcamembert-base were not used when initializing CamembertModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertModel were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream

In [177]:
def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    model = checkpoint['model']
    model.load_state_dict(checkpoint['state_dict'])
    for parameter in model.parameters():
        parameter.requires_grad = False
    
    model.eval()
    return model

In [178]:
the_model = load_checkpoint('checkpoint.pth')

In [179]:
news = "hello there"

d = {'data': [news]}
df = pd.DataFrame(data=d)

test_set = CustomDataset(df, tokenizer, MAX_LEN, is_target = False)

In [180]:
_params = {'batch_size': 1,
                'shuffle': True,
                'num_workers': 0
                }

_loader = DataLoader(test_set, **_params)

In [181]:
the_model.eval()
the_model.to(device)
for _,data in enumerate(_loader, 0):
  ids = data['ids'].to(device)
  mask = data['mask'].to(device)
  token_type_ids = data['token_type_ids'].to(device)
  
  outputs = the_model(ids, mask, token_type_ids)
  _, predicted = torch.max(outputs.data, 1)

  print(predicted)

tensor([1], device='cuda:0')


