In [63]:
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
import sklearn.model_selection
from sklearn import svm
import sklearn.metrics
from sklearn.model_selection import train_test_split

class MonDataset(Dataset):
    """Custom PyTorch dataset that pairs each sentence with its label.

    Args:
        phrases: indexable, sized collection of sentences.
        labels: indexable collection of labels, read at the same index as
            ``phrases``.
        transformations: optional callable applied to a sentence before it is
            returned (for example a lemmatiser); labels are never transformed.
    """

    def __init__(self, phrases, labels, transformations=None):
        """Keep references to the samples and record the dataset size."""
        self.phrases, self.labels = phrases, labels
        # Optional per-sentence preprocessing hook; None means "return as stored".
        self.transform = transformations
        # The size is taken from `phrases` once, at construction time.
        self.len = len(phrases)

    def __len__(self):
        """Return the number of items recorded when the dataset was built."""
        return self.len

    def __getitem__(self, i):
        """Return the ``(sentence, label)`` pair stored at index ``i``."""
        sentence, label = self.phrases[i], self.labels[i]

        # Without a transformation the stored sentence is returned unchanged.
        if not self.transform:
            return sentence, label

        return self.transform(sentence), label

# Load the tokenizer and the model FIRST: the length check below calls
# `tokenizer`, so defining it afterwards only works when a previous run left
# the name in the kernel (the cell would fail after "Restart & Run All").
tokenizer = transformers.CamembertTokenizer.from_pretrained("camembert/camembert-large")
model = transformers.CamembertModel.from_pretrained("camembert/camembert-large")

# Reviews whose encoding (special tokens included) is longer than this are
# dropped. Presumably this is the model's maximum input length -- TODO confirm.
MAX_TOKENS = 512

# NOTE(review): with `names=` supplied, `header=1` appears to discard the first
# two lines of the CSV -- confirm this is intended (vs. header=0 / header=None).
df = pd.read_csv("POS1.csv", names=['message', 'label'], header=1)
# Missing cells become empty strings so every message can be tokenized.
df = df.fillna('')
#df.drop([692,1238],inplace=True)

max_len = 0
too_long = []  # index labels of the reviews that exceed MAX_TOKENS
for i, sent in df["message"].items():
    # Tokenize the text, adding the tokenizer's special tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    if len(input_ids) > MAX_TOKENS:
        print("annoying review at", i,"with length",
              len(input_ids))
        too_long.append(i)
    # Longest encoding seen so far; over-length reviews are included on
    # purpose, exactly as before.
    max_len = max(max_len, len(input_ids))

# Drop every over-length review in one step instead of mutating the frame
# while its column is being iterated.
df = df.drop(index=too_long)

dataset = MonDataset(df['message'].to_list(), df['label'].to_list())
# Batches are shuffled; the sentences and labels of one batch stay paired.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

print("Camembert loaded")

annoying review at 692 with length 1104
annoying review at 1117 with length 888
annoying review at 1238 with length 660
Camembert loaded


In [64]:
print("batch process")
outputNumpy = []  # one embedding vector (NumPy array) per sentence
yBatch = []       # labels, appended in the same order as outputNumpy

# Inference only. `from_pretrained` is documented to return the model in
# evaluation mode already; the call is kept explicit so the cell does not
# depend on that.
model.eval()

# No gradients are needed for feature extraction: disabling autograd avoids
# building a graph for every forward pass.
with torch.no_grad():
    for batch_index, (sentences, labels) in enumerate(dataloader):
        for sentence in sentences:
            # Encode one sentence and add a leading batch dimension of size 1.
            input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
            # Hidden state at the first token position of the last layer.
            first_token_state = model(input_ids)["last_hidden_state"][:, 0]
            outputNumpy.append(first_token_state.squeeze(0).numpy())

        # Numeric labels are collated into a tensor; store plain Python values
        # rather than 0-d tensors. Non-tensor labels are copied as they are.
        yBatch += labels.tolist() if torch.is_tensor(labels) else list(labels)

        # Progress trace: number of samples that preceded this batch, derived
        # from the DataLoader's batch size instead of a hard-coded constant.
        print(batch_index * dataloader.batch_size)

batch process
0
16
32
48
64
80
96
112
128
144
160
176
192
208
224
240
256
272
288
304
320
336
352
368
384
400
416
432
448
464
480
496
512
528
544
560
576
592
608
624
640
656
672
688
704
720
736
752
768
784
800
816
832
848
864
880
896
912
928
944
960
976
992
1008
1024
1040
1056
1072
1088
1104
1120
1136
1152
1168
1184
1200
1216
1232
1248
1264
1280
1296
1312


In [65]:
# Labels: the very same list object that the extraction loop filled (no copy).
y = yBatch
# Feature table built from the per-sentence embedding vectors, one row each.
X = pd.DataFrame(data=outputNumpy)

In [66]:
# Ten random splits, each holding out 30% of the samples for testing; the
# fixed seed makes the splits repeatable.
cv = sklearn.model_selection.ShuffleSplit(
    test_size=0.3,
    n_splits=10,
    random_state=0,
)

# Support-vector classifier with a polynomial kernel.
clf = svm.SVC(C=1, kernel='poly', random_state=42)

# One F1 score per split.
results = cross_val_score(estimator=clf, X=X, y=y, scoring="f1", cv=cv)

print("F-score de la validation croisée : ", results)
print("Moyenne des F-score", results.mean())

F-score de la validation croisée :  [0.85869565 0.88619855 0.83673469 0.859375   0.84634761 0.85
 0.82849604 0.83412322 0.85514019 0.80965147]
Moyenne des F-score 0.8464762426731749
