# **Exemple de pré-traitement des données**

In [None]:
import torch
from gensim.models import KeyedVectors
from donnees.nettoyage import load_dataset, clean_dataset, add_columns
from donnees.utils import FakeNewsDataset, ajuster_canaux
from embedding import GloVeModel, tokeniser

## *Nettoyage*

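Ne pas oublier d'importer les données : les trois fichiers CSV (entrainement, validation et test) doivent se trouver dans le dossier `./donnees/`.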

In [None]:
# Training split: load the CSV, then pass it through clean_dataset and add_columns.
data_train = add_columns(clean_dataset(load_dataset("./donnees/Task3_english_training.csv")))
# Validation split: same pipeline.
data_dev = add_columns(clean_dataset(load_dataset("./donnees/Task3_english_dev.csv")))
# Test split: same pipeline.
data_test = add_columns(clean_dataset(load_dataset("./donnees/English_data_test_release_with_rating.csv")))
# Report the size of each split along its first axis.
print(f"Entrainement : {data_train.shape[0]} | Validation : {data_dev.shape[0]} | Test : {data_test.shape[0]}")

## *Embedding*

Ne pas oublier d'importer les embeddings pré-entrainés (fichiers GloVe ou word2vec) dans le dossier `./donnees/`.

In [None]:
# Pick one embedding model: GloVe (active) or word2vec (commented out below).
# The GloVe model is built from the 100-dimensional glove.6B text file.
glove = GloVeModel("./donnees/glove.6B/glove.6B.100d.txt")
# Alternative: uncomment to load the GoogleNews word2vec vectors with gensim instead.
# word2Vec = KeyedVectors.load_word2vec_format("./donnees/GoogleNews-vectors-negative300.bin.gz",binary=True)

In [None]:
# Tokenise the training texts with the GloVe model.
# Per the original note: pad=True returns a tensor, pad=False returns a list.
tokens_train = tokeniser(data_train.full_text, modele=glove, pad=False)
# Show the tensor shape when a tensor came back, otherwise the length of the list.
print(tokens_train.shape if isinstance(tokens_train, torch.Tensor) else len(tokens_train))

In [None]:
# Training target built from the "true" column, stored as float32.
# The column is converted with .tolist() before torch.tensor: handing torch a
# pandas Series directly makes it index the Series by label (0, 1, ...), which
# can raise KeyError once rows have been dropped and the index is no longer
# 0..n-1. NOTE(review): assumes data_train is a pandas DataFrame — confirm.
cible_train_real = torch.tensor(data_train["true"].tolist(), dtype=torch.float32)
# Targets for the other classes; uncomment the one(s) needed.
# cible_train_fake = torch.tensor(data_train["false"].tolist(), dtype=torch.float32)
# cible_train_part = torch.tensor(data_train["partially_false"].tolist(), dtype=torch.float32)
# cible_train_oth = torch.tensor(data_train["other"].tolist(), dtype=torch.float32)

In [None]:
# Tokenise the validation texts with the GloVe model.
# Per the original note: pad=True returns a tensor, pad=False returns a list.
tokens_dev = tokeniser(data_dev.full_text, modele=glove, pad=False)
# Show the tensor shape when a tensor came back, otherwise the length of the list.
print(tokens_dev.shape if isinstance(tokens_dev, torch.Tensor) else len(tokens_dev))

In [None]:
# Validation target built from the "true" column, stored as float32.
# The column is converted with .tolist() before torch.tensor: handing torch a
# pandas Series directly makes it index the Series by label (0, 1, ...), which
# can raise KeyError once rows have been dropped and the index is no longer
# 0..n-1. NOTE(review): assumes data_dev is a pandas DataFrame — confirm.
cible_dev_real = torch.tensor(data_dev["true"].tolist(), dtype=torch.float32)
# Targets for the other classes; uncomment the one(s) needed.
# cible_dev_fake = torch.tensor(data_dev["false"].tolist(), dtype=torch.float32)
# cible_dev_part = torch.tensor(data_dev["partially_false"].tolist(), dtype=torch.float32)
# cible_dev_oth = torch.tensor(data_dev["other"].tolist(), dtype=torch.float32)

In [None]:
# Tokenise the test texts with the GloVe model.
# Per the original note: pad=True returns a tensor, pad=False returns a list.
tokens_test = tokeniser(data_test.full_text, modele=glove, pad=False)
# Show the tensor shape when a tensor came back, otherwise the length of the list.
print(tokens_test.shape if isinstance(tokens_test, torch.Tensor) else len(tokens_test))

In [None]:
# Targets used for evaluation.
# Training and validation targets concatenated into a single tensor.
cible_train_dev_real = torch.cat((cible_train_real, cible_dev_real))
# Test target built from the "true" column. The column is converted with
# .tolist() first: handing torch a pandas Series directly makes it index the
# Series by label, which can raise KeyError when the index is not 0..n-1.
# NOTE(review): this target is torch.int while the train/dev targets are
# created as float32 — confirm the dtype difference is intended.
cible_test_real = torch.tensor(data_test["true"].tolist(), dtype=torch.int)
# Same targets for the other classes; uncomment the one(s) needed.
# cible_train_dev_fake = torch.cat((cible_train_fake,cible_dev_fake))
# cible_test_fake = torch.tensor(data_test["false"].tolist(), dtype=torch.int)
# cible_train_dev_part = torch.cat((cible_train_part,cible_dev_part))
# cible_test_part = torch.tensor(data_test["partially_false"].tolist(), dtype=torch.int)
# cible_train_dev_oth = torch.cat((cible_train_oth,cible_dev_oth))
# cible_test_oth = torch.tensor(data_test["other"].tolist(), dtype=torch.int)

In [None]:
# Build the datasets consumed by the models.
# Only valid for fixed-length sequences, i.e. tokens held as tensors; per the
# original note their layout is (n_phrases, max_mots, n_emb).
# Fail early with an actionable message instead of an opaque
# "'list' object has no attribute 'shape'" when tokeniser was run with pad=False.
if not all(isinstance(t, torch.Tensor) for t in (tokens_train, tokens_dev, tokens_test)):
    raise TypeError(
        "tokens_train, tokens_dev et tokens_test doivent etre des tenseurs : "
        "relancer tokeniser avec pad=True avant de creer les datasets."
    )
# Longest second dimension across the three splits.
# WARNING: this can be very large; it may be replaced by a hand-picked value.
max_mots = max(tokens_train.shape[1], tokens_dev.shape[1], tokens_test.shape[1])
# Datasets for the "true" class: tokens passed through ajuster_canaux with max_mots, paired with their targets.
dataset_train_real = FakeNewsDataset(ajuster_canaux(tokens_train, max_mots), cible_train_real)
dataset_dev_real = FakeNewsDataset(ajuster_canaux(tokens_dev, max_mots), cible_dev_real)
dataset_test_real = FakeNewsDataset(ajuster_canaux(tokens_test, max_mots), cible_test_real)
# Datasets for the other classes; uncomment together with the matching targets.
# dataset_train_fake = FakeNewsDataset(ajuster_canaux(tokens_train,max_mots),cible_train_fake)
# dataset_dev_fake = FakeNewsDataset(ajuster_canaux(tokens_dev,max_mots),cible_dev_fake)
# dataset_test_fake = FakeNewsDataset(ajuster_canaux(tokens_test,max_mots),cible_test_fake)
# dataset_train_part = FakeNewsDataset(ajuster_canaux(tokens_train,max_mots),cible_train_part)
# dataset_dev_part = FakeNewsDataset(ajuster_canaux(tokens_dev,max_mots),cible_dev_part)
# dataset_test_part = FakeNewsDataset(ajuster_canaux(tokens_test,max_mots),cible_test_part)
# dataset_train_oth = FakeNewsDataset(ajuster_canaux(tokens_train,max_mots),cible_train_oth)
# dataset_dev_oth = FakeNewsDataset(ajuster_canaux(tokens_dev,max_mots),cible_dev_oth)
# dataset_test_oth = FakeNewsDataset(ajuster_canaux(tokens_test,max_mots),cible_test_oth)
# Notebook display of the chosen sequence length.
max_mots