# Modélisation

## Librairies

In [None]:
import numpy as np
import torch
import torch.optim as optim
from gensim.models import KeyedVectors
from torch.utils.tensorboard import SummaryWriter
from donnees.nettoyage import FakeNews_Task3_2022_V0
from donnees.utils import FakeNewsDataset, ajuster_canaux
from embedding import GloVeModel, tokeniser
from modeles import CNNRNN, train_seq_fix, evaluation

## Données

In [None]:
# Load and clean the three splits of the FakeNews Task 3 (2022) English corpus.
data_train = FakeNews_Task3_2022_V0(
    "./donnees/FakeNews_Task3_2022_V0/Task3_train_dev/Task3_english_training.csv",
    "train",
)
data_dev = FakeNews_Task3_2022_V0(
    "./donnees/FakeNews_Task3_2022_V0/Task3_train_dev/Task3_english_dev.csv",
    "dev",
)
data_test = FakeNews_Task3_2022_V0(
    "./donnees/FakeNews_Task3_2022_V0/Task3_Test/English_data_test_release_with_rating.csv",
    "test",
)

# Row count of each split, printed as a quick sanity check.
n_train, n_dev, n_test = (d.shape[0] for d in (data_train, data_dev, data_test))
print(f"Entrainement : {n_train} | Validation : {n_dev} | Test : {n_test}")

In [None]:
# Derive the modelling columns on every split. The three splits receive exactly
# the same treatment, so it is applied in one loop instead of three copies.
#   full_text   : title and body joined by a single space
#   label$true  : 1 when our_rating is exactly "true", else 0
#   label$false : 1 when our_rating contains "false", else 0
#   label       : 0 = contains "false", 1 = exactly "true", 2 = anything else
# NOTE: the frames are modified in place.
for split_df in (data_train, data_dev, data_test):
    # Each mask is computed once and reused for the binary and 3-class labels.
    est_vrai = split_df.our_rating == "true"
    est_faux = split_df.our_rating.str.contains("false")

    split_df["full_text"] = split_df.title + " " + split_df.text
    split_df["label$true"] = np.where(est_vrai, 1, 0)
    split_df["label$false"] = np.where(est_faux, 1, 0)
    # np.select takes the first matching condition, so "false" has priority.
    split_df["label"] = np.select([est_faux, est_vrai], [0, 1], 2)

In [None]:
# Word-embedding model handed to the tokeniser.
# GloVe is the active choice; the word2vec line is kept as a commented-out alternative.
chemin_glove = "./donnees/glove.6B/glove.6B.100d.txt"
glove = GloVeModel(chemin_glove)
# word2Vec = KeyedVectors.load_word2vec_format("./donnees/GoogleNews-vectors-negative300.bin.gz",binary=True)

In [None]:
# Tokenise the training texts with the GloVe model.
# With pad=True the tokeniser returns a tensor, otherwise a list.
tokens_train = tokeniser(data_train.full_text, modele=glove, pad=True)
# Report the tensor shape, or the number of items when a list came back.
print(tokens_train.shape if isinstance(tokens_train, torch.Tensor) else len(tokens_train))

In [None]:
# Training targets: one 0/1 float32 vector per binary classifier.
# The Series go through .to_numpy() first. Given a Series directly, torch.tensor()
# treats it as a generic sequence and reads it with series[i], which pandas
# resolves by label: slow, and a KeyError whenever the index is not 0..n-1.
cible_train_real = torch.tensor(data_train["label$true"].to_numpy(), dtype=torch.float32)
cible_train_fake = torch.tensor(data_train["label$false"].to_numpy(), dtype=torch.float32)

In [None]:
# Tokenise the validation texts with the GloVe model.
# With pad=True the tokeniser returns a tensor, otherwise a list.
tokens_dev = tokeniser(data_dev.full_text, modele=glove, pad=True)
# Report the tensor shape, or the number of items when a list came back.
print(tokens_dev.shape if isinstance(tokens_dev, torch.Tensor) else len(tokens_dev))

In [None]:
# Validation targets: one 0/1 float32 vector per binary classifier.
# The Series go through .to_numpy() first. Given a Series directly, torch.tensor()
# treats it as a generic sequence and reads it with series[i], which pandas
# resolves by label: slow, and a KeyError whenever the index is not 0..n-1.
cible_dev_real = torch.tensor(data_dev["label$true"].to_numpy(), dtype=torch.float32)
cible_dev_fake = torch.tensor(data_dev["label$false"].to_numpy(), dtype=torch.float32)

In [None]:
# Tokenise the test texts with the GloVe model.
# With pad=True the tokeniser returns a tensor, otherwise a list.
tokens_test = tokeniser(data_test.full_text, modele=glove, pad=True)
# Report the tensor shape, or the number of items when a list came back.
print(tokens_test.shape if isinstance(tokens_test, torch.Tensor) else len(tokens_test))

In [None]:
# Targets used at evaluation time.
# Train and dev targets are concatenated in that order, the same order used
# when the train and dev predictions are concatenated.
cible_train_dev_real = torch.cat((cible_train_real, cible_dev_real))
cible_train_dev_fake = torch.cat((cible_train_fake, cible_dev_fake))

# Series are converted with .to_numpy() before torch.tensor(): fed a Series directly,
# torch reads it item by item through label-based series[i] lookups, which is slow
# and raises KeyError when the index is not 0..n-1.
# NOTE(review): the test targets are integer tensors while the train+dev targets
# above are float32 -- confirm that `evaluation` is fine with both.
cible_test_real = torch.tensor(data_test["label$true"].to_numpy(), dtype=torch.int)
cible_test_fake = torch.tensor(data_test["label$false"].to_numpy(), dtype=torch.int)

# 3-class targets (0 = false, 1 = true, 2 = other) for the combined model.
cible_train_dev = torch.cat(
    (
        torch.tensor(data_train["label"].to_numpy()),
        torch.tensor(data_dev["label"].to_numpy()),
    )
)
cible_test = torch.tensor(data_test["label"].to_numpy())

In [None]:
# Build the datasets consumed by the models (fixed-length sequences only).
# Largest second dimension across the three token tensors (presumably the padded
# sequence length -- confirm against `tokeniser`). It can be very large, so it is
# immediately replaced by a hand-picked value; change that value as needed.
max_mots = max(t.shape[1] for t in (tokens_train, tokens_dev, tokens_test))
max_mots = 300

# Each split is adjusted to `max_mots` and paired with the target of each classifier.
# Training split
dataset_train_real = FakeNewsDataset(ajuster_canaux(tokens_train, max_mots), cible_train_real)
dataset_train_fake = FakeNewsDataset(ajuster_canaux(tokens_train, max_mots), cible_train_fake)
# Validation split
dataset_dev_real = FakeNewsDataset(ajuster_canaux(tokens_dev, max_mots), cible_dev_real)
dataset_dev_fake = FakeNewsDataset(ajuster_canaux(tokens_dev, max_mots), cible_dev_fake)
# Test split
dataset_test_real = FakeNewsDataset(ajuster_canaux(tokens_test, max_mots), cible_test_real)
dataset_test_fake = FakeNewsDataset(ajuster_canaux(tokens_test, max_mots), cible_test_fake)

# Display the value actually used.
max_mots

## Hybride CNN-RNN 
Pour séquences de texte de longueur fixe.  
Source : https://www.sciencedirect.com/science/article/pii/S2667096820300070

### Prédire "real" news

In [None]:
# TensorBoard writer for the "real" classifier run.
writer = SummaryWriter(log_dir='runs/modelisation/cnn_rnn/real')

In [None]:
# Instantiate the hybrid CNN-RNN for the "real" target, with its Adam optimiser.
hybrid_real = CNNRNN(
    input_size=100,   # presumably the 100-d GloVe vector size -- confirm against CNNRNN
    in_channels=300,  # presumably has to match max_mots used for the datasets -- confirm
    out_channels=128,
    kernel_size=5,
    hidden_size=32,
    device="mps",     # PyTorch's Apple Metal backend; adjust on other hardware
)
optimizer = optim.Adam(params=hybrid_real.parameters(), lr=1e-4)
# Display the model.
hybrid_real

In [None]:
# Log the model graph to TensorBoard, using the training inputs moved to the "mps" device
# as the example input.
# NOTE(review): the whole training tensor is passed here; a small slice may be
# enough to record the graph -- confirm before changing.
writer.add_graph(model=hybrid_real, input_to_model=dataset_train_real.X.to("mps"))
writer.flush()

In [None]:
# Train the "real" classifier; the dev dataset is passed as validation data and
# the TensorBoard writer is handed to the training routine.
train_seq_fix(
    hybrid_real,
    optimizer,
    max_epochs=5,
    Xy_train=dataset_train_real,
    Xy_val=dataset_dev_real,
    taille_batch=1,
    melanger=True,
    device="mps",
    writer=writer,
    verbose=1,
)
# Make sure everything logged so far is written to disk.
writer.flush()

In [None]:
# Generate predictions with the "real" classifier.
# Train and dev predictions are concatenated in that order (same order as the targets).
pred_train_dev_real = torch.cat(
    [hybrid_real.predict(d.X) for d in (dataset_train_real, dataset_dev_real)]
)
pred_test_real = hybrid_real.predict(dataset_test_real.X)

In [None]:
# Evaluate the "real" classifier: first on train+dev, then on the test split.
# The third argument is the label passed to `evaluation` for each call.
evaluation(cible_train_dev_real,pred_train_dev_real,"entrainement + dev")
evaluation(cible_test_real,pred_test_real,"test")

In [None]:
# Close the TensorBoard writer of the "real" run before the next run opens its own.
writer.close()

### Prédire "fake" news

In [None]:
# TensorBoard writer for the "fake" classifier run.
writer = SummaryWriter(log_dir='runs/modelisation/cnn_rnn/fake')

In [None]:
# Instantiate the hybrid CNN-RNN for the "fake" target, with its Adam optimiser.
hybrid_fake = CNNRNN(
    input_size=100,   # presumably the 100-d GloVe vector size -- confirm against CNNRNN
    in_channels=300,  # presumably has to match max_mots used for the datasets -- confirm
    out_channels=128,
    kernel_size=5,
    hidden_size=32,
    device="mps",     # PyTorch's Apple Metal backend; adjust on other hardware
)
optimizer = optim.Adam(params=hybrid_fake.parameters(), lr=1e-4)
# Display the model.
hybrid_fake

In [None]:
# Log the model graph to TensorBoard, using the training inputs moved to the "mps" device
# as the example input.
# NOTE(review): the whole training tensor is passed here; a small slice may be
# enough to record the graph -- confirm before changing.
writer.add_graph(model=hybrid_fake, input_to_model=dataset_train_fake.X.to("mps"))
writer.flush()

In [None]:
# Train the "fake" classifier; the dev dataset is passed as validation data and
# the TensorBoard writer is handed to the training routine.
train_seq_fix(
    hybrid_fake,
    optimizer,
    max_epochs=5,
    Xy_train=dataset_train_fake,
    Xy_val=dataset_dev_fake,
    taille_batch=1,
    melanger=True,
    device="mps",
    writer=writer,
    verbose=1,
)
# Make sure everything logged so far is written to disk.
writer.flush()

In [None]:
# Generate predictions with the "fake" classifier.
# Train and dev predictions are concatenated in that order (same order as the targets).
pred_train_dev_fake = torch.cat(
    [hybrid_fake.predict(d.X) for d in (dataset_train_fake, dataset_dev_fake)]
)
pred_test_fake = hybrid_fake.predict(dataset_test_fake.X)

In [None]:
# Evaluate the "fake" classifier: first on train+dev, then on the test split.
# The third argument is the label passed to `evaluation` for each call.
evaluation(cible_train_dev_fake,pred_train_dev_fake,"entrainement + dev")
evaluation(cible_test_fake,pred_test_fake,"test")

In [None]:
# Close the TensorBoard writer of the "fake" run.
writer.close()

### Combinaison

In [None]:
# Merge the two binary classifiers into one 3-class prediction:
#   0 -> "real" model says 0 and "fake" model says 1
#   1 -> "real" model says 1 and "fake" model says 0
#   2 -> any other combination (the two models agree)
faux_train_dev = (pred_train_dev_real == 0) & (pred_train_dev_fake == 1)
vrai_train_dev = (pred_train_dev_real == 1) & (pred_train_dev_fake == 0)
pred_train_dev = np.select([faux_train_dev, vrai_train_dev], [0, 1], 2)

faux_test = (pred_test_real == 0) & (pred_test_fake == 1)
vrai_test = (pred_test_real == 1) & (pred_test_fake == 0)
pred_test = np.select([faux_test, vrai_test], [0, 1], 2)

In [None]:
# Evaluate the combined 3-class predictions: first on train+dev, then on the test split.
evaluation(cible_train_dev,pred_train_dev,"entrainement + dev",multi=True)
# The test evaluation carries the "test" label, like the per-classifier test evaluations.
evaluation(cible_test,pred_test,"test",multi=True)