# Baseline

## Bibliothèques

In [None]:
import numpy as np
import torch
import torch.optim as optim
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from donnees.nettoyage import FakeNews_Task3_2022_V0
from donnees.utils import FakeNewsDataset, ajuster_canaux
from embedding import GloVeModel, tokeniser
from modeles import baseLSTM, baseBiLSTM, train_seq_var, baseCNN, train_seq_fix, evaluation

## Données

In [None]:
# Load and clean the three English splits (train / dev / test) through the
# project's FakeNews_Task3_2022_V0 helper; the second argument names the split.
data_train = FakeNews_Task3_2022_V0(
    "./donnees/FakeNews_Task3_2022_V0/Task3_train_dev/Task3_english_training.csv",
    "train",
)
data_dev = FakeNews_Task3_2022_V0(
    "./donnees/FakeNews_Task3_2022_V0/Task3_train_dev/Task3_english_dev.csv",
    "dev",
)
data_test = FakeNews_Task3_2022_V0(
    "./donnees/FakeNews_Task3_2022_V0/Task3_Test/English_data_test_release_with_rating.csv",
    "test",
)

# Report how many rows each split contains.
print(f"Entrainement : {data_train.shape[0]} | Validation : {data_dev.shape[0]} | Test : {data_test.shape[0]}")

In [None]:
# Derive the model input text and the target columns on every split.
def ajouter_cibles(data):
    """Add the text and label columns used by the models, in place.

    Columns written:
      - ``full_text``   : title and text joined by a single space.
      - ``label$true``  : 1 when ``our_rating`` is exactly "true", else 0.
      - ``label$false`` : 1 when ``our_rating`` contains "false", else 0.
      - ``label``       : 0 when the rating contains "false", 1 when it is
                          "true", 2 for everything else.

    Returns the same frame so the call can be chained.
    """
    # na=False keeps the mask strictly boolean: a missing rating would otherwise
    # yield NaN, which np.where treats as truthy and np.select rejects.
    est_faux = data.our_rating.str.contains("false", na=False)
    est_vrai = data.our_rating == "true"

    data["full_text"] = data.title + " " + data.text
    data["label$true"] = np.where(est_vrai, 1, 0)
    data["label$false"] = np.where(est_faux, 1, 0)
    # Conditions are tested in order, so "false" wins over "true"; 2 is the fallback.
    data["label"] = np.select([est_faux, est_vrai], [0, 1], 2)
    return data


# Same treatment for the three splits (replaces three copy-pasted stanzas).
for donnees in (data_train, data_dev, data_test):
    ajouter_cibles(donnees)

In [None]:
# Word embeddings: load the GloVe 6B vectors (file name says 100 dimensions)
# through the project's GloVeModel wrapper.
# Alternative: uncomment the word2vec line below to load the GoogleNews vectors
# instead. NOTE(review): the file name suggests 300 dimensions, so the models'
# input_size would presumably need to change as well — confirm.
chemin_glove = "./donnees/glove.6B/glove.6B.100d.txt"
glove = GloVeModel(chemin_glove)
# word2Vec = KeyedVectors.load_word2vec_format("./donnees/GoogleNews-vectors-negative300.bin.gz",binary=True)

In [None]:
# Tokenise the training texts with the GloVe model, asking for padding.
tokens_train = tokeniser(data_train.full_text, modele=glove, pad=True)
# Show the shape when a tensor comes back, otherwise the number of items.
print(tokens_train.shape if isinstance(tokens_train, torch.Tensor) else len(tokens_train))

In [None]:
# Binary training targets as float32 tensors: one per detector ("true" / "false").
# Go through NumPy first: torch.tensor() on a pandas Series walks it element by
# element with label-based lookups, which is slow and can raise KeyError when
# the index is not 0..n-1 (e.g. rows dropped during cleaning).
cible_train_real = torch.tensor(data_train["label$true"].to_numpy(), dtype=torch.float32)
cible_train_fake = torch.tensor(data_train["label$false"].to_numpy(), dtype=torch.float32)

In [None]:
# Tokenise the validation (dev) texts with the GloVe model, asking for padding.
tokens_dev = tokeniser(data_dev.full_text, modele=glove, pad=True)
# Show the shape when a tensor comes back, otherwise the number of items.
print(tokens_dev.shape if isinstance(tokens_dev, torch.Tensor) else len(tokens_dev))

In [None]:
# Binary validation targets as float32 tensors: one per detector ("true" / "false").
# Go through NumPy first: torch.tensor() on a pandas Series walks it element by
# element with label-based lookups, which is slow and can raise KeyError when
# the index is not 0..n-1 (e.g. rows dropped during cleaning).
cible_dev_real = torch.tensor(data_dev["label$true"].to_numpy(), dtype=torch.float32)
cible_dev_fake = torch.tensor(data_dev["label$false"].to_numpy(), dtype=torch.float32)

In [None]:
# Tokenise the test texts with the GloVe model, asking for padding.
tokens_test = tokeniser(data_test.full_text, modele=glove, pad=True)
# Show the shape when a tensor comes back, otherwise the number of items.
print(tokens_test.shape if isinstance(tokens_test, torch.Tensor) else len(tokens_test))

In [None]:
# Targets used for evaluation.
# Train and dev targets are concatenated (train first, then dev); predictions
# must be concatenated in the same order to line up.
# Series are converted with .to_numpy() before torch.tensor(): passing a Series
# directly is slow and can raise KeyError when its index is not 0..n-1.

# Binary "true" detector.
cible_train_dev_real = torch.cat((cible_train_real, cible_dev_real))
cible_test_real = torch.tensor(data_test["label$true"].to_numpy(), dtype=torch.int)

# Binary "false" detector.
cible_train_dev_fake = torch.cat((cible_train_fake, cible_dev_fake))
cible_test_fake = torch.tensor(data_test["label$false"].to_numpy(), dtype=torch.int)

# Three-class label (0 = false, 1 = true, 2 = other) for the combined model.
cible_train_dev = torch.cat((
    torch.tensor(data_train["label"].to_numpy()),
    torch.tensor(data_dev["label"].to_numpy()),
))
cible_test = torch.tensor(data_test["label"].to_numpy())

In [None]:
# Datasets for the models that need fixed-length sequences only.
# max_mots is the largest second dimension among the three token tensors
# (presumably the number of words per text). WARNING: it can be very large;
# replace it with a hand-picked value if needed.
max_mots = max(tokens_train.shape[1], tokens_dev.shape[1], tokens_test.shape[1])

# Adjust each split once and share the result between the "real" and "fake"
# datasets instead of recomputing (and storing) it twice.
# NOTE(review): assumes FakeNewsDataset does not modify its inputs in place — confirm.
X_train_fix = ajuster_canaux(tokens_train, max_mots)
X_dev_fix = ajuster_canaux(tokens_dev, max_mots)
X_test_fix = ajuster_canaux(tokens_test, max_mots)

dataset_train_real = FakeNewsDataset(X_train_fix, cible_train_real)
dataset_dev_real = FakeNewsDataset(X_dev_fix, cible_dev_real)
dataset_test_real = FakeNewsDataset(X_test_fix, cible_test_real)
dataset_train_fake = FakeNewsDataset(X_train_fix, cible_train_fake)
dataset_dev_fake = FakeNewsDataset(X_dev_fix, cible_dev_fake)
dataset_test_fake = FakeNewsDataset(X_test_fix, cible_test_fake)
max_mots

## LSTM-RNN 
Pour séquences de texte de longueur variable ou fixe.

### Prédire "real" news

In [None]:
# Build the LSTM for the "real" news target and its Adam optimiser (lr = 1e-4).
# Pick the device at run time (Apple MPS, then CUDA, then CPU) so the cell does
# not fail on machines without MPS.
# NOTE(review): assumes baseLSTM accepts any torch device string — confirm.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
lstm_real = baseLSTM(input_size=100, hidden_size=100, seq="var", device=device)
optimizer = optim.Adam(lstm_real.parameters(), lr=1e-4)
lstm_real

In [None]:
# Train the "real" LSTM for at most 10 epochs, validating on the dev split.
# Pick the device at run time (Apple MPS, then CUDA, then CPU) instead of
# hard-coding "mps", which fails on machines without that backend.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
train_seq_var(
    lstm_real,
    optimizer,
    max_epochs=10,
    X_train=tokens_train,
    y_train=cible_train_real,
    X_val=tokens_dev,
    y_val=cible_dev_real,
    device=device,
    verbose=1,
)

In [None]:
# Predict with the "real" LSTM. Train and dev predictions are concatenated in
# that order, the same order used to build cible_train_dev_real.
pred_train_dev_real = torch.cat([lstm_real.predict(lot) for lot in (tokens_train, tokens_dev)])
pred_test_real = lstm_real.predict(tokens_test)

In [None]:
# Run the project's evaluation helper for the "real" detector:
# first on train + dev, then on the test set.
for cibles, predictions, nom in (
    (cible_train_dev_real, pred_train_dev_real, "entrainement + dev"),
    (cible_test_real, pred_test_real, "test"),
):
    evaluation(cibles, predictions, nom)

### Prédire "fake" news

In [None]:
# Build the LSTM for the "fake" news target and its Adam optimiser (lr = 1e-4).
# Pick the device at run time (Apple MPS, then CUDA, then CPU) so the cell does
# not fail on machines without MPS.
# NOTE(review): assumes baseLSTM accepts any torch device string — confirm.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
lstm_fake = baseLSTM(input_size=100, hidden_size=100, seq="var", device=device)
optimizer = optim.Adam(lstm_fake.parameters(), lr=1e-4)
lstm_fake

In [None]:
# Train the "fake" LSTM for at most 10 epochs, validating on the dev split.
# Pick the device at run time (Apple MPS, then CUDA, then CPU) instead of
# hard-coding "mps", which fails on machines without that backend.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
train_seq_var(
    lstm_fake,
    optimizer,
    max_epochs=10,
    X_train=tokens_train,
    y_train=cible_train_fake,
    X_val=tokens_dev,
    y_val=cible_dev_fake,
    device=device,
    verbose=1,
)

In [None]:
# Predict with the "fake" LSTM. Train and dev predictions are concatenated in
# that order, the same order used to build cible_train_dev_fake.
pred_train_dev_fake = torch.cat([lstm_fake.predict(lot) for lot in (tokens_train, tokens_dev)])
pred_test_fake = lstm_fake.predict(tokens_test)

In [None]:
# Run the project's evaluation helper for the "fake" detector:
# first on train + dev, then on the test set.
for cibles, predictions, nom in (
    (cible_train_dev_fake, pred_train_dev_fake, "entrainement + dev"),
    (cible_test_fake, pred_test_fake, "test"),
):
    evaluation(cibles, predictions, nom)

### Combinaison

In [None]:
# Merge the two binary detectors into one three-class prediction:
#   0 when "real" says 0 and "fake" says 1,
#   1 when "real" says 1 and "fake" says 0,
#   2 in every other case.
faux_train_dev = (pred_train_dev_real == 0) & (pred_train_dev_fake == 1)
vrai_train_dev = (pred_train_dev_real == 1) & (pred_train_dev_fake == 0)
pred_train_dev = np.select([faux_train_dev, vrai_train_dev], [0, 1], default=2)

faux_test = (pred_test_real == 0) & (pred_test_fake == 1)
vrai_test = (pred_test_real == 1) & (pred_test_fake == 0)
pred_test = np.select([faux_test, vrai_test], [0, 1], default=2)

In [None]:
# Accuracy (in percent) of the combined three-class prediction on each set.
justesse_train_dev = 100*accuracy_score(cible_train_dev, pred_train_dev)
print("Justesse entrainement + dev : {:.2f}%".format(justesse_train_dev))
justesse_test = 100*accuracy_score(cible_test, pred_test)
print("Justesse test : {:.2f}%".format(justesse_test))

# Confusion matrix with raw counts (normalize=None) for train + dev.
ConfusionMatrixDisplay.from_predictions(y_true=cible_train_dev, y_pred=pred_train_dev, normalize=None)
plt.title("Matrice de confusion - Données entrainement + dev")
plt.show()
plt.close()

# Same display for the test set.
ConfusionMatrixDisplay.from_predictions(y_true=cible_test, y_pred=pred_test, normalize=None)
plt.title("Matrice de confusion - Données test")
plt.show()

## Bi-LSTM-RNN 
Pour séquences de texte de longueur variable ou fixe.

### Prédire "real" news

In [None]:
# Build the bidirectional LSTM for the "real" news target and its Adam optimiser (lr = 1e-4).
# Pick the device at run time (Apple MPS, then CUDA, then CPU) so the cell does
# not fail on machines without MPS.
# NOTE(review): assumes baseBiLSTM accepts any torch device string — confirm.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
bilstm_real = baseBiLSTM(input_size=100, hidden_size=100, seq="var", device=device)
optimizer = optim.Adam(bilstm_real.parameters(), lr=1e-4)
bilstm_real

In [None]:
# Train the "real" Bi-LSTM for at most 10 epochs, validating on the dev split.
# Pick the device at run time (Apple MPS, then CUDA, then CPU) instead of
# hard-coding "mps", which fails on machines without that backend.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
train_seq_var(
    bilstm_real,
    optimizer,
    max_epochs=10,
    X_train=tokens_train,
    y_train=cible_train_real,
    X_val=tokens_dev,
    y_val=cible_dev_real,
    device=device,
    verbose=1,
)

In [None]:
# Predict with the "real" Bi-LSTM. Train and dev predictions are concatenated in
# that order, the same order used to build cible_train_dev_real.
pred_train_dev_real = torch.cat([bilstm_real.predict(lot) for lot in (tokens_train, tokens_dev)])
pred_test_real = bilstm_real.predict(tokens_test)

In [None]:
# Run the project's evaluation helper for the "real" detector:
# first on train + dev, then on the test set.
for cibles, predictions, nom in (
    (cible_train_dev_real, pred_train_dev_real, "entrainement + dev"),
    (cible_test_real, pred_test_real, "test"),
):
    evaluation(cibles, predictions, nom)

### Prédire "fake" news

In [None]:
# Build the bidirectional LSTM for the "fake" news target and its Adam optimiser (lr = 1e-4).
# Pick the device at run time (Apple MPS, then CUDA, then CPU) so the cell does
# not fail on machines without MPS.
# NOTE(review): assumes baseBiLSTM accepts any torch device string — confirm.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
bilstm_fake = baseBiLSTM(input_size=100, hidden_size=100, seq="var", device=device)
optimizer = optim.Adam(bilstm_fake.parameters(), lr=1e-4)
bilstm_fake

In [None]:
# Train the "fake" Bi-LSTM for at most 10 epochs, validating on the dev split.
# Pick the device at run time (Apple MPS, then CUDA, then CPU) instead of
# hard-coding "mps", which fails on machines without that backend.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
train_seq_var(
    bilstm_fake,
    optimizer,
    max_epochs=10,
    X_train=tokens_train,
    y_train=cible_train_fake,
    X_val=tokens_dev,
    y_val=cible_dev_fake,
    device=device,
    verbose=1,
)

In [None]:
# Predict with the "fake" Bi-LSTM. Train and dev predictions are concatenated in
# that order, the same order used to build cible_train_dev_fake.
pred_train_dev_fake = torch.cat([bilstm_fake.predict(lot) for lot in (tokens_train, tokens_dev)])
pred_test_fake = bilstm_fake.predict(tokens_test)

In [None]:
# Run the project's evaluation helper for the "fake" detector:
# first on train + dev, then on the test set.
for cibles, predictions, nom in (
    (cible_train_dev_fake, pred_train_dev_fake, "entrainement + dev"),
    (cible_test_fake, pred_test_fake, "test"),
):
    evaluation(cibles, predictions, nom)

### Combinaison

In [None]:
# Merge the two binary detectors into one three-class prediction:
#   0 when "real" says 0 and "fake" says 1,
#   1 when "real" says 1 and "fake" says 0,
#   2 in every other case.
faux_train_dev = (pred_train_dev_real == 0) & (pred_train_dev_fake == 1)
vrai_train_dev = (pred_train_dev_real == 1) & (pred_train_dev_fake == 0)
pred_train_dev = np.select([faux_train_dev, vrai_train_dev], [0, 1], default=2)

faux_test = (pred_test_real == 0) & (pred_test_fake == 1)
vrai_test = (pred_test_real == 1) & (pred_test_fake == 0)
pred_test = np.select([faux_test, vrai_test], [0, 1], default=2)

In [None]:
# Accuracy (in percent) of the combined three-class prediction on each set.
justesse_train_dev = 100*accuracy_score(cible_train_dev, pred_train_dev)
print("Justesse entrainement + dev : {:.2f}%".format(justesse_train_dev))
justesse_test = 100*accuracy_score(cible_test, pred_test)
print("Justesse test : {:.2f}%".format(justesse_test))

# Confusion matrix with raw counts (normalize=None) for train + dev.
ConfusionMatrixDisplay.from_predictions(y_true=cible_train_dev, y_pred=pred_train_dev, normalize=None)
plt.title("Matrice de confusion - Données entrainement + dev")
plt.show()
plt.close()

# Same display for the test set.
ConfusionMatrixDisplay.from_predictions(y_true=cible_test, y_pred=pred_test, normalize=None)
plt.title("Matrice de confusion - Données test")
plt.show()

## CNN
Pour séquences de texte de longueur fixe.

### Prédire "real" news

In [None]:
# Build the CNN for the "real" news target and its Adam optimiser (lr = 1e-4).
# in_channels is set to max_mots, the fixed sequence length of the datasets.
# Pick the device at run time (Apple MPS, then CUDA, then CPU) so the cell does
# not fail on machines without MPS.
# NOTE(review): assumes baseCNN accepts any torch device string — confirm.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
cnn_real = baseCNN(input_size=100, in_channels=max_mots, out_channels=128, kernel_size=5, device=device)
optimizer = optim.Adam(cnn_real.parameters(), lr=1e-4)
cnn_real

In [None]:
# Train the "real" CNN for at most 10 epochs on the fixed-length datasets,
# validating on the dev dataset (taille_batch=1, melanger=True).
# Pick the device at run time (Apple MPS, then CUDA, then CPU) instead of
# hard-coding "mps", which fails on machines without that backend.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
train_seq_fix(
    cnn_real,
    optimizer,
    max_epochs=10,
    Xy_train=dataset_train_real,
    Xy_val=dataset_dev_real,
    taille_batch=1,
    melanger=True,
    device=device,
    verbose=1,
)

In [None]:
# Predict with the "real" CNN on the fixed-length inputs held by the datasets.
# Train and dev predictions are concatenated in that order, the same order used
# to build cible_train_dev_real.
pred_train_dev_real = torch.cat(
    [cnn_real.predict(jeu.X) for jeu in (dataset_train_real, dataset_dev_real)]
)
pred_test_real = cnn_real.predict(dataset_test_real.X)

In [None]:
# Run the project's evaluation helper for the "real" detector:
# first on train + dev, then on the test set.
for cibles, predictions, nom in (
    (cible_train_dev_real, pred_train_dev_real, "entrainement + dev"),
    (cible_test_real, pred_test_real, "test"),
):
    evaluation(cibles, predictions, nom)

### Prédire "fake" news

In [None]:
# Build the CNN for the "fake" news target and its Adam optimiser (lr = 1e-4).
# in_channels is set to max_mots, the fixed sequence length of the datasets.
# Pick the device at run time (Apple MPS, then CUDA, then CPU) so the cell does
# not fail on machines without MPS.
# NOTE(review): assumes baseCNN accepts any torch device string — confirm.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
cnn_fake = baseCNN(input_size=100, in_channels=max_mots, out_channels=128, kernel_size=5, device=device)
optimizer = optim.Adam(cnn_fake.parameters(), lr=1e-4)
cnn_fake

In [None]:
# Train the "fake" CNN for at most 10 epochs on the fixed-length datasets,
# validating on the dev dataset (taille_batch=1, melanger=True).
# Pick the device at run time (Apple MPS, then CUDA, then CPU) instead of
# hard-coding "mps", which fails on machines without that backend.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
train_seq_fix(
    cnn_fake,
    optimizer,
    max_epochs=10,
    Xy_train=dataset_train_fake,
    Xy_val=dataset_dev_fake,
    taille_batch=1,
    melanger=True,
    device=device,
    verbose=1,
)

In [None]:
# Predict with the "fake" CNN on the fixed-length inputs held by the datasets.
# Train and dev predictions are concatenated in that order, the same order used
# to build cible_train_dev_fake.
pred_train_dev_fake = torch.cat(
    [cnn_fake.predict(jeu.X) for jeu in (dataset_train_fake, dataset_dev_fake)]
)
pred_test_fake = cnn_fake.predict(dataset_test_fake.X)

In [None]:
# Run the project's evaluation helper for the "fake" detector:
# first on train + dev, then on the test set.
for cibles, predictions, nom in (
    (cible_train_dev_fake, pred_train_dev_fake, "entrainement + dev"),
    (cible_test_fake, pred_test_fake, "test"),
):
    evaluation(cibles, predictions, nom)

### Combinaison

In [None]:
# Merge the two binary detectors into one three-class prediction:
#   0 when "real" says 0 and "fake" says 1,
#   1 when "real" says 1 and "fake" says 0,
#   2 in every other case.
faux_train_dev = (pred_train_dev_real == 0) & (pred_train_dev_fake == 1)
vrai_train_dev = (pred_train_dev_real == 1) & (pred_train_dev_fake == 0)
pred_train_dev = np.select([faux_train_dev, vrai_train_dev], [0, 1], default=2)

faux_test = (pred_test_real == 0) & (pred_test_fake == 1)
vrai_test = (pred_test_real == 1) & (pred_test_fake == 0)
pred_test = np.select([faux_test, vrai_test], [0, 1], default=2)

In [None]:
# Accuracy (in percent) of the combined three-class prediction on each set.
justesse_train_dev = 100*accuracy_score(cible_train_dev, pred_train_dev)
print("Justesse entrainement + dev : {:.2f}%".format(justesse_train_dev))
justesse_test = 100*accuracy_score(cible_test, pred_test)
print("Justesse test : {:.2f}%".format(justesse_test))

# Confusion matrix with raw counts (normalize=None) for train + dev.
ConfusionMatrixDisplay.from_predictions(y_true=cible_train_dev, y_pred=pred_train_dev, normalize=None)
plt.title("Matrice de confusion - Données entrainement + dev")
plt.show()
plt.close()

# Same display for the test set.
ConfusionMatrixDisplay.from_predictions(y_true=cible_test, y_pred=pred_test, normalize=None)
plt.title("Matrice de confusion - Données test")
plt.show()