# Baseline

## Bibliothèques

In [None]:
import numpy as np
import gensim
import torch
import torch.optim as optim
from sklearn.metrics import accuracy_score
from donnees.nettoyage import FakeNews_Task3_2022_V0
from embedding import GloVeModel, tokeniser
from modeles import LSTM, train_seq_var, evaluation

## Données

In [None]:
# Load and clean the three splits (train / dev / test) with the project's
# FakeNews_Task3_2022_V0 loader. Each entry is (CSV path, split name); the
# calls happen in the listed order and are unpacked into the three frames.
data_train, data_dev, data_test = (
    FakeNews_Task3_2022_V0(chemin, partition)
    for chemin, partition in (
        ("./donnees/FakeNews_Task3_2022_V0/Task3_train_dev/Task3_english_training.csv", "train"),
        ("./donnees/FakeNews_Task3_2022_V0/Task3_train_dev/Task3_english_dev.csv", "dev"),
        ("./donnees/FakeNews_Task3_2022_V0/Task3_Test/English_data_test_release_with_rating.csv", "test"),
    )
)
# Quick sanity check: number of rows in each split.
print(f"Entrainement : {data_train.shape[0]} | Validation : {data_dev.shape[0]} | Test : {data_test.shape[0]}")

In [None]:
def _add_label_columns(frame):
    """Add the model input text and the target columns to `frame`, in place.

    Columns written:
      - "full_text":   title and text joined by a single space.
      - "label$true":  1 where our_rating == "true", else 0.
      - "label$false": 1 where our_rating contains "false", else 0.
      - "label":       3-class target: 0 if our_rating contains "false",
                       1 if our_rating == "true", 2 otherwise.
    """
    # na=False: a missing rating must count as "does not contain 'false'".
    # Without it, str.contains yields NaN for missing values, which np.where
    # treats as truthy and np.select rejects as a non-boolean condition.
    is_false = frame.our_rating.str.contains("false", na=False)
    is_true = frame.our_rating == "true"

    frame["full_text"] = frame.title + " " + frame.text
    frame["label$true"] = np.where(is_true, 1, 0)
    frame["label$false"] = np.where(is_false, 1, 0)
    # The "false" condition is listed first, so it wins if both ever match.
    frame["label"] = np.select([is_false, is_true], [0, 1], 2)


# Same derivation for every split, so the three frames cannot drift apart.
_add_label_columns(data_train)
_add_label_columns(data_dev)
_add_label_columns(data_test)

In [None]:
# Embedding
# Choose GloVe or word2vec as the pre-trained word vectors.
# GloVe (6B tokens, 300-d file) is the active choice; it is loaded through the
# project's GloVeModel wrapper.
glove = GloVeModel("./donnees/glove.6B/glove.6B.300d.txt")
# Alternative (disabled): Google News word2vec vectors loaded with gensim.
# word2Vec = gensim.models.KeyedVectors.load_word2vec_format("./donnees/GoogleNews-vectors-negative300.bin.gz",binary=True)

In [None]:
# Run the project tokeniser on the training texts, passing it the loaded
# GloVe model; the cell output is the number of tokenised items.
tokens_train = tokeniser(data_train["full_text"], glove)
len(tokens_train)

In [None]:
# Run the project tokeniser on the dev (validation) texts, passing it the
# loaded GloVe model; the cell output is the number of tokenised items.
tokens_dev = tokeniser(data_dev["full_text"], glove)
len(tokens_dev)

In [None]:
# Run the project tokeniser on the test texts, passing it the loaded GloVe
# model; the cell output is the number of tokenised items.
tokens_test = tokeniser(data_test["full_text"], glove)
len(tokens_test)

## LSTM-RNN

### Prédire "real" news

In [None]:
# Model for the binary "real news" task, built with the project's LSTM class.
# NOTE(review): the two 300s are presumably the input size (300-d embeddings)
# and the hidden size, and "mps" the torch device name — confirm against
# modeles.LSTM.
lstm_real = LSTM(300, 300, "mps")
# Adam over all of the model's parameters, learning rate 1e-3.
optimizer = optim.Adam(params=lstm_real.parameters(), lr=1e-3)
# Bare expression: display the model as the cell output.
lstm_real

In [None]:
# Binary targets for the "real news" classifier, taken from the "label$true"
# column of the train and dev frames, as float32 tensors.
# The column is converted with .to_numpy() before reaching torch.tensor: a
# pandas Series would otherwise be read through the generic Python-sequence
# path, which is slow and can raise KeyError when the frame's index is not
# the default 0..n-1 range (e.g. after rows were dropped).
cible_train_real = torch.tensor(data_train["label$true"].to_numpy(), dtype=torch.float32)
cible_dev_real = torch.tensor(data_dev["label$true"].to_numpy(), dtype=torch.float32)
# Train on the training split, with the dev split passed alongside.
# NOTE(review): 5 is presumably the number of epochs and "mps" the device —
# confirm against modeles.train_seq_var.
train_seq_var(lstm_real, optimizer, 5, tokens_train, cible_train_real, tokens_dev, cible_dev_real, "mps", verbose=1)

In [None]:
# Predictions of the "real" classifier. Train and dev predictions are
# concatenated (train first) so they can be scored together.
pred_train_dev_real = torch.cat((lstm_real.predict(tokens_train), lstm_real.predict(tokens_dev)))
pred_test_real = lstm_real.predict(tokens_test)
# Matching targets, concatenated in the same train-then-dev order.
cible_train_dev_real = torch.cat((cible_train_real, cible_dev_real))
# .to_numpy() keeps torch.tensor off the slow Python-sequence path that a
# pandas Series triggers, which can raise KeyError on a non-default index.
# NOTE(review): the test target is an int tensor while the train/dev targets
# are float32 — kept as in the original; confirm `evaluation` accepts both.
cible_test_real = torch.tensor(data_test["label$true"].to_numpy(), dtype=torch.int)

In [None]:
# Score the "real" classifier with the project's `evaluation` helper (imported
# from `modeles`): first on the concatenated train + dev split, then on the
# test split. The third argument is the split name handed to the helper.
evaluation(cible_train_dev_real,pred_train_dev_real,"entrainement + dev")
evaluation(cible_test_real,pred_test_real,"test")

### Prédire "fake" news

In [None]:
# Model for the binary "fake news" task, built with the project's LSTM class.
# NOTE(review): the two 300s are presumably the input size (300-d embeddings)
# and the hidden size, and "mps" the torch device name — confirm against
# modeles.LSTM.
lstm_fake = LSTM(300, 300, "mps")
# Adam over all of the model's parameters, learning rate 1e-3.
optimizer = optim.Adam(params=lstm_fake.parameters(), lr=1e-3)
# Bare expression: display the model as the cell output.
lstm_fake

In [None]:
# Binary targets for the "fake news" classifier, taken from the "label$false"
# column of the train and dev frames, as float32 tensors.
# The column is converted with .to_numpy() before reaching torch.tensor: a
# pandas Series would otherwise be read through the generic Python-sequence
# path, which is slow and can raise KeyError when the frame's index is not
# the default 0..n-1 range (e.g. after rows were dropped).
cible_train_fake = torch.tensor(data_train["label$false"].to_numpy(), dtype=torch.float32)
cible_dev_fake = torch.tensor(data_dev["label$false"].to_numpy(), dtype=torch.float32)
# Train on the training split, with the dev split passed alongside.
# NOTE(review): 5 is presumably the number of epochs and "mps" the device —
# confirm against modeles.train_seq_var.
train_seq_var(lstm_fake, optimizer, 5, tokens_train, cible_train_fake, tokens_dev, cible_dev_fake, "mps", verbose=1)

In [None]:
# Predictions of the "fake" classifier. Train and dev predictions are
# concatenated (train first) so they can be scored together.
pred_train_dev_fake = torch.cat((lstm_fake.predict(tokens_train), lstm_fake.predict(tokens_dev)))
pred_test_fake = lstm_fake.predict(tokens_test)
# Matching targets, concatenated in the same train-then-dev order.
cible_train_dev_fake = torch.cat((cible_train_fake, cible_dev_fake))
# .to_numpy() keeps torch.tensor off the slow Python-sequence path that a
# pandas Series triggers, which can raise KeyError on a non-default index.
# NOTE(review): the test target is an int tensor while the train/dev targets
# are float32 — kept as in the original; confirm `evaluation` accepts both.
cible_test_fake = torch.tensor(data_test["label$false"].to_numpy(), dtype=torch.int)

In [None]:
# Score the "fake" classifier with the project's `evaluation` helper (imported
# from `modeles`): first on the concatenated train + dev split, then on the
# test split. The third argument is the split name handed to the helper.
evaluation(cible_train_dev_fake,pred_train_dev_fake,"entrainement + dev")
evaluation(cible_test_fake,pred_test_fake,"test")

### Combinaison

In [None]:
def _combine_predictions(pred_real, pred_fake):
    """Merge the two binary predictions into one 3-class prediction.

    Returns a NumPy array holding, per item:
      - 0 when the "real" model predicts 0 and the "fake" model predicts 1,
      - 1 when the "real" model predicts 1 and the "fake" model predicts 0,
      - 2 otherwise (the two models agree: both 0 or both 1).
    This matches the 0 / 1 / 2 encoding of the "label" column.
    """
    # `&` is the element-wise logical AND of the two boolean masks
    # (equivalent to the `*` used previously, but explicit about intent).
    fake_only = (pred_real == 0) & (pred_fake == 1)
    real_only = (pred_real == 1) & (pred_fake == 0)
    return np.select([fake_only, real_only], [0, 1], 2)


pred_train_dev = _combine_predictions(pred_train_dev_real, pred_train_dev_fake)
pred_test = _combine_predictions(pred_test_real, pred_test_fake)
# 3-class targets, train then dev, in the same order as the predictions.
# .to_numpy() keeps torch.tensor off the slow Python-sequence path that a
# pandas Series triggers, which can raise KeyError on a non-default index.
cible_train_dev = torch.cat((
    torch.tensor(data_train["label"].to_numpy()),
    torch.tensor(data_dev["label"].to_numpy()),
))
cible_test = torch.tensor(data_test["label"].to_numpy())

In [None]:
# Accuracy of the combined 3-class prediction, printed as a percentage with
# two decimals: first for train + dev, then for the test split.
justesse_train_dev = accuracy_score(cible_train_dev, pred_train_dev)
print("Justesse entrainement + dev : {:.2f}%".format(100*justesse_train_dev))
justesse_test = accuracy_score(cible_test, pred_test)
print("Justesse test : {:.2f}%".format(100*justesse_test))