# Modélisation

## Librairies

In [None]:
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split
from ..donnees.nettoyage import load_dataset, clean_dataset, add_columns
from ..donnees.utils import FakeNewsDataset, ajuster_canaux
from ..donnees.embedding import GloVeModel, tokeniser
from .modeles import CNN, CNNRNN, train_seq_fix, evaluation

## Données

In [None]:
# Load each split from CSV, then pass it through clean_dataset and add_columns.
# Training split
data_train = add_columns(
    clean_dataset(
        load_dataset("../donnees/FakeNews_Task3_2022_V0/Task3_english_training.csv")
    )
)
# Development split
data_dev = add_columns(
    clean_dataset(
        load_dataset("../donnees/FakeNews_Task3_2022_V0/Task3_english_dev.csv")
    )
)
# Test split
data_test = add_columns(
    clean_dataset(
        load_dataset("../donnees/FakeNews_Task3_2022_V0/English_data_test_release_with_rating.csv")
    )
)

# Report the row count of each split.
print(f"Entrainement : {data_train.shape[0]} | Dev : {data_dev.shape[0]} | Test : {data_test.shape[0]}")

In [None]:
# Word embedding model built from the GloVe 6B, 100-dimension text file.
glove = GloVeModel(
    "../donnees/glove.6B/glove.6B.100d.txt"
)

In [None]:
# Tokenize the training texts with the GloVe model.
# Per the original note: with pad=True the tokenizer returns a tensor,
# otherwise it returns a list.
tokens_train = tokeniser(
    data_train["full_text"],
    modele=glove,
    pad=True,
)
print(tokens_train.shape)

In [None]:
# Training targets: the "false" column as a float32 tensor.
# The Series is converted to a NumPy array first so torch.tensor receives a
# plain numeric buffer rather than walking the Series as a generic Python
# sequence (slow, and positional lookups can fail when the index is not a
# clean 0..n-1 range after cleaning).
cible_train_fake = torch.tensor(
    data_train["false"].to_numpy(dtype="float32"),
    dtype=torch.float32,
)

In [None]:
# Tokenize the dev texts with the GloVe model.
# Per the original note: with pad=True the tokenizer returns a tensor,
# otherwise it returns a list.
tokens_dev = tokeniser(
    data_dev["full_text"],
    modele=glove,
    pad=True,
)
print(tokens_dev.shape)

In [None]:
# Dev targets: the "false" column as a float32 tensor.
# The Series is converted to a NumPy array first so torch.tensor receives a
# plain numeric buffer rather than walking the Series as a generic Python
# sequence (slow, and positional lookups can fail when the index is not a
# clean 0..n-1 range after cleaning).
cible_dev_fake = torch.tensor(
    data_dev["false"].to_numpy(dtype="float32"),
    dtype=torch.float32,
)

In [None]:
# Tokenize the test texts with the GloVe model.
# Per the original note: with pad=True the tokenizer returns a tensor,
# otherwise it returns a list.
tokens_test = tokeniser(
    data_test["full_text"],
    modele=glove,
    pad=True,
)
print(tokens_test.shape)

In [None]:
# Test targets: the "false" column as a float32 tensor.
# The Series is converted to a NumPy array first so torch.tensor receives a
# plain numeric buffer rather than walking the Series as a generic Python
# sequence (slow, and positional lookups can fail when the index is not a
# clean 0..n-1 range after cleaning).
cible_test_fake = torch.tensor(
    data_test["false"].to_numpy(dtype="float32"),
    dtype=torch.float32,
)

In [None]:
# Hold-out split of the training data: 80 % sub-training / 20 % validation,
# stratified on the target, with a fixed random seed.
(
    tokens_subtrain,
    tokens_valid,
    cible_subtrain_fake,
    cible_valid_fake,
) = train_test_split(
    tokens_train,
    cible_train_fake,
    test_size=0.2,
    random_state=42,
    stratify=cible_train_fake,
)
print("Sous-entrainement :", tokens_subtrain.shape)
print("Validation :", tokens_valid.shape)

In [None]:
# Build the datasets fed to the models. Every token tensor goes through
# ajuster_canaux with the same max_mots before being paired with its targets.
# CAUTION: max_mots is the largest second dimension among the three token
# tensors and can be very large; it may be replaced by a hand-picked value.
max_mots = max(
    jetons.shape[1] for jetons in (tokens_train, tokens_dev, tokens_test)
)
dataset_train_fake = FakeNewsDataset(
    ajuster_canaux(tokens_train, max_mots), cible_train_fake
)
dataset_subtrain_fake = FakeNewsDataset(
    ajuster_canaux(tokens_subtrain, max_mots), cible_subtrain_fake
)
dataset_valid_fake = FakeNewsDataset(
    ajuster_canaux(tokens_valid, max_mots), cible_valid_fake
)
dataset_dev_fake = FakeNewsDataset(
    ajuster_canaux(tokens_dev, max_mots), cible_dev_fake
)
dataset_test_fake = FakeNewsDataset(
    ajuster_canaux(tokens_test, max_mots), cible_test_fake
)
# Display the chosen value as the cell output.
max_mots

## Hybride CNN-RNN 
Pour séquences de texte de longueur fixe.  
Source : https://www.sciencedirect.com/science/article/pii/S2667096820300070

In [None]:
# Instantiate the hybrid CNN-RNN classifier and its Adam optimizer.
# NOTE(review): the device is hard-coded to "mps" — confirm that backend is
# available before running on another machine.
hybrid_fake = CNNRNN(
    input_size=100,  # presumably the GloVe vector size (100d file) — confirm
    in_channels=max_mots,
    out_channels1=1024,
    out_channels2=256,
    kernel_size=5,
    hidden_size=50,
    p_dropout=0.5,
    device="mps",
)
optimizer = optim.Adam(hybrid_fake.parameters(), lr=1e-4)
# Display the model as the cell output.
hybrid_fake

In [None]:
# Train the hybrid model on the sub-training set, passing the validation set
# alongside it (10 epochs max, batch size 1, shuffling enabled).
train_seq_fix(
    hybrid_fake,
    optimizer,
    max_epochs=10,
    Xy_train=dataset_subtrain_fake,
    Xy_val=dataset_valid_fake,
    taille_batch=1,
    melanger=True,
    device="mps",
    verbose=1,
)

In [None]:
# Switch the hybrid model to evaluation mode, then predict on the complete
# training set and on the dev set (in that order).
hybrid_fake.eval()
pred_train_fake, pred_dev_fake = (
    hybrid_fake.predict(jeu.X)
    for jeu in (dataset_train_fake, dataset_dev_fake)
)

In [None]:
# Compare targets and hybrid predictions on each split; the third argument
# tags the call with the split name.
evaluation(
    cible_train_fake,
    pred_train_fake,
    "entrainement",
)
evaluation(
    cible_dev_fake,
    pred_dev_fake,
    "dev",
)

In [None]:
# Persist the hybrid model's state dict to disk.
torch.save(
    hybrid_fake.state_dict(),
    "../modeles/hybrid_fake.pth",
)

In [None]:
# Store the hybrid model's scores and predictions as new columns on each
# split's table. All scores are computed first, then all predictions.
hybrid_fake.eval()
_paires = (
    (data_train, dataset_train_fake),
    (data_dev, dataset_dev_fake),
    (data_test, dataset_test_fake),
)
for _table, _jeu in _paires:
    _table["hybrid_score"] = hybrid_fake.predict_proba(_jeu.X)
for _table, _jeu in _paires:
    _table["hybrid_pred"] = hybrid_fake.predict(_jeu.X)
# Drop the temporaries so the notebook namespace is left unchanged.
del _paires, _table, _jeu

## CNN
Pour séquences de texte de longueur fixe. 

In [None]:
# Instantiate the CNN classifier and its Adam optimizer.
# NOTE(review): the device is hard-coded to "mps" — confirm that backend is
# available before running on another machine.
cnn_fake = CNN(
    input_size=100,  # presumably the GloVe vector size (100d file) — confirm
    in_channels=max_mots,
    out_channels1=512,
    out_channels2=128,
    out_channels3=32,
    out_channels4=8,
    kernel_size=3,
    p_dropout=(0.2, 0.5),
    device="mps",
)
optimizer = optim.Adam(cnn_fake.parameters(), lr=1e-4)
# Display the model as the cell output.
cnn_fake

In [None]:
# Train the CNN on the sub-training set, passing the validation set alongside
# it (15 epochs max, batch size 10, shuffling enabled).
train_seq_fix(
    cnn_fake,
    optimizer,
    max_epochs=15,
    Xy_train=dataset_subtrain_fake,
    Xy_val=dataset_valid_fake,
    taille_batch=10,
    melanger=True,
    device="mps",
    verbose=1,
)

In [None]:
# Switch the CNN to evaluation mode, then predict on the complete training
# set and on the dev set (in that order).
cnn_fake.eval()
pred_train_fake, pred_dev_fake = (
    cnn_fake.predict(jeu.X)
    for jeu in (dataset_train_fake, dataset_dev_fake)
)

In [None]:
# Compare targets and CNN predictions on each split; the third argument tags
# the call with the split name.
evaluation(
    cible_train_fake,
    pred_train_fake,
    "entrainement",
)
evaluation(
    cible_dev_fake,
    pred_dev_fake,
    "dev",
)

In [None]:
# Persist the CNN's state dict to disk.
torch.save(
    cnn_fake.state_dict(),
    "../modeles/cnn_fake.pth",
)

In [None]:
# Store the CNN's scores and predictions as new columns on each split's
# table. All scores are computed first, then all predictions.
cnn_fake.eval()
_paires = (
    (data_train, dataset_train_fake),
    (data_dev, dataset_dev_fake),
    (data_test, dataset_test_fake),
)
for _table, _jeu in _paires:
    _table["cnn_score"] = cnn_fake.predict_proba(_jeu.X)
for _table, _jeu in _paires:
    _table["cnn_pred"] = cnn_fake.predict(_jeu.X)
# Drop the temporaries so the notebook namespace is left unchanged.
del _paires, _table, _jeu

## FFNN
Pour données au niveau "phrases". 

### Données

In [None]:
import pandas as pd
from ..donnees.embedding import TfIdf
from .modeles import FFNN

In [None]:
# Tf-Idf embedding built from the training and dev texts concatenated.
# NOTE(review): the dev texts take part in building the vectorizer even though
# dev is later used for evaluation — confirm this is intended.
modele = TfIdf(
    pd.concat((data_train.full_text, data_dev.full_text)),
    max_df=0.90,
    min_df=0.1,
)
# Read the column count straight from the matrix's shape rather than
# densifying the whole matrix with .toarray() just to inspect one dimension.
# Assumes modele.X is a SciPy-style sparse matrix (it exposes .toarray()).
vocab_size = modele.X.shape[1]
print("Taille du vocabulaire :", vocab_size)

In [None]:
# Rebuild features, targets and datasets for the FFNN from the Tf-Idf model.
# Features: the embedding of each split's texts, densified and cast to float32.
# Targets: the "false" column, converted to a NumPy array before reaching
# torch.tensor so that torch receives a plain numeric buffer instead of
# walking the Series as a generic Python sequence (slow, and positional
# lookups can fail when the index is not a clean 0..n-1 range).
# Training
tokens_train = torch.from_numpy(modele.embedding_newdata(data_train.full_text).toarray()).type(torch.float32).requires_grad_(False)
cible_train_fake = torch.tensor(data_train["false"].to_numpy(dtype="float32"), dtype=torch.float32)
dataset_train_fake = FakeNewsDataset(tokens_train, cible_train_fake)
# Dev
tokens_dev = torch.from_numpy(modele.embedding_newdata(data_dev.full_text).toarray()).type(torch.float32).requires_grad_(False)
cible_dev_fake = torch.tensor(data_dev["false"].to_numpy(dtype="float32"), dtype=torch.float32)
dataset_dev_fake = FakeNewsDataset(tokens_dev, cible_dev_fake)
# Test
tokens_test = torch.from_numpy(modele.embedding_newdata(data_test.full_text).toarray()).type(torch.float32).requires_grad_(False)
cible_test_fake = torch.tensor(data_test["false"].to_numpy(dtype="float32"), dtype=torch.float32)
dataset_test_fake = FakeNewsDataset(tokens_test, cible_test_fake)
# Report the feature-matrix shape of each split.
print("Entrainement :", tokens_train.shape)
print("Dev :", tokens_dev.shape)
print("Test :", tokens_test.shape)

In [None]:
# Hold-out split of the Tf-Idf training features: 80 % sub-training / 20 %
# validation, stratified on the target, with a fixed random seed.
(
    tokens_subtrain,
    tokens_valid,
    cible_subtrain_fake,
    cible_valid_fake,
) = train_test_split(
    tokens_train,
    cible_train_fake,
    test_size=0.2,
    random_state=42,
    stratify=cible_train_fake,
)
print("Sous-entrainement :", tokens_subtrain.shape)
print("Validation :", tokens_valid.shape)
# Pair each part with its targets for training.
dataset_subtrain_fake = FakeNewsDataset(
    tokens_subtrain,
    cible_subtrain_fake,
)
dataset_valid_fake = FakeNewsDataset(
    tokens_valid,
    cible_valid_fake,
)

### Modèle

In [None]:
# Instantiate the feed-forward classifier, sized on the Tf-Idf vocabulary,
# and its Adam optimizer.
# NOTE(review): the device is hard-coded to "mps" — confirm that backend is
# available before running on another machine.
ffnn_fake = FFNN(
    input_size=vocab_size,
    in_size=1024,
    hidden_size1=256,
    hidden_size2=64,
    hidden_size3=16,
    p_dropout=0.2,
    device="mps",
)
optimizer = optim.Adam(ffnn_fake.parameters(), lr=1e-4)
# Display the model as the cell output.
ffnn_fake

In [None]:
# Train the FFNN on the sub-training set, passing the validation set
# alongside it (6 epochs max, batch size 1, shuffling enabled).
train_seq_fix(
    ffnn_fake,
    optimizer,
    max_epochs=6,
    Xy_train=dataset_subtrain_fake,
    Xy_val=dataset_valid_fake,
    taille_batch=1,
    melanger=True,
    device="mps",
    verbose=1,
)

In [None]:
# Switch the FFNN to evaluation mode, then predict on the complete training
# set and on the dev set (in that order).
ffnn_fake.eval()
pred_train_fake, pred_dev_fake = (
    ffnn_fake.predict(jeu.X)
    for jeu in (dataset_train_fake, dataset_dev_fake)
)

In [None]:
# Compare targets and FFNN predictions on each split; the third argument tags
# the call with the split name.
evaluation(
    cible_train_fake,
    pred_train_fake,
    "entrainement",
)
evaluation(
    cible_dev_fake,
    pred_dev_fake,
    "dev",
)

In [None]:
# Persist the FFNN's state dict to disk.
torch.save(
    ffnn_fake.state_dict(),
    "../modeles/ffnn_fake.pth",
)

In [None]:
# Store the FFNN's scores and predictions as new columns on each split's
# table. All scores are computed first, then all predictions.
ffnn_fake.eval()
_paires = (
    (data_train, tokens_train),
    (data_dev, tokens_dev),
    (data_test, tokens_test),
)
for _table, _jetons in _paires:
    _table["ffnn_score"] = ffnn_fake.predict_proba(_jetons)
for _table, _jetons in _paires:
    _table["ffnn_pred"] = ffnn_fake.predict(_jetons)
# Drop the temporaries so the notebook namespace is left unchanged.
del _paires, _table, _jetons

## Combinaison

In [None]:
import numpy as np

In [None]:
# Ensemble score: row-wise mean of the three models' scores, per split.
data_train["agg_score"] = data_train[
    ["hybrid_score", "cnn_score", "ffnn_score"]
].mean(axis=1)
data_dev["agg_score"] = data_dev[
    ["hybrid_score", "cnn_score", "ffnn_score"]
].mean(axis=1)
data_test["agg_score"] = data_test[
    ["hybrid_score", "cnn_score", "ffnn_score"]
].mean(axis=1)

In [None]:
# Ensemble prediction: 1 when the aggregated score is strictly above 0.5,
# otherwise 0.
data_train["agg_pred"] = (data_train["agg_score"] > 0.5).astype(int)
data_dev["agg_pred"] = (data_dev["agg_score"] > 0.5).astype(int)
data_test["agg_pred"] = (data_test["agg_score"] > 0.5).astype(int)

In [None]:
# Compare the "false" column with the ensemble predictions on all three
# splits; the third argument tags the call with the split name.
evaluation(
    data_train["false"],
    data_train["agg_pred"],
    "train",
)
evaluation(
    data_dev["false"],
    data_dev["agg_pred"],
    "dev",
)
evaluation(
    data_test["false"],
    data_test["agg_pred"],
    "test",
)

In [None]:
# Encode the textual "our rating" column as an integer "label" column on each
# split, using the same rating-to-code mapping everywhere.
for _table in (data_train, data_dev, data_test):
    _table["label"] = _table["our rating"].map(
        {"false": 0, "other": 1, "partially false": 2, "true": 3}
    )
# Drop the loop temporary so the notebook namespace is left unchanged.
del _table

In [None]:
# Export, for each split, the aggregated score (renamed "score") and the
# label to a CSV file without the index.
for _table, _chemin in (
    (data_train, "../donnees/resultats/train_scores_false.csv"),
    (data_dev, "../donnees/resultats/dev_scores_false.csv"),
    (data_test, "../donnees/resultats/test_scores_false.csv"),
):
    _export = _table[["agg_score", "label"]].rename(columns={"agg_score": "score"})
    _export.to_csv(_chemin, index=False)
# Drop the temporaries so the notebook namespace is left unchanged.
del _table, _chemin, _export