## Modèles et Expérimentations 

In [None]:
pip install fasttext-wheel

In [2]:
##############
#  Packages  #
##############
import os
import sys
import pickle
import torch
import warnings
import fasttext
import multiprocessing
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from typing import Dict, List, Tuple, Any, Optional
from transformers import (
    AdamW,
    DistilBertForSequenceClassification,
    CamembertForSequenceClassification,
    DistilBertTokenizerFast,
    AutoTokenizer
)

warnings.filterwarnings("ignore")
############
#  paths  #
############

root_path = Path(os.getcwd()).parents[0]
print(root_path)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')


################
#  data paths  #
################

data_path = root_path.joinpath("data")
# pretrained_vec_path = data_path.joinpath("cc.fr.300.bin.gz")

###########
#  utils  #
###########

def is_test_df(name):
    return name.split("_")[-1] == "test"

def get_train_name(test_name):
    return test_name.replace("test", "train")

C:\Users\samud\Bureau\Python code\MVA\NLP\repo\intro_NLP_projet
NVIDIA GeForce GTX 1070 with Max-Q Design
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


### Data loading

Suite aux travaux présents dans **1_Preprocessing** on dispose de 8 bases train, test avec ou sans preprocessing, gr (Humains) ou pr (OCR).

In [3]:
with open(data_path.joinpath("data_dict.pkl"), 'rb') as file:
    data_dict = pickle.load(file)

for name, df in data_dict.items():
    print(f"DataFrame {name} : shape = {df.shape} \n columns = {df.columns}\n")

DataFrame gr_raw_train : shape = (96, 3) 
 columns = Index(['sex', 'text', 'feminite_nom'], dtype='object')

DataFrame gr_raw_test : shape = (145, 3) 
 columns = Index(['sex', 'text', 'feminite_nom'], dtype='object')

DataFrame pr_raw_train : shape = (96, 3) 
 columns = Index(['sex', 'text', 'feminite_nom'], dtype='object')

DataFrame pr_raw_test : shape = (145, 3) 
 columns = Index(['sex', 'text', 'feminite_nom'], dtype='object')

DataFrame gr_proc_train : shape = (96, 6) 
 columns = Index(['sex', 'nom', 'metier', 'lien_famille', 'civilite', 'feminite_nom'], dtype='object')

DataFrame gr_proc_test : shape = (145, 6) 
 columns = Index(['sex', 'nom', 'metier', 'lien_famille', 'civilite', 'feminite_nom'], dtype='object')

DataFrame pr_proc_train : shape = (96, 6) 
 columns = Index(['sex', 'nom', 'metier', 'lien_famille', 'civilite', 'feminite_nom'], dtype='object')

DataFrame pr_proc_test : shape = (145, 6) 
 columns = Index(['sex', 'nom', 'metier', 'lien_famille', 'civilite', 'feminite_

## 1) un classifier statistique

Ici on test notre score de feminite du noms issue de l'information statistique de **firstname_with_sex.csv**:

In [12]:
###########
#  utils  #
###########

def stat_pred(df, thrs = 0):
    scores = df["feminite_nom"]
    pred = np.where( scores >=thrs, "femme", "homme")
    pred[np.abs(scores)<0.1] = "ambigu"
    return pred

############
#  script  #
############

for name, df in data_dict.items():
    if is_test_df(name):
        y_pred = stat_pred(df, thrs = 0)
        y_true = df["sex"]
        print(f"Stat classifier on {name} : Accuracy = ", accuracy_score(y_true, y_pred))

Stat classifier on gr_raw_test : Accuracy =  0.9724137931034482
Stat classifier on pr_raw_test : Accuracy =  0.9379310344827586
Stat classifier on gr_proc_test : Accuracy =  0.9724137931034482
Stat classifier on pr_proc_test : Accuracy =  0.9379310344827586


Avec des bons features on est déjà à **97% d'accuracy** ! La moins bonne prédiction pour pr s'explique par le bruit des données et l'imperfection de la distance de Levenshtein comme mesure de similarité des noms. Pour combiner le classifier statistique aux modèles suivants, on ajoute les prédictions de se dernier dans la colonne **genre\_nom**. Les résultats peuvent être différents entre raw et proc car les splits le sont.

In [15]:
#####################
#  ajout genre_nom  #
#####################
for name, df in data_dict.items():
    df["genre_nom"] = stat_pred(df, thrs = 0)
    print(f"DataFrame {name} : shape = {df.shape} \n columns = {df.columns}\n")

DataFrame gr_raw_train : shape = (96, 4) 
 columns = Index(['sex', 'text', 'feminite_nom', 'genre_nom'], dtype='object')

DataFrame gr_raw_test : shape = (145, 4) 
 columns = Index(['sex', 'text', 'feminite_nom', 'genre_nom'], dtype='object')

DataFrame pr_raw_train : shape = (96, 4) 
 columns = Index(['sex', 'text', 'feminite_nom', 'genre_nom'], dtype='object')

DataFrame pr_raw_test : shape = (145, 4) 
 columns = Index(['sex', 'text', 'feminite_nom', 'genre_nom'], dtype='object')

DataFrame gr_proc_train : shape = (96, 7) 
 columns = Index(['sex', 'nom', 'metier', 'lien_famille', 'civilite', 'feminite_nom',
       'genre_nom'],
      dtype='object')

DataFrame gr_proc_test : shape = (145, 7) 
 columns = Index(['sex', 'nom', 'metier', 'lien_famille', 'civilite', 'feminite_nom',
       'genre_nom'],
      dtype='object')

DataFrame pr_proc_train : shape = (96, 7) 
 columns = Index(['sex', 'nom', 'metier', 'lien_famille', 'civilite', 'feminite_nom',
       'genre_nom'],
      dtype='obj

## 2) Fasttext 

In [6]:
############
#  Inputs  #
############

def _join_inputs(
        df: pd.DataFrame, 
        cat_ft: List[str], 
        txt_ft: List[str], 
        sep_cat: str, 
        sep_txt: str
    ) -> pd.DataFrame:
        """
        Joins categorical and textual features in a DataFrame into a single column.
    
        Parameters:
        - df (pd.DataFrame): The input DataFrame containing the features.
        - cat_ft (List[str]): A list of column names corresponding to the categorical features.
        - txt_ft (List[str]): A list of column names corresponding to the textual features.
        - sep_cat (str): The separator to use between categories when joining.
        - sep_txt (str): The separator to use between words within each textual feature when joining.
    
        Returns:
        - pd.DataFrame: The DataFrame with a new column "joined_inputs" containing the joined inputs.
        """
        def join_from_dct(dct, sep):
            return " ".join([f"{k}{sep}{v}" for k, v in dct.items()])

        join_from_dct = np.vectorize(join_from_dct)

        def join_from_sub_df(lst, sep):
            return join_from_dct(df[lst].to_dict("records"), sep)

        add_space = lambda s: s + " "

        txt_arr = join_from_sub_df(txt_ft, sep_txt)
        txt_arr = np.vectorize(add_space)(txt_arr)

        if len(cat_ft) > 0 :
            cat_arr = join_from_sub_df(cat_ft, sep_cat)
            df["joined_inputs"] = np.char.add(txt_arr, cat_arr)
        else:
            df["joined_inputs"] = txt_arr

        return df

def _join_data(
        df: pd.DataFrame, 
        y_vars: List[str], 
        label_code: str = "__label__"
    ) -> pd.DataFrame:
        """
        Joins the output labels and input features into a single column in a DataFrame.
    
        Parameters:
        - df (pd.DataFrame): The input DataFrame containing the features and output labels.
        - y_vars (List[str]): A list of column names corresponding to the output labels.
        - label_code (str): The prefix code to use for each output label.
    
        Returns:
        - pd.DataFrame: The DataFrame with a new column "joined_data" containing the joined outputs and inputs.
        """
        def aux(dct):
            return " ".join([f"{label_code}{v}" for k, v in dct.items()])

        aux = np.vectorize(aux)

        df["joined_outputs"] = aux(df[y_vars].to_dict("records"))
        df["joined_data"] = df[["joined_outputs", "joined_inputs"]].agg(
            " ".join, axis=1
        )
        df["joined_data"] = df["joined_data"].str.replace("txt:", "")
        df = df.drop(columns=["joined_outputs"])
        return df

def _save_fasttext_inputs_file(df, path):
        df["joined_data"].to_csv(
            path,
            index=False,
            header=False,
            sep=";",
        )
        # print(f"Saving {path}")

def fasttext_transform(
    df0: pd.DataFrame, 
    filename: str, 
    cat_fts: List[str], 
    txt_fts: List[str]
) -> pd.DataFrame:
    """
    Transforms the DataFrame into a format suitable for training with FastText.

    This function preprocesses categorical and text features, joins them together, 
    and saves the resulting data to a FastText-compatible file.

    Parameters:
    - df0 (pd.DataFrame): The input DataFrame containing the features and output labels.
    - filename (str): The filename to save the FastText-compatible file.
    - cat_fts (List[str]): A list of column names corresponding to the categorical features.
    - txt_fts (List[str]): A list of column names corresponding to the text features.

    Returns:
    - pd.DataFrame: The DataFrame with transformed features and labels.
    """
    df = df0.copy()
    for cat_ft in cat_fts:
        df[cat_ft] = df[cat_ft].astype("str").fillna("")
    
    df = _join_inputs(
            df,
            cat_fts,
            txt_fts,
            ":",
            ":",
        )
    df = _join_data(
        df,
        ["sex"],
    )
    
    if filename != "":
        _save_fasttext_inputs_file(
            df, data_path.joinpath( f"{filename}.txt")
        )
    return df

#############
#  Outputs  #
#############

def fasttext_fit(
        kwargs,
        pretrained_vec_path: str = "", 
    ):
        kwargs["thread"] = int(multiprocessing.cpu_count() / 3)
        # print("Number of cores", kwargs["thread"])
        if pretrained_vec_path != "":
            kwargs["pretrainedVectors"] = pretrained_vec_path
            print("Pretrained")
        return fasttext.train_supervised(**kwargs)


def fasttext_predict(df, model_trained):
        label_code = "__label__"
        outputs = model_trained.predict(list(df["joined_inputs"]), k=1)

        def clean_outputs(k, f):
            name = ["model_codes", "model_values"][k]
            df[name] = outputs[k]
            df[name] = df[name].apply(np.vectorize(f))

        clean_outputs(0, lambda s: s.replace(label_code, ""))
        clean_outputs(1, float)
        df["model_codes"] = df["model_codes"].apply(lambda lst: lst[0])

###############
#  Pipelines  #
###############

def fast_text_pipeline(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    cat_fts: List[str], 
    txt_fts: List[str], 
    filename: str, 
    kwargs: dict, 
    pretrained_vec_path: str = ""
) -> None:
    """
    Pipeline for training and evaluating a FastText model.

    This function preprocesses the training and testing DataFrames, trains a FastText model 
    on the training data, and evaluates the model on the testing data.

    Parameters:
    - df_train (pd.DataFrame): The training DataFrame.
    - df_test (pd.DataFrame): The testing DataFrame.
    - cat_fts (List[str]): A list of column names corresponding to the categorical features.
    - txt_fts (List[str]): A list of column names corresponding to the text features.
    - filename (str): The filename to save the FastText-compatible file during preprocessing.
    - kwargs (dict): Additional keyword arguments to be passed to the `fasttext_fit` function.
    - pretrained_vec_path (str): Path to the pretrained word vectors file (optional).

    Returns:
    - None
    """
    # on selectionne en fonction du cas proc ou raw 
    vars = set(df_train.columns)
    cat_fts = list(set(cat_fts) & vars)
    txt_fts = list(set(txt_fts) & vars)

    # preprocessing avec sauvegarde
    df_train_fasttext = fasttext_transform(df_train, filename, cat_fts, txt_fts)
    df_test_fasttext = fasttext_transform(df_test, "", cat_fts, txt_fts)
    
    y_true = df_test_fasttext["sex"]
    fasttext_model = fasttext_fit(
        kwargs,
        pretrained_vec_path = pretrained_vec_path, 
    )
    fasttext_predict(df_test_fasttext, fasttext_model)
    score = accuracy_score(y_true, df_test_fasttext["model_codes"])
    print(f"Accuracy test {filename}: {score:2f} \n")

def fast_text_iter_pipeline(
    data_dict: dict,
    cat_fts: List[str], 
    txt_fts: List[str], 
    train_id: str, 
    kwargs: dict, 
    pretrained_vec_path: str = ""
) -> None:
    """
    Iteratively performs FastText pipeline on multiple datasets.

    This function iterates over the datasets provided in the `data_dict`, preprocesses 
    each dataset, and trains and evaluates a FastText model on it using the `fast_text_pipeline` function.

    Parameters:
    - data_dict (dict): A dictionary containing dataset names as keys and corresponding DataFrames as values.
    - cat_fts (List[str]): A list of column names corresponding to the categorical features.
    - txt_fts (List[str]): A list of column names corresponding to the text features.
    - train_id (str): The identifier for the training dataset in the `data_dict`.
    - kwargs (dict): Additional keyword arguments to be passed to the `fast_text_pipeline` function.
    - pretrained_vec_path (str): Path to the pretrained word vectors file (optional).

    Returns:
    - None
    """
    for name, df in data_dict.items():
        if is_test_df(name):
            pref_name = "_".join(name.split("_")[:-1])
            filename = f"{pref_name}_{train_id}"
            kwargs["input"] = str(data_path.joinpath( f"{filename}.txt"))
            df_train = data_dict[f"{pref_name}_train"]
            fast_text_pipeline(
                df_train,
                df,
                cat_fts, 
                txt_fts, 
                filename, 
                kwargs, 
                pretrained_vec_path=""
            )

Après un peu de fine tuning, on fixe les paramètres d'entrainements identiques pour toutes les comparaisons. 

In [107]:
kwargs = {
        "epoch":200, 
        "lr":0.04, 
        "dim":100,
        "wordNgrams":3,
        }

selection_to_test = [
    (["feminite_nom"], ["genre_nom", "nom", "metier", "lien_famille", "civilite","text"]),
    (["feminite_nom"], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "text"]),
    ([], ["nom",  "lien_famille", "civilite","text"]),
    ([], ["nom",  "lien_famille", "civilite","text"]),
    ([], ["nom",  "civilite","text"]),
    ([], ["nom",  "lien_famille", "text"]),
    ([], ["nom",  "text"]),
]

i=0
for cat_fts, txt_fts in selection_to_test:
    print("\n==========",cat_fts+txt_fts, "==========\n")
    fast_text_iter_pipeline(
        data_dict,
        cat_fts, 
        txt_fts, 
        str(i), 
        kwargs, 
        pretrained_vec_path=""
    )
    i+=1



Accuracy test gr_raw_0: 0.972414 

Accuracy test pr_raw_0: 0.937931 

Accuracy test gr_proc_0: 0.965517 

Accuracy test pr_proc_0: 0.937931 



Accuracy test gr_raw_1: 0.855172 

Accuracy test pr_raw_1: 0.524138 

Accuracy test gr_proc_1: 0.896552 

Accuracy test pr_proc_1: 0.834483 



Accuracy test gr_raw_2: 0.862069 

Accuracy test pr_raw_2: 0.593103 

Accuracy test gr_proc_2: 0.903448 

Accuracy test pr_proc_2: 0.793103 



Accuracy test gr_raw_3: 0.862069 

Accuracy test pr_raw_3: 0.593103 

Accuracy test gr_proc_3: 0.889655 

Accuracy test pr_proc_3: 0.772414 



Accuracy test gr_raw_4: 0.862069 

Accuracy test pr_raw_4: 0.593103 

Accuracy test gr_proc_4: 0.903448 

Accuracy test pr_proc_4: 0.827586 



Accuracy test gr_raw_5: 0.862069 

Accuracy test pr_raw_5: 0.593103 

Accuracy test gr_proc_5: 0.903448 

Accuracy test pr_proc_5: 0.827586 



Accuracy test gr_raw_6: 0.862069 

Accuracy test pr_raw_6: 0.593103 

Accuracy test gr_proc_6: 0.827586 

Accuracy test pr_proc_6: 0.8

### Transfert learning 

In [108]:
kwargs = {
        "epoch":200, 
        "lr":0.03, 
        "dim":5,
        "wordNgrams":3,
        }

selection_to_test = [
    (["feminite_nom"], ["genre_nom", "nom", "metier", "lien_famille", "civilite","text"]),
    (["feminite_nom"], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "text"]),
    ([], ["nom",  "lien_famille", "civilite","text"]),
    ([], ["nom",  "lien_famille", "civilite","text"]),
    ([], ["nom",  "civilite","text"]),
    ([], ["nom",  "lien_famille", "text"]),
    ([], ["nom",  "text"]),
]

i=0
for cat_fts, txt_fts in selection_to_test:
    print("\n==========",cat_fts+txt_fts, "==========\n")
    fast_text_iter_pipeline(
        data_dict,
        cat_fts, 
        txt_fts, 
        "p"+str(i), 
        kwargs, 
        pretrained_vec_path=pretrained_vec_path
    )
    i+=1



Accuracy test gr_raw_p0: 0.972414 

Accuracy test pr_raw_p0: 0.937931 

Accuracy test gr_proc_p0: 0.965517 

Accuracy test pr_proc_p0: 0.937931 



Accuracy test gr_raw_p1: 0.813793 

Accuracy test pr_raw_p1: 0.524138 

Accuracy test gr_proc_p1: 0.917241 

Accuracy test pr_proc_p1: 0.827586 



Accuracy test gr_raw_p2: 0.682759 

Accuracy test pr_raw_p2: 0.641379 

Accuracy test gr_proc_p2: 0.903448 

Accuracy test pr_proc_p2: 0.793103 



Accuracy test gr_raw_p3: 0.682759 

Accuracy test pr_raw_p3: 0.641379 

Accuracy test gr_proc_p3: 0.889655 

Accuracy test pr_proc_p3: 0.765517 



Accuracy test gr_raw_p4: 0.675862 

Accuracy test pr_raw_p4: 0.641379 

Accuracy test gr_proc_p4: 0.903448 

Accuracy test pr_proc_p4: 0.813793 



Accuracy test gr_raw_p5: 0.675862 

Accuracy test pr_raw_p5: 0.641379 

Accuracy test gr_proc_p5: 0.903448 

Accuracy test pr_proc_p5: 0.813793 



Accuracy test gr_raw_p6: 0.682759 

Accuracy test pr_raw_p6: 0.641379 

Accuracy test gr_proc_p6: 0.841379 

A

### 3) Transformers

On teste ici 2 architectures préentrainées de type transformers. La première est DistilBERT une version synthétique de BERT et l'autre est CamemBERT une version française de RoBERTa qui est lui même une version plus robuste de BERT.

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)

In [8]:
############
#  Inputs  #
############

def _join_inputs(
        df: pd.DataFrame, 
        cat_ft: List[str], 
        txt_ft: List[str], 
        sep_cat: str, 
        sep_txt: str
    ) -> pd.DataFrame:
        """
        Joins categorical and textual features in a DataFrame into a single column.
    
        Parameters:
        - df (pd.DataFrame): The input DataFrame containing the features.
        - cat_ft (List[str]): A list of column names corresponding to the categorical features.
        - txt_ft (List[str]): A list of column names corresponding to the textual features.
        - sep_cat (str): The separator to use between categories when joining.
        - sep_txt (str): The separator to use between words within each textual feature when joining.
    
        Returns:
        - pd.DataFrame: The DataFrame with a new column "joined_inputs" containing the joined inputs.
        """
        def join_from_dct(dct, sep):
            return " ".join([f"{k}{sep}{v}" for k, v in dct.items()])

        join_from_dct = np.vectorize(join_from_dct)

        def join_from_sub_df(lst, sep):
            return join_from_dct(df[lst].to_dict("records"), sep)

        add_space = lambda s: s + " "

        txt_arr = join_from_sub_df(txt_ft, sep_txt)
        txt_arr = np.vectorize(add_space)(txt_arr)

        if len(cat_ft) > 0 :
            cat_arr = join_from_sub_df(cat_ft, sep_cat)
            df["joined_inputs"] = np.char.add(txt_arr, cat_arr)
        else:
            df["joined_inputs"] = txt_arr

        return df


def torch_transform(
    df0: pd.DataFrame,
    cat_fts: List[str],
    txt_fts: List[str]
) -> pd.DataFrame:
    """
    Transforms the input DataFrame into a format suitable for PyTorch training.

    This function preprocesses the input DataFrame by converting categorical features
    into string format and filling missing values. It then joins the input features
    into a single text representation, and creates a new DataFrame containing the target 
    variable ('sex') encoded as binary ('sex_bin') and the joined input text ('text').

    Parameters:
    - df0 (pd.DataFrame): The input DataFrame containing the original data.
    - cat_fts (List[str]): A list of column names corresponding to the categorical features.
    - txt_fts (List[str]): A list of column names corresponding to the text features.

    Returns:
    - pd.DataFrame: A DataFrame containing the target variable encoded as binary ('sex_bin') 
      and the joined input text ('text').
    """
    df = df0.copy()
    for cat_ft in cat_fts:
        df[cat_ft] = df[cat_ft].astype("str").fillna("")
    df = _join_inputs(
            df,
            cat_fts,
            txt_fts,
            ":",
            ":",
        )

    df2 = pd.DataFrame({"sex":  df["sex"]})
    df2["sex_bin"] = (df["sex"]=="femme").astype("int")
    df2["text"] = df["joined_inputs"]
    return df2

class GenderDataset(Dataset):
    """
    Dataset class for gender classification.

    This class preprocesses the input DataFrame, tokenizes the text features using the provided tokenizer,
    encodes the input data, and creates a dataset suitable for PyTorch training.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the original data.
    - tokenizer (PreTrainedTokenizer): The tokenizer object used to tokenize the text features.
    - cat_fts (List[str]): A list of column names corresponding to the categorical features.
    - txt_fts (List[str]): A list of column names corresponding to the text features.

    Methods:
    - __getitem__(self, idx): Gets the item at the specified index.
    - __len__(self): Returns the length of the dataset.
    """
    def __init__(self, df, tokenizer, cat_fts, txt_fts):
        vars = set(df.columns)
        cat_fts = list(set(cat_fts) & vars)
        txt_fts = list(set(txt_fts) & vars)
        df2 = torch_transform(df, cat_fts, txt_fts)
        self.encodings = tokenizer(df2["text"].to_list(), padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        self.labels = torch.tensor(df2["sex_bin"].to_list())

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).clone().detach()
        return item

    def __len__(self):
        return len(self.labels)

def get_loaders(df_train, df_test, tokenizer, cat_fts, txt_fts):
    train_dataset = GenderDataset(df_train, tokenizer, cat_fts, txt_fts)
    test_dataset = GenderDataset(df_test, tokenizer, cat_fts, txt_fts)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)
    return train_loader, test_loader

####################
#  train and eval  #
####################

def train(device, train_loader, model, lr=5e-5, n_epochs = 3):
    model.to(device)
    model.train()
    optim = AdamW(model.parameters(), lr=lr)
    for epoch in range(n_epochs):
        total_loss = 0
        for batch in train_loader:
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs["loss"]
            loss.backward()
            optim.step()
            total_loss += loss.item()
        # if epoch%2==0:
        #     print(f'Epoch {epoch+1}, Loss: {total_loss:1f}')

def test(device, test_loader, model):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            output = model(input_ids, attention_mask=attention_mask, labels=labels)
            for idx, logits in enumerate(output["logits"]):
                if torch.argmax(logits) == labels[idx]:
                    correct += 1   
                total += 1
    accuracy = round(correct / total, 3)
    return accuracy

##############
#  Pipeline  #
##############

def torch_pipeline(
    device: torch.device,
    model: torch.nn.Module,
    tokenizer,
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    cat_fts: List[str], 
    txt_fts: List[str], 
    filename: str, 
    lr: float = 5e-5,
    n_epochs: int = 3
) -> None:
    """
    Torch pipeline for training and evaluating a PyTorch model for gender classification.

    This function performs the following steps:
    1. Prepares data loaders for training and testing.
    2. Trains the model on the training data for the specified number of epochs.
    3. Evaluates the trained model on the testing data and prints the accuracy.

    Parameters:
    - device (torch.device): The device on which to perform computations (e.g., "cuda" for GPU or "cpu").
    - model (torch.nn.Module): The PyTorch model to be trained and evaluated.
    - tokenizer: The tokenizer object used to tokenize the text features.
    - df_train (pd.DataFrame): The DataFrame containing the training data.
    - df_test (pd.DataFrame): The DataFrame containing the testing data.
    - cat_fts (List[str]): A list of column names corresponding to the categorical features.
    - txt_fts (List[str]): A list of column names corresponding to the text features.
    - filename (str): The name of the file or dataset being processed.
    - lr (float, optional): The learning rate for training the model (default: 5e-5).
    - n_epochs (int, optional): The number of epochs for training the model (default: 3).

    Returns:
    - None
    """
    
    train_loader, test_loader = get_loaders(df_train, df_test, tokenizer, cat_fts, txt_fts)
    train(device, train_loader, model, lr=lr, n_epochs = n_epochs)
    score = test(device, test_loader,  model)
    print(f"Accuracy test {filename}: {score:2f} \n")

def torch_iter_pipeline(
    device: torch.device,
    model: torch.nn.Module,
    tokenizer,
    data_dict: Dict[str, pd.DataFrame],
    cat_fts: List[str], 
    txt_fts: List[str], 
    train_id: str, 
    lr: float = 5e-5,
    n_epochs: int = 3
) -> None:
    """
    This function iterates over a dictionary of dataframes, where each dataframe represents a dataset
    (e.g., train, test). For each test dataset, it trains and evaluates the model using the corresponding
    training dataset.

    Parameters:
    - device (torch.device): The device on which to perform computations (e.g., "cuda" for GPU or "cpu").
    - model (torch.nn.Module): The PyTorch model to be trained and evaluated.
    - tokenizer: The tokenizer object used to tokenize the text features.
    - data_dict (Dict[str, pd.DataFrame]): A dictionary containing the datasets, where the keys represent
      the dataset names and the values are pandas DataFrames.
    - cat_fts (List[str]): A list of column names corresponding to the categorical features.
    - txt_fts (List[str]): A list of column names corresponding to the text features.
    - train_id (str): Identifier for the training dataset used in the filenames.
    - lr (float, optional): The learning rate for training the model (default: 5e-5).
    - n_epochs (int, optional): The number of epochs for training the model (default: 3).

    Returns:
    - None
    """
    for name, df in data_dict.items():
        if is_test_df(name):
            pref_name = "_".join(name.split("_")[:-1])
            filename = f"{pref_name}_{train_id}"
            df_train = data_dict[f"{pref_name}_train"]
            torch_pipeline(
                device,
                model,
                tokenizer,
                df_train,
                df,
                cat_fts, 
                txt_fts, 
                filename, 
                lr=lr,
                n_epochs = n_epochs
                
            )

In [25]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
selection_to_test = [
    (["feminite_nom"], ["genre_nom", "nom", "metier", "lien_famille", "civilite","text"]),
    (["feminite_nom"], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "text"]),
    ([], ["nom",  "lien_famille", "civilite","text"]),
    ([], ["nom",  "civilite","text"]),
    ([], ["nom",  "lien_famille", "text"]),
    ([], ["nom",  "text"]),
]

i=0
for cat_fts, txt_fts in selection_to_test:
    print("\n==========",cat_fts+txt_fts, "==========\n")
    torch_iter_pipeline(
        device,
        model,
        tokenizer,
        data_dict,
        cat_fts, 
        txt_fts, 
        str(i), 
        lr=5e-5,
        n_epochs = 5
    )
    i+=1

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Accuracy test gr_raw_0: 0.959000 

Accuracy test pr_raw_0: 0.924000 

Accuracy test gr_proc_0: 0.986000 

Accuracy test pr_proc_0: 0.959000 



Accuracy test gr_raw_1: 0.986000 

Accuracy test pr_raw_1: 0.952000 

Accuracy test gr_proc_1: 0.986000 

Accuracy test pr_proc_1: 0.945000 



Accuracy test gr_raw_2: 0.959000 

Accuracy test pr_raw_2: 0.924000 

Accuracy test gr_proc_2: 0.952000 

Accuracy test pr_proc_2: 0.876000 



Accuracy test gr_raw_3: 0.917000 

Accuracy test pr_raw_3: 0.862000 

Accuracy test gr_proc_3: 0.938000 

Accuracy test pr_proc_3: 0.931000 



Accuracy test gr_raw_4: 0.959000 

Accuracy test pr_raw_4: 0.924000 

Accuracy test gr_proc_4: 0.972000 

Accuracy test pr_proc_4: 0.917000 



Accuracy test gr_raw_5: 0.966000 

Accuracy test pr_raw_5: 0.876000 

Accuracy test gr_proc_5: 0.966000 

Accuracy test pr_proc_5: 0.897000 



Accuracy test gr_raw_6: 0.952000 

Accuracy test pr_raw_6: 0.890000 

Accuracy test gr_proc_6: 0.952000 

Accuracy test pr_proc_6: 0.8

In [28]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
model = CamembertForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

selection_to_test = [
    (["feminite_nom"], ["genre_nom", "nom", "metier", "lien_famille", "civilite","text"]),
    (["feminite_nom"], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "civilite","text"]),
    ([], ["nom", "metier", "lien_famille", "text"]),
    ([], ["nom",  "lien_famille", "civilite","text"]),
    ([], ["nom",  "lien_famille", "civilite","text"]),
    ([], ["nom",  "civilite","text"]),
    ([], ["nom",  "lien_famille", "text"]),
    ([], ["nom",  "text"]),
]

i=0
for cat_fts, txt_fts in selection_to_test:
    print("\n==========",cat_fts+txt_fts, "==========\n")
    torch_iter_pipeline(
        device,
        model,
        tokenizer,
        data_dict,
        cat_fts, 
        txt_fts, 
        str(i), 
        lr=5e-5,
        n_epochs = 15
    )
    i+=1

You are using a model of type roberta to instantiate a model of type camembert. This is not supported for all configurations of models and can yield errors.




Accuracy test gr_raw_0: 0.986000 

Accuracy test pr_raw_0: 0.938000 

Accuracy test gr_proc_0: 0.972000 

Accuracy test pr_proc_0: 0.931000 



Accuracy test gr_raw_1: 0.972000 

Accuracy test pr_raw_1: 0.938000 

Accuracy test gr_proc_1: 0.986000 

Accuracy test pr_proc_1: 0.952000 



Accuracy test gr_raw_2: 0.917000 

Accuracy test pr_raw_2: 0.855000 

Accuracy test gr_proc_2: 0.883000 

Accuracy test pr_proc_2: 0.890000 



Accuracy test gr_raw_3: 0.931000 

Accuracy test pr_raw_3: 0.862000 

Accuracy test gr_proc_3: 0.924000 

Accuracy test pr_proc_3: 0.890000 



Accuracy test gr_raw_4: 0.897000 

Accuracy test pr_raw_4: 0.903000 

Accuracy test gr_proc_4: 0.931000 

Accuracy test pr_proc_4: 0.876000 



Accuracy test gr_raw_5: 0.917000 

Accuracy test pr_raw_5: 0.855000 

Accuracy test gr_proc_5: 0.876000 

Accuracy test pr_proc_5: 0.841000 



Accuracy test gr_raw_6: 0.917000 

Accuracy test pr_raw_6: 0.876000 

Accuracy test gr_proc_6: 0.841000 

Accuracy test pr_proc_6: 0.8

**L'analyse des résultats est réalisée dans le rapport.**