In [1]:
# Install the third-party dependencies of this notebook into the kernel's environment
# (%pip targets the running kernel, unlike a bare shell `pip`).
# - The Python interpreter itself is not a pip package, so the former
#   `pip install python==3.8` line was removed (it could only fail).
# - `transformer` was replaced by `transformers`: presumably the Hugging Face library was
#   intended, given the `datasets` / `accelerate` installs next to it -- TODO confirm.
%pip install gensim fasttext
%pip install torch
%pip install pandas
%pip install scikit-learn
%pip install transformers
%pip install datasets
%pip install accelerate==0.26.0

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313472 sha256=a9ac5412ec4929f36892a90cbf2255732763fcb15573b82bfe7c8966d575f985
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a513fa6b79451473ceb7713017823c3
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Success

In [None]:
# Extract the course archive into ./data.
# -o overwrites existing files without asking, so re-running this cell on an already
# extracted archive does not stop at unzip's interactive "replace?" prompt.
! unzip -o TP_ISD2020.zip -d data
# ! unzip scripts.zip
# ! rm -r __MACOSX/

Archive:  TP_ISD2020.zip
   creating: data/QUAERO_FrenchMed/
   creating: data/QUAERO_FrenchMed/EMEA/
  inflating: data/QUAERO_FrenchMed/EMEA/EMEAdev_layer1_ID.conll  
  inflating: data/QUAERO_FrenchMed/EMEA/EMEAtest_layer1_ID.conll  
  inflating: data/QUAERO_FrenchMed/EMEA/EMEAtrain_layer1_ID.conll  
   creating: data/QUAERO_FrenchMed/MEDLINE/
  inflating: data/QUAERO_FrenchMed/MEDLINE/MEDLINEdev_layer1_ID.conll  
  inflating: data/QUAERO_FrenchMed/MEDLINE/MEDLINEtest_layer1_ID.conll  
  inflating: data/QUAERO_FrenchMed/MEDLINE/MEDLINEtrain_layer1_ID.conll  
  inflating: data/QUAERO_FrenchMed/MEDLINE_FR_tok.ospl  
  inflating: data/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl  
   creating: data/QUAERO_FrenchPress/
  inflating: data/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl  
  inflating: data/QUAERO_FrenchPress/fra4_ID.dev  
  inflating: data/QUAERO_FrenchPress/fra4_ID.test  
  inflating: data/QUAERO_FrenchPress/fra4_ID.train  
  inflating: data/README             
Archi

In [5]:
def load_corpus(filepath):
    """Read a UTF-8 text file into a list of token lists.

    :param filepath: path of the text file to read
    :return: one entry per line of the file, each entry being the list of
             whitespace-separated tokens of that line (an empty list for a
             blank line)
    """
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            sentences.append(tokens)
    return sentences

# Corpus file locations
medical_corpus_path = "data/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl"
press_corpus_path = "data/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl"

# Load both corpora (medical first, then press) as lists of token lists
medical_corpus, press_corpus = (
    load_corpus(corpus_file)
    for corpus_file in (medical_corpus_path, press_corpus_path)
)

# Query words whose nearest neighbours are compared across models
candidate_words = [
    "patient",
    "traitement",
    "maladie",
    "solution",
    "jaune",
]

In [6]:
from gensim.models import Word2Vec
import fasttext
import fasttext.util
from scipy.spatial.distance import cosine

def train_word2vec(corpus, sg, save_path, epochs=5):
    """Train a gensim Word2Vec model and save it under the ``model/`` directory.

    :param corpus: tokenised sentences, passed to gensim as ``sentences``
                   (e.g. the list of token lists returned by ``load_corpus``)
    :param sg: training algorithm, 0 = CBOW, 1 = Skipgram
    :param save_path: file name of the saved model, relative to ``model/``
    :param epochs: number of training epochs
    :return: None (the model is only written to disk)
    """
    import os  # local import: only needed to prepare the output directory

    target = 'model/' + save_path
    # Create the output directory up front: on a fresh checkout ``model/`` does not
    # exist and saving would fail only after the whole training has run.
    os.makedirs(os.path.dirname(target), exist_ok=True)

    # Train the model
    model = Word2Vec(
        sentences=corpus,
        vector_size=100,  # dimension of the word vectors
        window=5,         # context window size
        min_count=1,      # minimum word frequency
        sg=sg,            # 0 = CBOW, 1 = Skipgram
        workers=4,        # number of training threads
        epochs=epochs
    )
    # Save the model
    model.save(target)
    print(f"Modèle Word2Vec sauvegardé dans {save_path}")

def train_fasttext(corpus_path, save_path, epochs=5):
    """Train an unsupervised FastText CBOW model and save it under ``model/``.

    :param corpus_path: path of the training text file, passed to fasttext as ``input``
    :param save_path: file name of the saved model, relative to ``model/``
    :param epochs: number of training epochs
    :return: None (the model is only written to disk)
    """
    import os  # local import: only needed to prepare the output directory

    target = 'model/' + save_path
    # Create the output directory up front: on a fresh checkout ``model/`` does not
    # exist and saving would fail only after the whole training has run.
    os.makedirs(os.path.dirname(target), exist_ok=True)

    # Train the FastText model (CBOW)
    model = fasttext.train_unsupervised(
        input=corpus_path,
        model='cbow',  # CBOW
        dim=100,       # dimension of the word vectors
        minCount=1,    # minimum word frequency
        epoch=epochs
    )
    # Save the model
    model.save_model(target)
    print(f"Modèle FastText sauvegardé dans {save_path}")

def gensim_most_similar(model, word, topn=10):
    """
    Look up the nearest neighbours of a word through ``model.wv.most_similar``.
    :param model: model object exposing its word vectors as ``model.wv``
    :param word: query word
    :param topn: number of similar words to return
    :return: whatever ``most_similar`` returns, or an error message (str)
             when the lookup raises ``KeyError``
    """
    try:
        neighbours = model.wv.most_similar(word, topn=topn)
    except KeyError:
        # Unknown word: report it as a message instead of propagating the error
        neighbours = f"Le mot '{word}' n'est pas dans le vocabulaire du modèle."
    return neighbours

def scipy_most_similar(model, word, topn=10):
    """
    Find the words closest to a given word using scipy's cosine distance.

    The query word itself is excluded from the result (it would otherwise always
    come first with a similarity of 1), in line with ``gensim_most_similar``.

    :param model: model object exposing its word vectors as ``model.wv``
    :param word: query word
    :param topn: number of similar words to return
    :return: list of ``(word, cosine_similarity)`` tuples sorted by decreasing
             similarity, or an error message (str) when the word lookup raises
             ``KeyError``
    """
    try:
        word_vector = model.wv[word]
    except KeyError:
        return f"Le mot '{word}' n'est pas dans le vocabulaire du modèle."

    vectors = model.wv  # hoisted: does not change during the scan
    # Cosine similarity against every other vocabulary word
    similarities = [
        (other_word, 1 - cosine(word_vector, vectors[other_word]))
        for other_word in vectors.index_to_key
        if other_word != word
    ]

    # Sort by decreasing similarity and keep the topn best
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:topn]


In [8]:
# Train and save Word2Vec CBOW models on the medical corpus with several epoch
# counts, in order to choose the best number of epochs
w2v_epoch_runs = [
    ("word2vec_cbow_med_train1.model", 1),
    ("word2vec_cbow_med_train5.model", 5),
    ("word2vec_cbow_med_train20.model", 20),
    ("word2vec_cbow_med_train100.model", 100),
]
for w2v_file, w2v_epochs in w2v_epoch_runs:
    train_word2vec(medical_corpus, sg=0, save_path=w2v_file, epochs=w2v_epochs)

Modèle Word2Vec sauvegardé dans word2vec_cbow_med_train1.model
Modèle Word2Vec sauvegardé dans word2vec_cbow_med_train5.model
Modèle Word2Vec sauvegardé dans word2vec_cbow_med_train20.model
Modèle Word2Vec sauvegardé dans word2vec_cbow_med_train100.model


In [9]:
# Reload the four checkpoints of the epoch sweep
word2vec_cbow_med_train1 = Word2Vec.load("model/word2vec_cbow_med_train1.model")
word2vec_cbow_med_train5 = Word2Vec.load("model/word2vec_cbow_med_train5.model")
word2vec_cbow_med_train20 = Word2Vec.load("model/word2vec_cbow_med_train20.model")
word2vec_cbow_med_train100 = Word2Vec.load("model/word2vec_cbow_med_train100.model")

# Printed label -> checkpoint, in the order in which results are displayed
w2v_epoch_models = (
    ("Word2Vec CBOW MED(1):", word2vec_cbow_med_train1),
    ("Word2Vec CBOW MED(5):", word2vec_cbow_med_train5),
    ("Word2Vec CBOW MED(20):", word2vec_cbow_med_train20),
    ("Word2Vec CBOW MED(100):", word2vec_cbow_med_train100),
)

for word in candidate_words:
    print(f"\nRésultats pour le mot '{word}' :")
    for w2v_label, w2v_model in w2v_epoch_models:
        print(w2v_label, gensim_most_similar(w2v_model, word))

# 100 epochs seem to give slightly better results than 20 epochs, but
# 20 epochs are used in the rest of the notebook to reduce computation time.


Résultats pour le mot 'patient' :
Word2Vec CBOW MED(1): [('Eli', 0.7689204812049866), ("'", 0.7671542763710022), ('.', 0.7667074799537659), ('La', 0.7638227939605713), ('pendant', 0.7633992433547974), ('le', 0.7623304128646851), ('une', 0.7620331048965454), ('l', 0.7613641619682312), ('du', 0.7605946063995361), ('les', 0.7605112195014954)]
Word2Vec CBOW MED(5): [('Le', 0.9994989037513733), ('le', 0.9994930028915405), ('La', 0.9994909763336182), ('les', 0.9994809031486511), ('par', 0.999480128288269), ('du', 0.9994780421257019), ('des', 0.9994730949401855), ('en', 0.9994546175003052), ('avec', 0.9994523525238037), ('après', 0.9994502663612366)]
Word2Vec CBOW MED(20): [('qui', 0.9853318929672241), ('délai', 0.9794530868530273), ('carte', 0.9787099361419678), ('médicament', 0.9779112339019775), ('devra', 0.9772442579269409), ('temps', 0.9770488739013672), ('lié', 0.9753610491752625), ('agir', 0.9747278094291687), ('risque', 0.9735011458396912), ('ils', 0.97335284948349)]
Word2Vec CBOW ME

In [10]:
# Train and save FastText CBOW models on the medical corpus with several epoch counts
ft_epoch_runs = [
    ("fasttext_cbow_med_1.bin", 1),
    ("fasttext_cbow_med_5.bin", 5),
    ("fasttext_cbow_med_20.bin", 20),
    ("fasttext_cbow_med_100.bin", 100),
]
for ft_file, ft_epochs in ft_epoch_runs:
    train_fasttext(medical_corpus_path, ft_file, epochs=ft_epochs)

Modèle FastText sauvegardé dans fasttext_cbow_med_1.bin
Modèle FastText sauvegardé dans fasttext_cbow_med_5.bin
Modèle FastText sauvegardé dans fasttext_cbow_med_20.bin
Modèle FastText sauvegardé dans fasttext_cbow_med_100.bin


In [11]:
# Reload the four checkpoints of the FastText epoch sweep
fasttext_cbow_med_1 = fasttext.load_model("model/fasttext_cbow_med_1.bin")
fasttext_cbow_med_5 = fasttext.load_model("model/fasttext_cbow_med_5.bin")
fasttext_cbow_med_20 = fasttext.load_model("model/fasttext_cbow_med_20.bin")
fasttext_cbow_med_100 = fasttext.load_model("model/fasttext_cbow_med_100.bin")

# Checkpoints in display order: 1, 5, 20 and 100 epochs
ft_epoch_models = (
    fasttext_cbow_med_1,
    fasttext_cbow_med_5,
    fasttext_cbow_med_20,
    fasttext_cbow_med_100,
)

for word in candidate_words:
    print(f"\nRésultats pour le mot '{word}' :")
    for ft_model in ft_epoch_models:
        print(ft_model.get_nearest_neighbors(word))

# 10 epochs seem to give the best results
# NOTE(review): only the 1, 5, 20 and 100 epoch models are compared in this cell --
# confirm where the 10-epoch figure comes from.


Résultats pour le mot 'patient' :
[(0.9402261972427368, 'patiente'), (0.9213680028915405, 'patients'), (0.8477667570114136, 'entrainement'), (0.8382683992385864, 'appartient'), (0.8129658102989197, 'département'), (0.8122319579124451, 'Traitements'), (0.8105729222297668, 'inventaires'), (0.8060504794120789, 'pigmentaires'), (0.8058280944824219, 'conscient'), (0.8043614029884338, 'Traitement')]
[(1.000000238418579, 'ventriculaires'), (1.000000238418579, 'avacement'), (1.000000238418579, 'cohérent'), (1.000000238418579, 'isolement'), (1.0000001192092896, 'comment'), (1.0000001192092896, 'retentissement'), (1.0000001192092896, 'spécifiquement'), (1.0000001192092896, 'Allaitement'), (1.0, 'intravasculaires'), (1.0, 'paramétriales')]
[(0.9993148446083069, 'entrainement'), (0.999184787273407, 'Accompagnement'), (0.9991169571876526, 'récemment'), (0.9990652799606323, 'comportement'), (0.9990555047988892, 'soulagement'), (0.9990339875221252, 'glissement'), (0.9990159869194031, 'socialement'),

In [12]:
# Train and save the final Word2Vec models: CBOW (sg=0) and Skipgram (sg=1)
# on the medical corpus, then on the press corpus
final_w2v_runs = [
    (medical_corpus, 0, "word2vec_cbow_med.model"),
    (medical_corpus, 1, "word2vec_skipgram_med.model"),
    (press_corpus, 0, "word2vec_cbow_press.model"),
    (press_corpus, 1, "word2vec_skipgram_press.model"),
]
for final_corpus, final_sg, final_w2v_file in final_w2v_runs:
    train_word2vec(final_corpus, sg=final_sg, save_path=final_w2v_file, epochs=20)

# Train and save the final FastText CBOW models (medical, then press)
final_ft_runs = [
    (medical_corpus_path, "fasttext_cbow_med.bin"),
    (press_corpus_path, "fasttext_cbow_press.bin"),
]
for final_corpus_path, final_ft_file in final_ft_runs:
    train_fasttext(final_corpus_path, final_ft_file, epochs=10)

Modèle Word2Vec sauvegardé dans word2vec_cbow_med.model
Modèle Word2Vec sauvegardé dans word2vec_skipgram_med.model
Modèle Word2Vec sauvegardé dans word2vec_cbow_press.model
Modèle Word2Vec sauvegardé dans word2vec_skipgram_press.model
Modèle FastText sauvegardé dans fasttext_cbow_med.bin
Modèle FastText sauvegardé dans fasttext_cbow_press.bin


In [13]:
# Reload the final Word2Vec models
word2vec_cbow_med = Word2Vec.load("model/word2vec_cbow_med.model")
word2vec_skipgram_med = Word2Vec.load("model/word2vec_skipgram_med.model")
word2vec_cbow_press = Word2Vec.load("model/word2vec_cbow_press.model")
word2vec_skipgram_press = Word2Vec.load("model/word2vec_skipgram_press.model")

# Reload the final FastText models
fasttext_cbow_med = fasttext.load_model("model/fasttext_cbow_med.bin")
fasttext_cbow_press = fasttext.load_model("model/fasttext_cbow_press.bin")

# Printed label -> model, in display order (Word2Vec models first, then FastText)
final_w2v_models = (
    ("Word2Vec CBOW (medical):", word2vec_cbow_med),
    ("Word2Vec Skipgram (medical):", word2vec_skipgram_med),
    ("Word2Vec CBOW (non-medical):", word2vec_cbow_press),
    ("Word2Vec Skipgram (non-medical):", word2vec_skipgram_press),
)
final_ft_models = (
    ("FastText CBOW (medical):", fasttext_cbow_med),
    ("FastText CBOW (non-medical):", fasttext_cbow_press),
)

for word in candidate_words:
    print(f"\nRésultats pour le mot '{word}' :")
    for final_label, final_w2v in final_w2v_models:
        print(final_label, gensim_most_similar(final_w2v, word))
    for final_label, final_ft in final_ft_models:
        print(final_label, final_ft.get_nearest_neighbors(word))


Résultats pour le mot 'patient' :
Word2Vec CBOW (medical): [('qui', 0.9789780974388123), ('délai', 0.9768372774124146), ('allaitement', 0.9753565788269043), ('risque', 0.9750105142593384), ('tant', 0.9746490716934204), ('arrêter', 0.9733835458755493), ('médicament', 0.973052978515625), ('devra', 0.9729708433151245), ('machines', 0.9724516868591309), ('carte', 0.9705780744552612)]
Word2Vec Skipgram (medical): [('carte', 0.9145078063011169), ('aptitude', 0.8815526962280273), ('spéciale', 0.8805074095726013), ('aucun', 0.8788996338844299), ('examiner', 0.8781753778457642), ('présentent', 0.8779392242431641), ('alerte', 0.8778810501098633), ('avoir', 0.8735847473144531), ('Montrez', 0.8715653419494629), ('prescrit', 0.8668498396873474)]
Word2Vec CBOW (non-medical): [('concessionnaire', 0.6445181965827942), ('costumé', 0.6205944418907166), ('morceau', 0.5985224843025208), ('troupeau', 0.593877375125885), ('1936', 0.5876890420913696), ('keuf', 0.5870494842529297), ('tranchant', 0.5865280032

In [None]:
# ! zip model.zip model/*

  adding: model/fasttext_cbow_med_100.bin (deflated 87%)
  adding: model/fasttext_cbow_med_1.bin (deflated 87%)
  adding: model/fasttext_cbow_med_20.bin (deflated 87%)
  adding: model/fasttext_cbow_med_5.bin (deflated 87%)
  adding: model/fasttext_cbow_med.bin (deflated 87%)
  adding: model/fasttext_cbow_press.bin (deflated 80%)
  adding: model/word2vec_cbow_med.model (deflated 10%)
  adding: model/word2vec_cbow_med_train100.model (deflated 10%)
  adding: model/word2vec_cbow_med_train1.model (deflated 9%)
  adding: model/word2vec_cbow_med_train20.model (deflated 10%)
  adding: model/word2vec_cbow_med_train5.model (deflated 10%)
  adding: model/word2vec_cbow_press.model (deflated 10%)
  adding: model/word2vec_skipgram_med.model (deflated 10%)
  adding: model/word2vec_skipgram_press.model (deflated 10%)
