# Importation des bibliothéques

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize,WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import FreqDist
import spacy
from nltk.stem.snowball import FrenchStemmer


# Imporation et Fussionnement des documents

In [2]:
import os
path=r"C:\Users\21652\Desktop\TAl\data"
r=[f for f in os.listdir(path) if os.path.isfile(os.path.join(path,f))]
def create_data(r):
    data=[]
    for i in r:
        p=path+'\\'+i
        d=open(p,'r')
        data.append(d.read().lower())
    return data
data=create_data(r)



# Tokenization & droping stop-words & Lemmatisation & Stremming

In [3]:
'''
Fonction qui permet de séparer une séquence (texte) en une liste de tokens (mots)
'''
def tokenization(data):
    tk = WhitespaceTokenizer()
    data_words=[tk.tokenize(row) for row in data ]
    return data_words
'''
Fonction de nettoyage des stop words et des punctuations
'''
def clean(doc):
    #crow=[r for r in doc if r.isalpha()]
    crow=[re.sub(r"[]"+ string.punctuation + r"]", " ",r) for r in doc]
    crow=[re.sub(r"[0-9©@~&£$*%§°#’™·•–®]"," ",w) for w in crow]
    crow=[re.sub(r'\s+', ' ',  w) for w in crow]
    mysp=stopwords.words('french')+stopwords.words('english')
    crow=[w for w in crow if (w not in mysp) & (w !=' ') & (w not in ["tout","plus","cet","chez","d un","ci","celle","…","d une","celui","dun","dans"])]
    return crow
'''
Fonction de lemmatization des mots
'''
def lemmatization(doc):
    lemmatizer = WordNetLemmatizer()
    cdoc=[lemmatizer.lemmatize(w, pos='v') for w in doc]
    return cdoc
'''
Fonctions de Racinisation
'''
def stremmin(doc):
    sb = FrenchStemmer()
    return [sb.stem(w) for w in doc]
#appel des fonctions
data1=tokenization(data)
data1=[lemmatization(doc) for doc in data1]
data1=[stremmin(doc) for doc in data1]
data1=[clean(doc) for doc in data1]



# Extraction de la fréquence des termes simples

In [4]:
data_frequences=[FreqDist(doc) for doc in data1 ]


# Extraction des termes composés

# Méthode Rake

In [5]:
from rake_nltk import Rake
# Exractions des keyphrases et calcul de score en fonction de la fréquence des mots et des cooccurrences
rake = Rake(stopwords=stopwords.words('french')+stopwords.words('english'), max_length=4)

keyphrases=[]
for doc in data1:
    rake.extract_keywords_from_sentences(doc)
    rake_keyphrases =rake.get_ranked_phrases_with_scores()
    keyphrases.append({t[1]:t[0] for t in rake_keyphrases if t[1].find(' ')!=-1})



# Méthode spacy

In [53]:
import spacy.cli
spacy.cli.download("fr_core_news_sm")

✔ Download and installation successful
You can now load the package via spacy.load('fr_core_news_sm')


In [54]:
import spacy
import pytextrank

#cleaned data
data2=[str(" ".join(str(w) for w in doc)) for doc in data1]

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("fr_core_news_sm")
# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")
for text in data2:
    doc=nlp(text)
# examine the top-ranked phrases in the document
keyphrases=set()
for phrase in doc._.phrases:
    keyphrases.add(phrase.text)
keyphrases

{'l induct com',
 'malad digest',
 'quel class',
 'tomodensitometr cérébral',
 'milieu',
 'canal ouvert  ',
 'dialys méthanol éthylen',
 'pa trois mois',
 'afin permettr',
 'second édit',
 ' hémorrag',
 'aer',
 'dan nombreux circonstances',
 'relev',
 'enregistrement',
 'sl',
 'tension artériel',
 'l admiss',
 'l icann  intern corpor',
 'extract maximale  ',
 'sinus cavern',
 'malad compos',
 'dan nombreux cas',
 'l étrang professionnel',
 'régul urgences  ',
 'rach plaqu motric variable  hétérogéné',
 'stad',
 'model expérimental',
 'numéris coupl',
 'paroxyst',
 'optiqu',
 'surfac',
 'l expériment animal',
 'l angiograph',
 'fibrill ventriculair',
 'admission',
 'connaiss lim pieg',
 'hyperprotéinorachie',
 'médic dan survenu tvc',
 'difficil correl',
 'rapport troubl métabolique  ',
 'aiguës',
 'gérontolog  ',
 'le diarrh',
 'comprend habituel',
 'autr médic interfer transmiss neuromusculair',
 'mêm démarch diagnost',
 'place',
 'cathet long paroi vasculair',
 'compos montag',
 'l i

# Méthode yake

In [55]:
from yake import KeywordExtractor as Yake
#cleaned data
data2=[str(" ".join(str(w) for w in doc)) for doc in data1]
yake = Yake(lan='fr')
yake_keyphrases=[]
for doc in data2:
    yake_keyphrases.append(yake.extract_keywords(doc))

yake_keyphrases

[[('malad trait vni', 1.1765335478296876e-07),
  ('trait vni malad', 2.8759708946947927e-07),
  ('vni malad trait', 2.8759708946947927e-07),
  ('dan group ventil', 3.2052104061245403e-07),
  ('courb pression volum', 3.322721890588002e-07),
  ('pression dan voi', 3.4088635665650596e-07),
  ('dan group malad', 3.6995577142453463e-07),
  ('insuffis respiratoir aigu', 4.586543577226987e-07),
  ('cour ventil mécan', 4.6369052736982713e-07),
  ('patient atteint bpco', 4.987627101432221e-07),
  ('utilis vni malad', 5.283376192382639e-07),
  ('dan étud contrôl', 5.497059655496522e-07),
  ('group patient ventil', 5.611662951393337e-07),
  ('patient ventil ech', 5.690954490178647e-07),
  ('dan group patient', 5.940902700786524e-07),
  ('dan trait malad', 6.04188768354008e-07),
  ('dan group vni', 6.460825013069689e-07),
  ('utilis vni dan', 6.461251631857553e-07),
  ('pression expiratoir posit', 6.482847941884545e-07),
  ('élev dan group', 6.54376165571431e-07)],
 [('pression artériel pulmonair'