<a href="https://colab.research.google.com/github/SeifKhdija/Indexation-des-documents-NLP/blob/main/Indexation%20NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***Installation des packages et des bibliothèques***

In [None]:
!pip install spacy



In [None]:
!python -m spacy download fr_core_news_sm

Collecting fr_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7 MB)
[K     |████████████████████████████████| 14.7 MB 17.4 MB/s 
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-py3-none-any.whl size=14727026 sha256=732789b24f6188660e7e452712209f5dbdb3e14e86bd8ad0b92d6b5d681c7c8e
  Stored in directory: /tmp/pip-ephem-wheel-cache-ldfbk41x/wheels/c9/a6/ea/0778337c34660027ee67ef3a91fb9d3600b76777a912ea1c24
Successfully built fr-core-news-sm
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')


In [None]:
import spacy
import fr_core_news_sm
# Load the small French spaCy pipeline once; token() below runs every text through it.
nlp=fr_core_news_sm.load()


**Création des documents**

In [None]:
# Toy corpus: four short French sentences, one per document.
# NOTE: the names A-D are rebound in later cells (token lists, then
# normalised / filtered / stemmed lists), so cells must be run in order.
A = "Le loup est dans la bergerie"
B = "Les moutons sont dans la bergerie"
C = "Un loup a mangé un mouton et les autres loups sont restés dans la bergerie"
D = "il y a trois moutons dans le pré, et un mouton dans la gueule du loup"

**TOKENISATION**

In [None]:
def token(doc):
    """Tokenise a raw text with the spaCy pipeline ``nlp``.

    Args:
        doc: the text to split.

    Returns:
        A list holding the ``.text`` of every token, in document order.
    """
    return [tok.text for tok in nlp(doc)]

# Replace each raw sentence by its list of tokens.
A, B, C, D = [token(text) for text in (A, B, C, D)]

# One document per line, in A-D order.
print(A, B, C, D, sep='\n')

['Le', 'loup', 'est', 'dans', 'la', 'bergerie']
['Les', 'moutons', 'sont', 'dans', 'la', 'bergerie']
['Un', 'loup', 'a', 'mangé', 'un', 'mouton', 'et', 'les', 'autres', 'loups', 'sont', 'restés', 'dans', 'la', 'bergerie']
['il', 'y', 'a', 'trois', 'moutons', 'dans', 'le', 'pré', ',', 'et', 'un', 'mouton', 'dans', 'la', 'gueule', 'du', 'loup']


**Normalisation textuelle (ponctuation et casse)**

In [None]:
# Keep purely alphabetic tokens (this drops punctuation) and lower-case them.
A, B, C, D = [
    [tok.lower() for tok in tokens if tok.isalpha()]
    for tokens in (A, B, C, D)
]

# One document per line, in A-D order.
print(A, B, C, D, sep='\n')

['le', 'loup', 'est', 'dans', 'la', 'bergerie']
['les', 'moutons', 'sont', 'dans', 'la', 'bergerie']
['un', 'loup', 'a', 'mangé', 'un', 'mouton', 'et', 'les', 'autres', 'loups', 'sont', 'restés', 'dans', 'la', 'bergerie']
['il', 'y', 'a', 'trois', 'moutons', 'dans', 'le', 'pré', 'et', 'un', 'mouton', 'dans', 'la', 'gueule', 'du', 'loup']


**Enlever les STOPWORDS**

In [None]:
import nltk
# Download the NLTK 'stopwords' corpus, which the lookup below reads from.
nltk.download('stopwords')
from nltk.corpus import stopwords
# French stop words, stored as a set; enlever_stopwords() tests membership against it.
stopWords = set(stopwords.words('french'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def enlever_stopwords(words):
  """Filter stop words out of a token list.

  A token is dropped when it belongs to the module-level ``stopWords`` set
  or to the extra list of "avoir" forms below.

  Args:
    words: iterable of tokens.

  Returns:
    A new list with the surviving tokens, original order preserved.
  """
  formes_avoir = ['a','ai','as','avons','avez','ont']
  return [
    mot for mot in words
    if mot not in stopWords and mot not in formes_avoir
  ]

# Strip the stop words from every document.
A, B, C, D = [enlever_stopwords(tokens) for tokens in (A, B, C, D)]

# One document per line, in A-D order.
print(A, B, C, D, sep='\n')



['loup', 'bergerie']
['moutons', 'bergerie']
['loup', 'mangé', 'mouton', 'autres', 'loups', 'restés', 'bergerie']
['trois', 'moutons', 'pré', 'mouton', 'gueule', 'loup']


**RACINISATION (STEMMING)**

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Shared French Snowball stemmer, reused by return_stem(), term_frequency() and idf().
stemmer = SnowballStemmer(language='french')

def return_stem(sentence):
    """Stem every token of ``sentence`` with the module-level ``stemmer``.

    Args:
        sentence: iterable of tokens.

    Returns:
        A list of stems, one per input token, in the same order.
    """
    return list(map(stemmer.stem, sentence))
# Reduce every remaining token to its stem.
A, B, C, D = [return_stem(tokens) for tokens in (A, B, C, D)]

# One document per line, in A-D order.
print(A, B, C, D, sep='\n')

['loup', 'berger']
['mouton', 'berger']
['loup', 'mang', 'mouton', 'autr', 'loup', 'rest', 'berger']
['trois', 'mouton', 'pré', 'mouton', 'gueul', 'loup']


**Term frequency d'un mot donné (TF)**

In [None]:
from collections import Counter
def term_frequency(document, mot):
    """Print how often the stem of ``mot`` occurs in ``document``.

    ``mot`` is stemmed with the module-level ``stemmer`` and the stem is
    looked up among the items of ``document``. Nothing is printed when the
    stem does not occur; nothing is returned.

    Args:
        document: iterable of stems.
        mot: the word to count.
    """
    racine = stemmer.stem(mot)
    compteur = Counter(document)
    if racine in compteur:
        print('Occurence: ', compteur[racine])

def tf(mot):
  """Print the occurrence count of ``mot`` for each of the documents A-D.

  Each document gets a separator line, its title, and then whatever
  ``term_frequency`` prints for it.

  Args:
    mot: the word to report on.
  """
  print('mot :',mot)
  sections = (
    ('Document A', A),
    ('Document B', B),
    ('Document C', C),
    ('Document D', D),
  )
  for titre, contenu in sections:
    print('-------------------------')
    print(titre)
    term_frequency(contenu, mot)

**Quelques exemples:**

In [None]:
tf('loup')

mot : loup
-------------------------
Document A
Occurence:  1
-------------------------
Document B
-------------------------
Document C
Occurence:  2
-------------------------
Document D
Occurence:  1


In [None]:
tf('bergerie')

mot : bergerie
-------------------------
Document A
Occurence:  1
-------------------------
Document B
Occurence:  1
-------------------------
Document C
Occurence:  1
-------------------------
Document D


In [None]:
tf('mouton')

mot : mouton
-------------------------
Document A
-------------------------
Document B
Occurence:  1
-------------------------
Document C
Occurence:  1
-------------------------
Document D
Occurence:  2


**IDF d'un mot donné**

In [None]:
import math

# The corpus as a list of documents, in A-D order; idf() iterates over it.
documents = [A, B, C, D]

def idf(mot):
  """Return the inverse document frequency of ``mot`` over ``documents``.

  ``mot`` is stemmed with the module-level ``stemmer`` before the lookup, so
  it is compared against the stems stored in the documents.

  Args:
    mot: the word to look up.

  Returns:
    ``math.log(len(documents) / n)``, where ``n`` is the number of documents
    that contain the stem at least once; or the string
    ``"mot n'existe pas dans le corpus"`` when no document contains it.
  """
  stemmot = stemmer.stem(mot)
  # Document frequency: a document counts once, however often the stem
  # occurs in it, so a plain membership test is all that is needed.
  doc_count = sum(1 for document in documents if stemmot in document)
  if doc_count == 0:
    # Returned as a message rather than raised, so existing callers keep
    # receiving the same value for unknown words.
    return "mot n'existe pas dans le corpus"
  return math.log(len(documents) / doc_count)


**Quelques exemples:**

In [None]:
idf('pré')

1.3862943611198906

In [None]:
idf('loup')

0.28768207245178085

**Matrice d’incidence**

In [None]:
import pandas as pd

# Vocabulary: every distinct stem that appears in at least one document.
words = set(A) | set(B) | set(C) | set(D)

# Per-document occurrence counts. Every vocabulary stem gets an entry
# (0 when absent) so that all four dicts share the same keys.
numOfWordsA = {stem: 0 for stem in words}
for word in A:
    numOfWordsA[word] = numOfWordsA[word] + 1

numOfWordsB = {stem: 0 for stem in words}
for word in B:
    numOfWordsB[word] = numOfWordsB[word] + 1

numOfWordsC = {stem: 0 for stem in words}
for word in C:
    numOfWordsC[word] = numOfWordsC[word] + 1

numOfWordsD = {stem: 0 for stem in words}
for word in D:
    numOfWordsD[word] = numOfWordsD[word] + 1

def matrice(wordDict, bagOfWords):
    """Return the raw term counts of one document as a fresh dict.

    Args:
        wordDict: mapping of term -> occurrence count for the document.
        bagOfWords: the document's token list. It is not used (the counts
            are not normalised by document length) and is kept only so
            existing calls stay valid.

    Returns:
        A new dict with the same keys, values and key order as ``wordDict``;
        mutating it leaves ``wordDict`` untouched.
    """
    return dict(wordDict)

# One count dict per document, produced by matrice().
tfA, tfB, tfC, tfD = [
    matrice(counts, tokens)
    for counts, tokens in (
        (numOfWordsA, A),
        (numOfWordsB, B),
        (numOfWordsC, C),
        (numOfWordsD, D),
    )
]

# Incidence matrix: one row per document, one column per vocabulary stem.
tfglobal = pd.DataFrame(
    [tfA, tfB, tfC, tfD],
    index=list('ABCD'),
)
tfglobal

Unnamed: 0,gueul,mang,berger,pré,loup,autr,rest,trois,mouton
A,0,0,1,0,1,0,0,0,0
B,0,0,1,0,0,0,0,0,1
C,0,1,1,0,2,1,1,0,1
D,1,0,0,1,1,0,0,1,2
