# TF-IDF

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
documentNLP = """Le traitement automatique du langage naturel (abr. TALN), ou traitement automatique de la langue naturelle, 
 ou encore traitement automatique des langues (abr. TAL) est un domaine multidisciplinaire impliquant la linguistique,
 l'informatique et l'intelligence artificielle, qui vise à créer des outils de traitement de la langue naturelle pour diverses applications.
 Il ne doit pas être confondu avec la linguistique informatique, qui vise à comprendre les langues au moyen d'outils informatiques.
 Le TALN est sorti des laboratoires de recherche pour être progressivement mis en œuvre dans des applications informatiques nécessitant
 l'intégration du langage humain à la machine. Aussi le TALN est-il parfois appelé ingénierie linguistique.
 En France, le traitement automatique de la langue naturelle a sa revue, Traitement automatique des langues,
 publiée par l’Association pour le traitement automatique des langues (ATALA)."""

documentDataScience = """En termes généraux, la science des données est l'extraction de connaissance d'ensembles de données.
 Elle emploie des techniques et des théories tirées de plusieurs autres domaines plus larges des mathématiques, analyse,
 optimisation et statistique principalement, la théorie de l'information et la technologie de l'information, notamment le traitement de signal,
 des modèles probabilistes, l'apprentissage automatique, l'apprentissage statistique, la programmation informatique, l'ingénierie de données,
 la reconnaissance de formes et l'apprentissage, la visualisation, l'analytique prophétique, la modélisation d'incertitude, le stockage de données,
 la géo-visualisation, la compression de données et le calcul à haute performance.
 Les méthodes qui s'adaptent aux données de masse sont particulièrement intéressantes dans la science des données,
 bien que la discipline ne soit généralement pas considérée comme limitée à ces données.

La science des données (en anglais data science) est une discipline qui s'appuie sur des outils mathématiques, de statistiques,
 d'informatique (cette science est principalement une « science des données numériques ») et de visualisation des données.
 Elle est en plein développement, dans le monde universitaire ainsi que dans le secteur privé et le secteur public.
 Moore en 1991 a défini la statistique comme la science des données6 (définition reprise par d'autres dont James T. McClave et al. en 1997)
 et U. Beck en 2001 oppose la science des données à la science de l'expérience, voyant une dissociation croissante entre ces deux types de science,
 que tendrait selon lui à encourager une société de la gestion du risque au sein d'une « civilisation du danger »."""

documentEconometrie = """L'économétrie est une branche de la science économique qui a pour objectif d'estimer et de tester les modèles économiques.
 L'économétrie en tant que discipline naît dans les années 1930 avec la création de la société d'économétrie par Irving Fisher et Ragnar Frisch (1930)
 et la création de la revue Econometrica (1933). Depuis lors, l'économétrie n'a cessé de se développer et de prendre une importance croissante
 au sein de la science économique.

L'économétrie théorique se focalise essentiellement sur deux questions, l'identification et l'estimation statistique. L'économétrie appliquée
 utilise les méthodes économétriques pour comprendre des domaines de l'économie comme l'analyse du marché du travail,
 l'économie de l'éducation ou encore tester la pertinence empirique des modèles de croissance.

L'économétrie appliquée utilise aussi bien des données issues d'un protocole expérimental, que ce soit une expérience de laboratoire
 ou une expérience de terrain, que des données issues directement de l'observation du réel sans manipulation du chercheur.
 Lorsque l'économètre utilise des données issues directement de l'observation du réel, il est fréquent d'identifier des expériences naturelles
 pour retrouver une situation quasi-expérimentale. On parle parfois de révolution de crédibilité, terme controversé, pour désigner l'essor fulgurant
 de ces méthodes de recherche dans la discipline, et en économie en général."""

documentHistoire = """L’histoire, souvent écrit avec la première lettre majuscule,
 est à la fois l’étude et l'écriture des faits et des événements passés quelles que soient leur variété et leur complexité.
 L'histoire est également une science humaine et sociale. On désigne aussi couramment sous le terme d’histoire (par synecdoque) le passé lui-même,
 comme dans les leçons de l'histoire. L'histoire est un récit écrit par lequel des hommes et des femmes (les historiens et historiennes)
 s'efforcent de faire connaître les temps révolus. Ces tentatives ne sont jamais entièrement indépendantes de conditionnements étrangers au domaine
 telle que la vision du monde de leur auteur ou de sa culture, mais elles sont censées être élaborées à partir de sources plutôt que guidées
 par la spéculation ou l'idéologie.

Au cours des siècles, les historiens ont façonné leurs méthodes ainsi que les champs d'intervention, tout en réévaluant leurs sources,
 leur origine et leur exploitation. La discipline universitaire d'étude et écriture de l'histoire, y comprise la critique des méthodes,
 est l'historiographie. Elle s'appuie sur diverses sciences auxiliaires complétant selon les travaux menés la compétence générale de l'historien.
 Elle reste malgré tout une construction humaine, inévitablement inscrite dans son époque, susceptible d'être utilisée en dehors de son domaine,
 notamment à des fins d'ordre politique. 
"""


In [3]:
bagOfWordsNLP = documentNLP.split(' ')
bagOfWordsDataScience = documentDataScience.split(' ')
bagOfWordsEconometrie = documentEconometrie.split(' ')
bagOfWordsHistoire = documentHistoire.split(' ')

In [4]:
uniqueWords = set(bagOfWordsNLP).union(set(bagOfWordsDataScience)).union(set(bagOfWordsEconometrie)).union(set(bagOfWordsHistoire))

In [5]:
numOfWordsNLP = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsNLP:
    numOfWordsNLP[word] += 1

numOfWordsDataScience = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsDataScience:
    numOfWordsDataScience[word] += 1

numOfWordsEconometrie = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsEconometrie:
    numOfWordsEconometrie[word] += 1

numOfWordsHistoire = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsHistoire:
    numOfWordsHistoire[word] += 1

In [12]:
from nltk.corpus import stopwords

# nltk.download('stopwords')
stopwords.words('french')

['au',
 'aux',
 'avec',
 'ce',
 'ces',
 'dans',
 'de',
 'des',
 'du',
 'elle',
 'en',
 'et',
 'eux',
 'il',
 'ils',
 'je',
 'la',
 'le',
 'les',
 'leur',
 'lui',
 'ma',
 'mais',
 'me',
 'même',
 'mes',
 'moi',
 'mon',
 'ne',
 'nos',
 'notre',
 'nous',
 'on',
 'ou',
 'par',
 'pas',
 'pour',
 'qu',
 'que',
 'qui',
 'sa',
 'se',
 'ses',
 'son',
 'sur',
 'ta',
 'te',
 'tes',
 'toi',
 'ton',
 'tu',
 'un',
 'une',
 'vos',
 'votre',
 'vous',
 'c',
 'd',
 'j',
 'l',
 'à',
 'm',
 'n',
 's',
 't',
 'y',
 'été',
 'étée',
 'étées',
 'étés',
 'étant',
 'étante',
 'étants',
 'étantes',
 'suis',
 'es',
 'est',
 'sommes',
 'êtes',
 'sont',
 'serai',
 'seras',
 'sera',
 'serons',
 'serez',
 'seront',
 'serais',
 'serait',
 'serions',
 'seriez',
 'seraient',
 'étais',
 'était',
 'étions',
 'étiez',
 'étaient',
 'fus',
 'fut',
 'fûmes',
 'fûtes',
 'furent',
 'sois',
 'soit',
 'soyons',
 'soyez',
 'soient',
 'fusse',
 'fusses',
 'fût',
 'fussions',
 'fussiez',
 'fussent',
 'ayant',
 'ayante',
 'ayantes',


In [13]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [19]:
tfNLP = computeTF(numOfWordsNLP, bagOfWordsNLP)
tfDataScience = computeTF(numOfWordsDataScience, bagOfWordsDataScience)
tfEconometrie = computeTF(numOfWordsEconometrie, bagOfWordsEconometrie)
tfHistoire = computeTF(numOfWordsHistoire, bagOfWordsHistoire)

In [20]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [21]:
idfs = computeIDF([numOfWordsNLP, numOfWordsDataScience, numOfWordsEconometrie, numOfWordsHistoire])

In [22]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [23]:
tfidfNLP = computeTFIDF(tfNLP, idfs)
tfidfDataScience = computeTFIDF(tfDataScience, idfs)
tfidfEconometrie = computeTFIDF(tfEconometrie, idfs)
tfidfHistoire = computeTFIDF(tfHistoire, idfs)
df = pd.DataFrame([tfidfNLP, tfidfDataScience, tfidfEconometrie])

In [24]:
print(df)

   (définition   l'essor  traitement  ...  femmes  l'expérience,       les
0     0.000000  0.000000    0.031991  ...     0.0       0.000000  0.002213
1     0.005658  0.000000    0.002829  ...     0.0       0.005658  0.000000
2     0.000000  0.007037    0.000000  ...     0.0       0.000000  0.004381

[3 rows x 413 columns]


In [25]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentNLP, documentDataScience, documentEconometrie, documentHistoire])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [26]:
print(df)

       1930      1933      1991  ...  événements      être     œuvre
0  0.000000  0.000000  0.000000  ...     0.00000  0.106778  0.067717
1  0.000000  0.000000  0.043496  ...     0.00000  0.000000  0.000000
2  0.104828  0.052414  0.000000  ...     0.00000  0.000000  0.000000
3  0.000000  0.000000  0.000000  ...     0.05785  0.091219  0.000000

[4 rows x 342 columns]


In [27]:
df.loc[0]

1930          0.000000
1933          0.000000
1991          0.000000
1997          0.000000
2001          0.000000
                ...   
étrangers     0.000000
étude         0.000000
événements    0.000000
être          0.106778
œuvre         0.067717
Name: 0, Length: 342, dtype: float64

In [28]:
dist_NLP_DataScience = np.linalg.norm(df.loc[0] - df.loc[1])
dist_NLP_Econometrie = np.linalg.norm(df.loc[0] - df.loc[2])
dist_NLP_Histoire = np.linalg.norm(df.loc[0] - df.loc[3])
dist_DataScience_Econometrie = np.linalg.norm(df.loc[1] - df.loc[2])
dist_DataScience_Histoire = np.linalg.norm(df.loc[1] - df.loc[3])
dist_Econometrie_Histoire = np.linalg.norm(df.loc[2] - df.loc[3])

In [29]:
print(dist_NLP_DataScience)
print(dist_NLP_Econometrie)
print(dist_NLP_Histoire)
print(dist_DataScience_Econometrie)
print(dist_DataScience_Histoire)
print(dist_Econometrie_Histoire)

1.1673312267944298
1.2190626386442045
1.2430371781718232
0.9948497561633074
1.1000016429145862
1.144129897552517


# Embeddings de mots

## Word2Vec

In [30]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

import gensim 
from gensim.models import Word2Vec 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [31]:
texte1 = """L’exploration de données, connue aussi sous l'expression de fouille de données, forage de données, prospection de données, data mining,
 ou encore extraction de connaissances à partir de données, a pour objet l’extraction d'un savoir ou d'une connaissance à partir de grandes quantités
 de données, par des méthodes automatiques ou semi-automatiques.
Elle se propose d'utiliser un ensemble d'algorithmes issus de disciplines scientifiques diverses telles que les statistiques,
 l'intelligence artificielle ou l'informatique, pour construire des modèles à partir des données,
 c'est-à-dire trouver des structures intéressantes ou des motifs selon des critères fixés au préalable,
 et d'en extraire un maximum de connaissances.
L'utilisation industrielle ou opérationnelle de ce savoir dans le monde professionnel permet de résoudre des problèmes très divers,
 allant de la gestion de la relation client à la maintenance préventive, en passant par la détection de fraudes ou encore l'optimisation de sites web.
C'est aussi le mode de travail du journalisme de données.
L'exploration de données fait suite, dans l'escalade de l'exploitation des données de l'entreprise, à l'informatique décisionnelle.
Celle-ci permet de constater un fait, tel que le chiffre d'affaires, et de l'expliquer comme le chiffre d'affaires décliné par produits,
 tandis que l'exploration de données permet de classer les faits et de les prévoir dans une certaine mesure ou encore de les éclairer en révélant
 par exemple les variables ou paramètres qui pourraient faire comprendre pourquoi le chiffre d'affaires de tel point de vente est supérieur
 à celui de tel autre. """

texte2 = """En statistique, les analyses multivariées ont pour caractéristique de s'intéresser à des lois de probabilité à plusieurs variables.
Les analyses bivariées sont des cas particuliers à deux variables.
Les analyses multivariées sont très diverses selon l'objectif recherché, la nature des variables et la mise en œuvre formelle.
On peut identifier deux grandes familles : celle des méthodes descriptives (visant à structurer et résumer l'information)
 et celle des méthodes explicatives visant à expliquer une ou des variables dites « dépendantes » (variables à expliquer) par un ensemble de variables
 dites « indépendantes » (variables explicatives).
Les méthodes appelées en français analyse des données en sont un sous-ensemble. """

In [32]:
data = []

# iterate through each sentence in the file 
for i in sent_tokenize(texte1 + texte2): 
    temp = [] 
      
    # tokenize the sentence into words 
    for j in word_tokenize(i): 
        temp.append(j.lower()) 
  
    data.append(temp)

In [33]:
print(data)

[['l', '’', 'exploration', 'de', 'données', ',', 'connue', 'aussi', 'sous', "l'expression", 'de', 'fouille', 'de', 'données', ',', 'forage', 'de', 'données', ',', 'prospection', 'de', 'données', ',', 'data', 'mining', ',', 'ou', 'encore', 'extraction', 'de', 'connaissances', 'à', 'partir', 'de', 'données', ',', 'a', 'pour', 'objet', 'l', '’', 'extraction', "d'un", 'savoir', 'ou', "d'une", 'connaissance', 'à', 'partir', 'de', 'grandes', 'quantités', 'de', 'données', ',', 'par', 'des', 'méthodes', 'automatiques', 'ou', 'semi-automatiques', '.'], ['elle', 'se', 'propose', "d'utiliser", 'un', 'ensemble', "d'algorithmes", 'issus', 'de', 'disciplines', 'scientifiques', 'diverses', 'telles', 'que', 'les', 'statistiques', ',', "l'intelligence", 'artificielle', 'ou', "l'informatique", ',', 'pour', 'construire', 'des', 'modèles', 'à', 'partir', 'des', 'données', ',', "c'est-à-dire", 'trouver', 'des', 'structures', 'intéressantes', 'ou', 'des', 'motifs', 'selon', 'des', 'critères', 'fixés', 'au',

### Continuous bag of words (CBOW)

Le modèle CBOW prédit le mot courant étant donné les mots de contexte dans une fenêtre autour du mot courant.

In [34]:
# Create CBOW model 
model1 = gensim.models.Word2Vec(data, min_count = 1,  
                              size = 100, window = 5,
                              sg = 0)

# Print results
print("Cosine similarity between 'données' " + 
               "and 'connaissance' - CBOW : ", 
    model1.wv.similarity('données', 'connaissance'))

Cosine similarity between 'données' and 'connaissance' - CBOW :  0.03347106


  if np.issubdtype(vec.dtype, np.int):


### Skip gram

La méthode "skip gram" fait le contraire de ce que fait la méthode "cbow" : elle prédit les mots de contexte d'un mot donné.

In [35]:
# Create Skip Gram model 
model2 = gensim.models.Word2Vec(data, min_count = 1,
                                size = 100, window = 5,
                                sg = 1) 

# Print results 
print("Cosine similarity between 'données' " +
          "and 'connaissance' - Skip Gram : ", 
    model2.wv.similarity('données', 'connaissance')) 

Cosine similarity between 'données' and 'connaissance' - Skip Gram :  0.083709136


  if np.issubdtype(vec.dtype, np.int):


In [36]:
word_vectors = model1.wv

In [37]:
print(word_vectors['données'])

[-4.33718786e-03  1.75855705e-03  9.17808735e-04  2.81643821e-03
  1.99199794e-03 -1.61679613e-03 -3.80598963e-03  3.38992360e-03
 -2.49176705e-03 -2.87178392e-03  1.97654660e-03  3.97887453e-03
 -1.64196594e-03  2.37140525e-03  1.08697882e-03 -1.63304631e-03
 -2.12068786e-03 -4.05492214e-03  3.76474462e-03 -4.62627644e-03
  4.94506164e-03  4.41957824e-03  1.76363857e-03 -4.63162456e-03
  4.29786835e-03  6.06227317e-04  1.44860230e-03 -5.17087989e-03
  4.10126522e-04  6.55350625e-04  2.99918070e-03 -1.30605232e-03
 -2.55716965e-04  8.03350064e-04  3.29441065e-03 -4.73395037e-03
  3.17598484e-03 -1.62105437e-03 -2.05177651e-03 -3.10599175e-03
  3.82131664e-03 -3.72754061e-04  4.59108036e-03 -3.37163615e-03
 -1.83012278e-03 -2.08565313e-03  1.97788142e-03 -2.61708372e-03
 -3.93491471e-03 -3.84520786e-03  7.93123210e-04  1.77513775e-05
  2.78521422e-03 -1.52845529e-03  4.06482536e-03  8.67334544e-04
  3.24610039e-03 -2.18762521e-04 -1.92727684e-03 -1.06676962e-04
 -2.79278448e-03 -6.25226

In [38]:
import gensim.downloader as api

# api.info("glove-wiki-gigaword-100")
word_vectors = api.load("glove-wiki-gigaword-100")  # load pre-trained word-vectors from gensim-data



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [39]:
result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

queen: 0.7699


  if np.issubdtype(vec.dtype, np.int):


In [48]:
result = word_vectors.most_similar(positive=['female', 'lion'], negative=['male'])
print("{}: {:.4f}".format(*result[0]))

dragon: 0.6694


  if np.issubdtype(vec.dtype, np.int):


In [49]:
print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))

cereal


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


In [50]:
result = word_vectors.similar_by_word("cat")
print("{}: {:.4f}".format(*result[0]))

dog: 0.8798


  if np.issubdtype(vec.dtype, np.int):


### FastText

In [None]:
from gensim.models import FastText

# from gensim.test.utils import common_texts
# print(common_texts[0])
# ['human', 'interface', 'computer']

model = FastText(data, size=100, window=5, min_count=5, workers=4, sg=1)

In [None]:
model.wv.most_similar("données")

  if np.issubdtype(vec.dtype, np.int):


[('des', 0.18751689791679382),
 (',', 0.09106265753507614),
 ('la', 0.03603788837790489),
 ('les', 0.031714729964733124),
 ('un', 0.020188312977552414),
 ('en', 0.013541869819164276),
 ('le', -0.006070766597986221),
 ('variables', -0.019463203847408295),
 ('par', -0.029420850798487663),
 ('et', -0.040456950664520264)]

In [None]:
import fasttext.util

fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')



In [None]:
ft.get_dimension()

300

In [None]:
# fasttext.util.reduce_model(ft, 100)
# ft.get_dimension()

In [None]:
ft.get_word_vector('hello').shape

(300,)

In [None]:
ft.get_nearest_neighbors('hello')

[(0.6911550760269165, 'hello.'),
 (0.6733187437057495, 'hellow'),
 (0.6578026413917542, 'hi'),
 (0.6480079293251038, 'hello-'),
 (0.6307998895645142, 'hello.I'),
 (0.6276512145996094, 'hullo'),
 (0.6193587779998779, 'hallo'),
 (0.6185808777809143, 'howdy'),
 (0.600105881690979, 'hellooooo'),
 (0.5991216897964478, 'hellos')]

## Embeddings keras

In [3]:
import keras
import spacy
from keras.preprocessing.text import one_hot

Using TensorFlow backend.


Création d'un corpus d'exemple

In [4]:
sample_text_1="bitty bought a bit of butter"
sample_text_2="but the bit of butter was a bit bitter"
sample_text_3="so she bought some better butter to make the bitter butter better"

corp=[sample_text_1,sample_text_2,sample_text_3]
no_docs=len(corp)

Encodage du corpus en one-hot à l'aide de la fonction keras.

In [5]:
vocab_size=50 
encod_corp=[]
for i,doc in enumerate(corp):
    # taille de vocab 50 pour être sur que chaque mot est encodé sur un entier unique.
    encod_corp.append(one_hot(doc,50))
    print("The encoding for document",i+1," is : ",one_hot(doc,50))

The encoding for document 1  is :  [2, 40, 38, 1, 22, 28]
The encoding for document 2  is :  [33, 26, 1, 22, 28, 15, 38, 1, 8]
The encoding for document 3  is :  [2, 41, 40, 46, 10, 28, 32, 31, 26, 8, 28, 10]


Padding des documents : la couche d'embedding de keras nécessite des entrées de la même longueur.

In [None]:
#!python -m spacy download fr_core_news_md

In [None]:
# nlp = spacy.load('fr_core_news_md')

In [1]:
!pip install nltk



In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] Une
[nltk_data]     tentative de connexion a échoué car le parti connecté
[nltk_data]     n’a pas répondu convenablement au-delà d’une certaine
[nltk_data]     durée ou une connexion établie a échoué car l’hôte de
[nltk_data]     connexion n’a pas répondu>


False

In [10]:
# length of maximum document. will be nedded whenever create embeddings for the words
maxlen = -1
for doc in corp:
    tokens = nltk.word_tokenize(doc)
    if(maxlen < len(tokens)):
        maxlen = len(tokens)
print("The maximum number of words in any document is : ",maxlen)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\qf721b1n/nltk_data'
    - 'C:\\Users\\qf721b1n\\AppData\\Local\\Continuum\\anaconda3\\nltk_data'
    - 'C:\\Users\\qf721b1n\\AppData\\Local\\Continuum\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\qf721b1n\\AppData\\Local\\Continuum\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\qf721b1n\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
# now to create embeddings all of our docs need to be of same length. hence we can pad the docs with zeros.
pad_corp=pad_sequences(encod_corp,maxlen=maxlen,padding='post',value=0.0)
print("No of padded documents: ",len(pad_corp))

In [None]:
for i,doc in enumerate(pad_corp):
     print("The padded encoding for document",i+1," is : ",doc)

## Transformers

In [12]:
import numpy as np
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertConfig
from transformers import CamembertForTokenClassification
from tqdm import tqdm, trange
import tensorflow as tf

In [13]:
sentences, labels = [], []
with open("frwikinews-20130110-pages-articles.txt.tok.stanford-pos", "r", encoding="utf-8") as f:
    for line in f.readlines():
        sentence = []
        sent_tag = []
        tokens = line.replace("\n", "").split(" ")
        for token in tokens:
            splits = token.split("_")
            if len(splits) != 2: continue
            word, tag = splits
            sentence.append(word)
            sent_tag.append(tag)
        sentences.append(" ".join(sentence))
        labels.append(sent_tag)
    f.close()

In [107]:
tags_val = list(set().union(*labels))
tag2idx = {t:i for i,t in enumerate(tags_val)}
# tag2idx["<PAD>"] = len(tag2idx)

In [15]:
lens = np.array(list(map(len, sentences)))
lens.min(), lens.max(), lens.mean()

(21, 1733, 156.07599958838796)

In [73]:
MAX_LEN = 150
batch_size = 64

In [74]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
device

device(type='cpu')

In [75]:
tokenizer = CamembertTokenizer.from_pretrained('camembert-base/', do_lower_case=True)

In [90]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

['▁à', '▁la', '▁suite', '▁de', '▁la', '▁parution', '▁le', '▁matin', '▁même', '▁d', "'", '▁un', '▁article', '▁2', '▁=', '▁le', '▁concernant', '▁dans', '▁le', '▁quotidien', '▁libération', '▁', ',', '▁christ', 'oph', 'e', '▁h', 'onde', 'la', 'tte', '▁décide', '▁de', '▁ne', '▁pas', '▁présenter', '▁le', '▁journal', '▁de', '▁13', '▁h', '▁00', '▁de', '▁france', '▁2', '▁', '.']


In [91]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [108]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PONCT"], padding="post",
                     dtype="long", truncating="post")

In [109]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [118]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [119]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [120]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [121]:
model = CamembertForTokenClassification.from_pretrained("camembert-base/", num_labels=len(tag2idx))

Some weights of the model checkpoint at camembert-base/ were not used when initializing CamembertForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base/ and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream t

In [122]:
FULL_FINETUNING = True #True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)



In [123]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [124]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [125]:
epochs = 10
max_grad_norm = 1.0
total_t0 = time.time()

for _ in trange(epochs, desc="Epoch"):
    t0 = time.time()
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss, something_else = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)[0]
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_val[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_val[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  Batch    10  of  1,230.    Elapsed: 0:02:31.
  Batch    20  of  1,230.    Elapsed: 0:04:46.
  Batch    30  of  1,230.    Elapsed: 0:07:00.
  Batch    50  of  1,230.    Elapsed: 0:11:32.
  Batch    60  of  1,230.    Elapsed: 0:13:45.
  Batch    70  of  1,230.    Elapsed: 0:15:57.
  Batch    80  of  1,230.    Elapsed: 0:18:15.
  Batch    90  of  1,230.    Elapsed: 0:20:32.
  Batch   100  of  1,230.    Elapsed: 0:22:43.
  Batch   110  of  1,230.    Elapsed: 0:24:55.
  Batch   120  of  1,230.    Elapsed: 0:27:06.
  Batch   130  of  1,230.    Elapsed: 0:29:17.
  Batch   140  of  1,230.    Elapsed: 0:31:28.
  Batch   150  of  1,230.    Elapsed: 0:33:39.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:02.
  Batch   180  of  1,230.    Elapsed: 0:40:14.
  Batch   190  of  1,230.    Elapsed: 0:42:25.
  Batch   200  of  1,230.    Elapsed: 0:44:36.
  Batch   210  of  1,230.    Elapsed: 0:46:47.
  Batch   220  of  1,230.    Elapsed: 0:48:59.
  Batch   230

Epoch:  10%|█         | 1/10 [4:46:16<42:56:31, 17176.87s/it]

F1-Score: 0.6678528399311532
  Batch    10  of  1,230.    Elapsed: 0:02:13.
  Batch    20  of  1,230.    Elapsed: 0:04:26.
  Batch    30  of  1,230.    Elapsed: 0:06:39.
  Batch    40  of  1,230.    Elapsed: 0:08:51.
  Batch    50  of  1,230.    Elapsed: 0:11:03.
  Batch    60  of  1,230.    Elapsed: 0:13:16.
  Batch    70  of  1,230.    Elapsed: 0:15:26.
  Batch    80  of  1,230.    Elapsed: 0:17:39.
  Batch    90  of  1,230.    Elapsed: 0:19:50.
  Batch   100  of  1,230.    Elapsed: 0:22:02.
  Batch   110  of  1,230.    Elapsed: 0:24:14.
  Batch   120  of  1,230.    Elapsed: 0:26:26.
  Batch   130  of  1,230.    Elapsed: 0:28:38.
  Batch   140  of  1,230.    Elapsed: 0:30:50.
  Batch   150  of  1,230.    Elapsed: 0:33:01.
  Batch   160  of  1,230.    Elapsed: 0:35:13.
  Batch   170  of  1,230.    Elapsed: 0:37:25.
  Batch   180  of  1,230.    Elapsed: 0:39:37.
  Batch   190  of  1,230.    Elapsed: 0:41:50.
  Batch   200  of  1,230.    Elapsed: 0:44:02.
  Batch   210  of  1,230.    El

Epoch:  20%|██        | 2/10 [9:32:01<38:08:58, 17167.32s/it]

F1-Score: 0.6456441531748572
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:27.
  Batch    30  of  1,230.    Elapsed: 0:06:40.
  Batch    40  of  1,230.    Elapsed: 0:08:54.
  Batch    50  of  1,230.    Elapsed: 0:11:07.
  Batch    60  of  1,230.    Elapsed: 0:13:22.
  Batch    70  of  1,230.    Elapsed: 0:15:37.
  Batch    80  of  1,230.    Elapsed: 0:17:51.
  Batch    90  of  1,230.    Elapsed: 0:20:06.
  Batch   100  of  1,230.    Elapsed: 0:22:21.
  Batch   110  of  1,230.    Elapsed: 0:24:37.
  Batch   120  of  1,230.    Elapsed: 0:26:50.
  Batch   130  of  1,230.    Elapsed: 0:29:03.
  Batch   140  of  1,230.    Elapsed: 0:31:15.
  Batch   150  of  1,230.    Elapsed: 0:33:28.
  Batch   160  of  1,230.    Elapsed: 0:35:41.
  Batch   170  of  1,230.    Elapsed: 0:37:53.
  Batch   180  of  1,230.    Elapsed: 0:40:07.
  Batch   190  of  1,230.    Elapsed: 0:42:21.
  Batch   200  of  1,230.    Elapsed: 0:44:35.
  Batch   210  of  1,230.    El

Epoch:  30%|███       | 3/10 [14:22:30<33:32:00, 17245.81s/it]

F1-Score: 0.8512631076728391
  Batch    10  of  1,230.    Elapsed: 0:02:13.
  Batch    20  of  1,230.    Elapsed: 0:04:25.
  Batch    30  of  1,230.    Elapsed: 0:06:39.
  Batch    40  of  1,230.    Elapsed: 0:08:53.
  Batch    50  of  1,230.    Elapsed: 0:11:06.
  Batch    60  of  1,230.    Elapsed: 0:13:20.
  Batch    70  of  1,230.    Elapsed: 0:15:36.
  Batch    80  of  1,230.    Elapsed: 0:17:50.
  Batch    90  of  1,230.    Elapsed: 0:20:04.
  Batch   100  of  1,230.    Elapsed: 0:22:18.
  Batch   110  of  1,230.    Elapsed: 0:24:32.
  Batch   120  of  1,230.    Elapsed: 0:26:46.
  Batch   130  of  1,230.    Elapsed: 0:29:02.
  Batch   140  of  1,230.    Elapsed: 0:31:15.
  Batch   150  of  1,230.    Elapsed: 0:33:31.
  Batch   160  of  1,230.    Elapsed: 0:35:45.
  Batch   170  of  1,230.    Elapsed: 0:37:58.
  Batch   180  of  1,230.    Elapsed: 0:40:12.
  Batch   190  of  1,230.    Elapsed: 0:42:26.
  Batch   200  of  1,230.    Elapsed: 0:44:39.
  Batch   210  of  1,230.    El

Epoch:  40%|████      | 4/10 [19:13:06<28:50:16, 17302.71s/it]

F1-Score: 0.8555478903830289
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:29.
  Batch    30  of  1,230.    Elapsed: 0:06:44.
  Batch    40  of  1,230.    Elapsed: 0:08:58.
  Batch    50  of  1,230.    Elapsed: 0:11:13.
  Batch    60  of  1,230.    Elapsed: 0:13:25.
  Batch    70  of  1,230.    Elapsed: 0:15:39.
  Batch    80  of  1,230.    Elapsed: 0:17:55.
  Batch    90  of  1,230.    Elapsed: 0:20:09.
  Batch   100  of  1,230.    Elapsed: 0:22:24.
  Batch   110  of  1,230.    Elapsed: 0:24:39.
  Batch   120  of  1,230.    Elapsed: 0:26:55.
  Batch   130  of  1,230.    Elapsed: 0:29:09.
  Batch   140  of  1,230.    Elapsed: 0:31:23.
  Batch   150  of  1,230.    Elapsed: 0:33:36.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:05.
  Batch   180  of  1,230.    Elapsed: 0:40:20.
  Batch   190  of  1,230.    Elapsed: 0:42:34.
  Batch   200  of  1,230.    Elapsed: 0:44:48.
  Batch   210  of  1,230.    El

Epoch:  50%|█████     | 5/10 [24:04:10<24:05:56, 17351.27s/it]

F1-Score: 0.8583599313546852
  Batch    10  of  1,230.    Elapsed: 0:02:15.
  Batch    20  of  1,230.    Elapsed: 0:04:28.
  Batch    30  of  1,230.    Elapsed: 0:06:41.
  Batch    40  of  1,230.    Elapsed: 0:08:54.
  Batch    50  of  1,230.    Elapsed: 0:11:08.
  Batch    60  of  1,230.    Elapsed: 0:13:21.
  Batch    70  of  1,230.    Elapsed: 0:15:35.
  Batch    80  of  1,230.    Elapsed: 0:17:51.
  Batch    90  of  1,230.    Elapsed: 0:20:10.
  Batch   100  of  1,230.    Elapsed: 0:22:26.
  Batch   110  of  1,230.    Elapsed: 0:24:42.
  Batch   120  of  1,230.    Elapsed: 0:26:58.
  Batch   130  of  1,230.    Elapsed: 0:29:13.
  Batch   140  of  1,230.    Elapsed: 0:31:28.
  Batch   150  of  1,230.    Elapsed: 0:33:45.
  Batch   160  of  1,230.    Elapsed: 0:36:00.
  Batch   170  of  1,230.    Elapsed: 0:38:16.
  Batch   180  of  1,230.    Elapsed: 0:40:31.
  Batch   190  of  1,230.    Elapsed: 0:42:46.
  Batch   200  of  1,230.    Elapsed: 0:45:02.
  Batch   210  of  1,230.    El

Epoch:  60%|██████    | 6/10 [28:55:13<19:18:59, 17384.76s/it]

F1-Score: 0.8905476457013995
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:28.
  Batch    30  of  1,230.    Elapsed: 0:06:42.
  Batch    40  of  1,230.    Elapsed: 0:08:58.
  Batch    50  of  1,230.    Elapsed: 0:11:13.
  Batch    60  of  1,230.    Elapsed: 0:13:28.
  Batch    70  of  1,230.    Elapsed: 0:15:43.
  Batch    80  of  1,230.    Elapsed: 0:17:57.
  Batch    90  of  1,230.    Elapsed: 0:20:11.
  Batch   100  of  1,230.    Elapsed: 0:22:25.
  Batch   110  of  1,230.    Elapsed: 0:24:38.
  Batch   120  of  1,230.    Elapsed: 0:26:54.
  Batch   130  of  1,230.    Elapsed: 0:29:09.
  Batch   140  of  1,230.    Elapsed: 0:31:23.
  Batch   150  of  1,230.    Elapsed: 0:33:36.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:05.
  Batch   180  of  1,230.    Elapsed: 0:40:19.
  Batch   190  of  1,230.    Elapsed: 0:42:32.
  Batch   200  of  1,230.    Elapsed: 0:44:46.
  Batch   210  of  1,230.    El

Epoch:  70%|███████   | 7/10 [33:46:10<14:30:19, 17406.42s/it]

F1-Score: 0.8964830655648985
  Batch    10  of  1,230.    Elapsed: 0:02:13.
  Batch    20  of  1,230.    Elapsed: 0:04:26.
  Batch    30  of  1,230.    Elapsed: 0:06:41.
  Batch    40  of  1,230.    Elapsed: 0:08:57.
  Batch    50  of  1,230.    Elapsed: 0:11:11.
  Batch    60  of  1,230.    Elapsed: 0:13:25.
  Batch    70  of  1,230.    Elapsed: 0:15:39.
  Batch    80  of  1,230.    Elapsed: 0:17:52.
  Batch    90  of  1,230.    Elapsed: 0:20:07.
  Batch   100  of  1,230.    Elapsed: 0:22:21.
  Batch   110  of  1,230.    Elapsed: 0:24:36.
  Batch   120  of  1,230.    Elapsed: 0:26:51.
  Batch   130  of  1,230.    Elapsed: 0:29:05.
  Batch   140  of  1,230.    Elapsed: 0:31:18.
  Batch   150  of  1,230.    Elapsed: 0:33:33.
  Batch   160  of  1,230.    Elapsed: 0:35:47.
  Batch   170  of  1,230.    Elapsed: 0:38:01.
  Batch   180  of  1,230.    Elapsed: 0:40:15.
  Batch   190  of  1,230.    Elapsed: 0:42:29.
  Batch   200  of  1,230.    Elapsed: 0:44:44.
  Batch   210  of  1,230.    El

Epoch:  80%|████████  | 8/10 [38:37:05<9:40:41, 17420.93s/it] 

F1-Score: 0.8772330836818727
  Batch    10  of  1,230.    Elapsed: 0:02:16.
  Batch    20  of  1,230.    Elapsed: 0:04:30.
  Batch    30  of  1,230.    Elapsed: 0:06:44.
  Batch    40  of  1,230.    Elapsed: 0:08:58.
  Batch    50  of  1,230.    Elapsed: 0:11:13.
  Batch    60  of  1,230.    Elapsed: 0:13:27.
  Batch    70  of  1,230.    Elapsed: 0:15:42.
  Batch    80  of  1,230.    Elapsed: 0:17:56.
  Batch    90  of  1,230.    Elapsed: 0:20:10.
  Batch   100  of  1,230.    Elapsed: 0:22:24.
  Batch   110  of  1,230.    Elapsed: 0:24:40.
  Batch   120  of  1,230.    Elapsed: 0:26:55.
  Batch   130  of  1,230.    Elapsed: 0:29:10.
  Batch   140  of  1,230.    Elapsed: 0:31:24.
  Batch   150  of  1,230.    Elapsed: 0:33:40.
  Batch   160  of  1,230.    Elapsed: 0:35:55.
  Batch   170  of  1,230.    Elapsed: 0:38:11.
  Batch   180  of  1,230.    Elapsed: 0:40:26.
  Batch   190  of  1,230.    Elapsed: 0:42:40.
  Batch   200  of  1,230.    Elapsed: 0:44:55.
  Batch   210  of  1,230.    El

Epoch:  90%|█████████ | 9/10 [43:28:13<4:50:34, 17434.96s/it]

F1-Score: 0.8982955602092547
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:27.
  Batch    30  of  1,230.    Elapsed: 0:06:42.
  Batch    40  of  1,230.    Elapsed: 0:08:55.
  Batch    50  of  1,230.    Elapsed: 0:11:10.
  Batch    60  of  1,230.    Elapsed: 0:13:26.
  Batch    70  of  1,230.    Elapsed: 0:15:40.
  Batch    80  of  1,230.    Elapsed: 0:17:54.
  Batch    90  of  1,230.    Elapsed: 0:20:09.
  Batch   100  of  1,230.    Elapsed: 0:22:23.
  Batch   110  of  1,230.    Elapsed: 0:24:39.
  Batch   120  of  1,230.    Elapsed: 0:26:53.
  Batch   130  of  1,230.    Elapsed: 0:29:07.
  Batch   140  of  1,230.    Elapsed: 0:31:21.
  Batch   150  of  1,230.    Elapsed: 0:33:35.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:04.
  Batch   180  of  1,230.    Elapsed: 0:40:18.
  Batch   190  of  1,230.    Elapsed: 0:42:33.
  Batch   200  of  1,230.    Elapsed: 0:44:46.
  Batch   210  of  1,230.    El

Epoch: 100%|██████████| 10/10 [48:20:27<00:00, 17402.76s/it] 

F1-Score: 0.899932572383748





In [None]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)[0]
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)[0]
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
#     tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
#     eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_val[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_val[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
# print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

In [None]:
import pickle
pickle.dump(model, open('CamemBERT_POS', 'wb'))

# Embeddings de doc

## Doc2Vec

In [12]:
import numpy as np
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertConfig
from transformers import CamembertForTokenClassification
from tqdm import tqdm, trange
import tensorflow as tf

In [13]:
sentences, labels = [], []
with open("frwikinews-20130110-pages-articles.txt.tok.stanford-pos", "r", encoding="utf-8") as f:
    for line in f.readlines():
        sentence = []
        sent_tag = []
        tokens = line.replace("\n", "").split(" ")
        for token in tokens:
            splits = token.split("_")
            if len(splits) != 2: continue
            word, tag = splits
            sentence.append(word)
            sent_tag.append(tag)
        sentences.append(" ".join(sentence))
        labels.append(sent_tag)
    f.close()

In [107]:
tags_val = list(set().union(*labels))
tag2idx = {t:i for i,t in enumerate(tags_val)}
# tag2idx["<PAD>"] = len(tag2idx)

In [15]:
lens = np.array(list(map(len, sentences)))
lens.min(), lens.max(), lens.mean()

(21, 1733, 156.07599958838796)

In [73]:
MAX_LEN = 150
batch_size = 64

In [74]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
device

device(type='cpu')

In [75]:
tokenizer = CamembertTokenizer.from_pretrained('camembert-base/', do_lower_case=True)

In [90]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

['▁à', '▁la', '▁suite', '▁de', '▁la', '▁parution', '▁le', '▁matin', '▁même', '▁d', "'", '▁un', '▁article', '▁2', '▁=', '▁le', '▁concernant', '▁dans', '▁le', '▁quotidien', '▁libération', '▁', ',', '▁christ', 'oph', 'e', '▁h', 'onde', 'la', 'tte', '▁décide', '▁de', '▁ne', '▁pas', '▁présenter', '▁le', '▁journal', '▁de', '▁13', '▁h', '▁00', '▁de', '▁france', '▁2', '▁', '.']


In [91]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [108]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PONCT"], padding="post",
                     dtype="long", truncating="post")

In [109]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [118]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [119]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [120]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [121]:
model = CamembertForTokenClassification.from_pretrained("camembert-base/", num_labels=len(tag2idx))

Some weights of the model checkpoint at camembert-base/ were not used when initializing CamembertForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base/ and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream t

In [122]:
FULL_FINETUNING = True #True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)



In [123]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [124]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [125]:
epochs = 10
max_grad_norm = 1.0
total_t0 = time.time()

for _ in trange(epochs, desc="Epoch"):
    t0 = time.time()
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        
        if step % 10 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss, something_else = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)[0]
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_val[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_val[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

  Batch    10  of  1,230.    Elapsed: 0:02:31.
  Batch    20  of  1,230.    Elapsed: 0:04:46.
  Batch    30  of  1,230.    Elapsed: 0:07:00.
  Batch    50  of  1,230.    Elapsed: 0:11:32.
  Batch    60  of  1,230.    Elapsed: 0:13:45.
  Batch    70  of  1,230.    Elapsed: 0:15:57.
  Batch    80  of  1,230.    Elapsed: 0:18:15.
  Batch    90  of  1,230.    Elapsed: 0:20:32.
  Batch   100  of  1,230.    Elapsed: 0:22:43.
  Batch   110  of  1,230.    Elapsed: 0:24:55.
  Batch   120  of  1,230.    Elapsed: 0:27:06.
  Batch   130  of  1,230.    Elapsed: 0:29:17.
  Batch   140  of  1,230.    Elapsed: 0:31:28.
  Batch   150  of  1,230.    Elapsed: 0:33:39.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:02.
  Batch   180  of  1,230.    Elapsed: 0:40:14.
  Batch   190  of  1,230.    Elapsed: 0:42:25.
  Batch   200  of  1,230.    Elapsed: 0:44:36.
  Batch   210  of  1,230.    Elapsed: 0:46:47.
  Batch   220  of  1,230.    Elapsed: 0:48:59.
  Batch   230

Epoch:  10%|█         | 1/10 [4:46:16<42:56:31, 17176.87s/it]

F1-Score: 0.6678528399311532
  Batch    10  of  1,230.    Elapsed: 0:02:13.
  Batch    20  of  1,230.    Elapsed: 0:04:26.
  Batch    30  of  1,230.    Elapsed: 0:06:39.
  Batch    40  of  1,230.    Elapsed: 0:08:51.
  Batch    50  of  1,230.    Elapsed: 0:11:03.
  Batch    60  of  1,230.    Elapsed: 0:13:16.
  Batch    70  of  1,230.    Elapsed: 0:15:26.
  Batch    80  of  1,230.    Elapsed: 0:17:39.
  Batch    90  of  1,230.    Elapsed: 0:19:50.
  Batch   100  of  1,230.    Elapsed: 0:22:02.
  Batch   110  of  1,230.    Elapsed: 0:24:14.
  Batch   120  of  1,230.    Elapsed: 0:26:26.
  Batch   130  of  1,230.    Elapsed: 0:28:38.
  Batch   140  of  1,230.    Elapsed: 0:30:50.
  Batch   150  of  1,230.    Elapsed: 0:33:01.
  Batch   160  of  1,230.    Elapsed: 0:35:13.
  Batch   170  of  1,230.    Elapsed: 0:37:25.
  Batch   180  of  1,230.    Elapsed: 0:39:37.
  Batch   190  of  1,230.    Elapsed: 0:41:50.
  Batch   200  of  1,230.    Elapsed: 0:44:02.
  Batch   210  of  1,230.    El

Epoch:  20%|██        | 2/10 [9:32:01<38:08:58, 17167.32s/it]

F1-Score: 0.6456441531748572
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:27.
  Batch    30  of  1,230.    Elapsed: 0:06:40.
  Batch    40  of  1,230.    Elapsed: 0:08:54.
  Batch    50  of  1,230.    Elapsed: 0:11:07.
  Batch    60  of  1,230.    Elapsed: 0:13:22.
  Batch    70  of  1,230.    Elapsed: 0:15:37.
  Batch    80  of  1,230.    Elapsed: 0:17:51.
  Batch    90  of  1,230.    Elapsed: 0:20:06.
  Batch   100  of  1,230.    Elapsed: 0:22:21.
  Batch   110  of  1,230.    Elapsed: 0:24:37.
  Batch   120  of  1,230.    Elapsed: 0:26:50.
  Batch   130  of  1,230.    Elapsed: 0:29:03.
  Batch   140  of  1,230.    Elapsed: 0:31:15.
  Batch   150  of  1,230.    Elapsed: 0:33:28.
  Batch   160  of  1,230.    Elapsed: 0:35:41.
  Batch   170  of  1,230.    Elapsed: 0:37:53.
  Batch   180  of  1,230.    Elapsed: 0:40:07.
  Batch   190  of  1,230.    Elapsed: 0:42:21.
  Batch   200  of  1,230.    Elapsed: 0:44:35.
  Batch   210  of  1,230.    El

Epoch:  30%|███       | 3/10 [14:22:30<33:32:00, 17245.81s/it]

F1-Score: 0.8512631076728391
  Batch    10  of  1,230.    Elapsed: 0:02:13.
  Batch    20  of  1,230.    Elapsed: 0:04:25.
  Batch    30  of  1,230.    Elapsed: 0:06:39.
  Batch    40  of  1,230.    Elapsed: 0:08:53.
  Batch    50  of  1,230.    Elapsed: 0:11:06.
  Batch    60  of  1,230.    Elapsed: 0:13:20.
  Batch    70  of  1,230.    Elapsed: 0:15:36.
  Batch    80  of  1,230.    Elapsed: 0:17:50.
  Batch    90  of  1,230.    Elapsed: 0:20:04.
  Batch   100  of  1,230.    Elapsed: 0:22:18.
  Batch   110  of  1,230.    Elapsed: 0:24:32.
  Batch   120  of  1,230.    Elapsed: 0:26:46.
  Batch   130  of  1,230.    Elapsed: 0:29:02.
  Batch   140  of  1,230.    Elapsed: 0:31:15.
  Batch   150  of  1,230.    Elapsed: 0:33:31.
  Batch   160  of  1,230.    Elapsed: 0:35:45.
  Batch   170  of  1,230.    Elapsed: 0:37:58.
  Batch   180  of  1,230.    Elapsed: 0:40:12.
  Batch   190  of  1,230.    Elapsed: 0:42:26.
  Batch   200  of  1,230.    Elapsed: 0:44:39.
  Batch   210  of  1,230.    El

Epoch:  40%|████      | 4/10 [19:13:06<28:50:16, 17302.71s/it]

F1-Score: 0.8555478903830289
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:29.
  Batch    30  of  1,230.    Elapsed: 0:06:44.
  Batch    40  of  1,230.    Elapsed: 0:08:58.
  Batch    50  of  1,230.    Elapsed: 0:11:13.
  Batch    60  of  1,230.    Elapsed: 0:13:25.
  Batch    70  of  1,230.    Elapsed: 0:15:39.
  Batch    80  of  1,230.    Elapsed: 0:17:55.
  Batch    90  of  1,230.    Elapsed: 0:20:09.
  Batch   100  of  1,230.    Elapsed: 0:22:24.
  Batch   110  of  1,230.    Elapsed: 0:24:39.
  Batch   120  of  1,230.    Elapsed: 0:26:55.
  Batch   130  of  1,230.    Elapsed: 0:29:09.
  Batch   140  of  1,230.    Elapsed: 0:31:23.
  Batch   150  of  1,230.    Elapsed: 0:33:36.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:05.
  Batch   180  of  1,230.    Elapsed: 0:40:20.
  Batch   190  of  1,230.    Elapsed: 0:42:34.
  Batch   200  of  1,230.    Elapsed: 0:44:48.
  Batch   210  of  1,230.    El

Epoch:  50%|█████     | 5/10 [24:04:10<24:05:56, 17351.27s/it]

F1-Score: 0.8583599313546852
  Batch    10  of  1,230.    Elapsed: 0:02:15.
  Batch    20  of  1,230.    Elapsed: 0:04:28.
  Batch    30  of  1,230.    Elapsed: 0:06:41.
  Batch    40  of  1,230.    Elapsed: 0:08:54.
  Batch    50  of  1,230.    Elapsed: 0:11:08.
  Batch    60  of  1,230.    Elapsed: 0:13:21.
  Batch    70  of  1,230.    Elapsed: 0:15:35.
  Batch    80  of  1,230.    Elapsed: 0:17:51.
  Batch    90  of  1,230.    Elapsed: 0:20:10.
  Batch   100  of  1,230.    Elapsed: 0:22:26.
  Batch   110  of  1,230.    Elapsed: 0:24:42.
  Batch   120  of  1,230.    Elapsed: 0:26:58.
  Batch   130  of  1,230.    Elapsed: 0:29:13.
  Batch   140  of  1,230.    Elapsed: 0:31:28.
  Batch   150  of  1,230.    Elapsed: 0:33:45.
  Batch   160  of  1,230.    Elapsed: 0:36:00.
  Batch   170  of  1,230.    Elapsed: 0:38:16.
  Batch   180  of  1,230.    Elapsed: 0:40:31.
  Batch   190  of  1,230.    Elapsed: 0:42:46.
  Batch   200  of  1,230.    Elapsed: 0:45:02.
  Batch   210  of  1,230.    El

Epoch:  60%|██████    | 6/10 [28:55:13<19:18:59, 17384.76s/it]

F1-Score: 0.8905476457013995
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:28.
  Batch    30  of  1,230.    Elapsed: 0:06:42.
  Batch    40  of  1,230.    Elapsed: 0:08:58.
  Batch    50  of  1,230.    Elapsed: 0:11:13.
  Batch    60  of  1,230.    Elapsed: 0:13:28.
  Batch    70  of  1,230.    Elapsed: 0:15:43.
  Batch    80  of  1,230.    Elapsed: 0:17:57.
  Batch    90  of  1,230.    Elapsed: 0:20:11.
  Batch   100  of  1,230.    Elapsed: 0:22:25.
  Batch   110  of  1,230.    Elapsed: 0:24:38.
  Batch   120  of  1,230.    Elapsed: 0:26:54.
  Batch   130  of  1,230.    Elapsed: 0:29:09.
  Batch   140  of  1,230.    Elapsed: 0:31:23.
  Batch   150  of  1,230.    Elapsed: 0:33:36.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:05.
  Batch   180  of  1,230.    Elapsed: 0:40:19.
  Batch   190  of  1,230.    Elapsed: 0:42:32.
  Batch   200  of  1,230.    Elapsed: 0:44:46.
  Batch   210  of  1,230.    El

Epoch:  70%|███████   | 7/10 [33:46:10<14:30:19, 17406.42s/it]

F1-Score: 0.8964830655648985
  Batch    10  of  1,230.    Elapsed: 0:02:13.
  Batch    20  of  1,230.    Elapsed: 0:04:26.
  Batch    30  of  1,230.    Elapsed: 0:06:41.
  Batch    40  of  1,230.    Elapsed: 0:08:57.
  Batch    50  of  1,230.    Elapsed: 0:11:11.
  Batch    60  of  1,230.    Elapsed: 0:13:25.
  Batch    70  of  1,230.    Elapsed: 0:15:39.
  Batch    80  of  1,230.    Elapsed: 0:17:52.
  Batch    90  of  1,230.    Elapsed: 0:20:07.
  Batch   100  of  1,230.    Elapsed: 0:22:21.
  Batch   110  of  1,230.    Elapsed: 0:24:36.
  Batch   120  of  1,230.    Elapsed: 0:26:51.
  Batch   130  of  1,230.    Elapsed: 0:29:05.
  Batch   140  of  1,230.    Elapsed: 0:31:18.
  Batch   150  of  1,230.    Elapsed: 0:33:33.
  Batch   160  of  1,230.    Elapsed: 0:35:47.
  Batch   170  of  1,230.    Elapsed: 0:38:01.
  Batch   180  of  1,230.    Elapsed: 0:40:15.
  Batch   190  of  1,230.    Elapsed: 0:42:29.
  Batch   200  of  1,230.    Elapsed: 0:44:44.
  Batch   210  of  1,230.    El

Epoch:  80%|████████  | 8/10 [38:37:05<9:40:41, 17420.93s/it] 

F1-Score: 0.8772330836818727
  Batch    10  of  1,230.    Elapsed: 0:02:16.
  Batch    20  of  1,230.    Elapsed: 0:04:30.
  Batch    30  of  1,230.    Elapsed: 0:06:44.
  Batch    40  of  1,230.    Elapsed: 0:08:58.
  Batch    50  of  1,230.    Elapsed: 0:11:13.
  Batch    60  of  1,230.    Elapsed: 0:13:27.
  Batch    70  of  1,230.    Elapsed: 0:15:42.
  Batch    80  of  1,230.    Elapsed: 0:17:56.
  Batch    90  of  1,230.    Elapsed: 0:20:10.
  Batch   100  of  1,230.    Elapsed: 0:22:24.
  Batch   110  of  1,230.    Elapsed: 0:24:40.
  Batch   120  of  1,230.    Elapsed: 0:26:55.
  Batch   130  of  1,230.    Elapsed: 0:29:10.
  Batch   140  of  1,230.    Elapsed: 0:31:24.
  Batch   150  of  1,230.    Elapsed: 0:33:40.
  Batch   160  of  1,230.    Elapsed: 0:35:55.
  Batch   170  of  1,230.    Elapsed: 0:38:11.
  Batch   180  of  1,230.    Elapsed: 0:40:26.
  Batch   190  of  1,230.    Elapsed: 0:42:40.
  Batch   200  of  1,230.    Elapsed: 0:44:55.
  Batch   210  of  1,230.    El

Epoch:  90%|█████████ | 9/10 [43:28:13<4:50:34, 17434.96s/it]

F1-Score: 0.8982955602092547
  Batch    10  of  1,230.    Elapsed: 0:02:14.
  Batch    20  of  1,230.    Elapsed: 0:04:27.
  Batch    30  of  1,230.    Elapsed: 0:06:42.
  Batch    40  of  1,230.    Elapsed: 0:08:55.
  Batch    50  of  1,230.    Elapsed: 0:11:10.
  Batch    60  of  1,230.    Elapsed: 0:13:26.
  Batch    70  of  1,230.    Elapsed: 0:15:40.
  Batch    80  of  1,230.    Elapsed: 0:17:54.
  Batch    90  of  1,230.    Elapsed: 0:20:09.
  Batch   100  of  1,230.    Elapsed: 0:22:23.
  Batch   110  of  1,230.    Elapsed: 0:24:39.
  Batch   120  of  1,230.    Elapsed: 0:26:53.
  Batch   130  of  1,230.    Elapsed: 0:29:07.
  Batch   140  of  1,230.    Elapsed: 0:31:21.
  Batch   150  of  1,230.    Elapsed: 0:33:35.
  Batch   160  of  1,230.    Elapsed: 0:35:50.
  Batch   170  of  1,230.    Elapsed: 0:38:04.
  Batch   180  of  1,230.    Elapsed: 0:40:18.
  Batch   190  of  1,230.    Elapsed: 0:42:33.
  Batch   200  of  1,230.    Elapsed: 0:44:46.
  Batch   210  of  1,230.    El

Epoch: 100%|██████████| 10/10 [48:20:27<00:00, 17402.76s/it] 

F1-Score: 0.899932572383748





In [None]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)[0]
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)[0]
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
#     tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
#     eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags_val[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_val[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
# print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

In [None]:
import pickle
pickle.dump(model, open('CamemBERT_POS', 'wb'))

## LDA Model

Introduces Gensim's LDA model and demonstrates its use on the NIPS corpus.

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import io
import os.path
import re
import tarfile
import nltk

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    fname = url.split('/')[-1]
    
    # Download the file to local storage first.
    # We can't read it on the fly because of 
    # https://github.com/RaRe-Technologies/smart_open/issues/331
    if not os.path.isfile(fname):
        with smart_open.open(url, "rb") as fin:
            with smart_open.open(fname, 'wb') as fout:
                while True:
                    buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                    if not buf:
                        break
                    fout.write(buf)
                         
    with tarfile.open(fname, mode='r:gz') as tar:
        # Ignore directory entries, as well as files like README, etc.
        files = [
            m for m in tar.getmembers()
            if m.isfile() and re.search(r'nipstxt/nips\d+/\d+\.txt', m.name)
        ]
        for member in sorted(files, key=lambda x: x.name):
            member_bytes = tar.extractfile(member).read()
            yield member_bytes.decode('utf-8', errors='replace')

docs = list(extract_documents())

So we have a list of 1740 documents, where each document is a Unicode string. 
If you're thinking about using your own corpus, then you need to make sure
that it's in the same format (list of Unicode strings) before proceeding
with the rest of this tutorial.




In [None]:
print(len(docs))
print(docs[0][:500])

1740
1 
CONNECTIVITY VERSUS ENTROPY 
Yaser S. Abu-Mostafa 
California Institute of Technology 
Pasadena, CA 91125 
ABSTRACT 
How does the connectivity of a neural network (number of synapses per 
neuron) relate to the complexity of the problems it can handle (measured by 
the entropy)? Switching theory would suggest no relation at all, since all Boolean 
functions can be implemented using a circuit with very low connectivity (e.g., 
using two-input NAND gates). However, for a network that learns a pr


In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a
stemmer in this case because it produces more readable words. Output that is
easy to read is very desirable in topic modelling.




In [None]:
# Lemmatize the documents.
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


We find bigrams in the documents. Bigrams are sets of two adjacent words.
Using bigrams we can get phrases like "machine_learning" in our output
(spaces are replaced with underscores); without bigrams we would only get
"machine" and "learning".

Note that in the code below, we find bigrams and then add them to the
original data, because we would like to keep the words "machine" and
"learning" as well as the bigram "machine_learning".

.. Important::
    Computing n-grams of large dataset can be very computationally
    and memory intensive.




In [None]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

2020-09-23 11:26:47,213 : INFO : 'pattern' package not found; tag filters are not available for English
2020-09-23 11:26:47,230 : INFO : collecting all words and their counts
2020-09-23 11:26:47,232 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-09-23 11:27:28,531 : INFO : collected 8598 word types from a corpus of 33052479 words (unigram + bigrams) and 1740 sentences
2020-09-23 11:27:28,532 : INFO : using 8598 counts as vocab in Phrases<0 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>


We remove rare words and common words based on their *document frequency*.
Below we remove words that appear in less than 20 documents or in more than
50% of the documents. Consider trying to remove words only based on their
frequency, or maybe combining that with this approach.




In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

2020-09-23 11:29:23,774 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-23 11:29:28,855 : INFO : built Dictionary(98 unique tokens: ['\n', ' ', '"', '&', "'"]...) from 1740 documents (total 33061618 corpus positions)
2020-09-23 11:29:28,856 : INFO : discarding 89 tokens: [('\n', 1740), (' ', 1740), ('"', 1589), ('&', 1012), ("'", 1727), ('(', 1740), (')', 1740), ('+', 1506), (',', 1740), ('-', 1739)]...
2020-09-23 11:29:28,857 : INFO : keeping 9 tokens which were in no less than 20 and no more than 870 (=50.0%) documents
2020-09-23 11:29:28,857 : INFO : resulting dictionary: Dictionary(9 unique tokens: ['@', '^', '!', '$', '\\']...)


Finally, we transform the documents to a vectorized form. We simply compute
the frequency of each word, including the bigrams.




In [None]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

Let's see how many tokens and documents we have to train on.




In [None]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 9
Number of documents: 1740


In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

2020-09-23 11:29:33,872 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2020-09-23 11:29:33,875 : INFO : using serial LDA version on this node
2020-09-23 11:29:33,876 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2020-09-23 11:29:33,877 : INFO : PROGRESS: pass 0, at document #1740/1740
2020-09-23 11:29:34,461 : INFO : optimized alpha [0.07986111, 0.086652175, 0.083130755, 0.09890915, 0.124905825, 0.083698735, 0.08218231, 0.08168751, 0.07862009, 0.078591436]
2020-09-23 11:29:34,462 : INFO : topic #9 (0.079): 0.609*"^" + 0.089*"@" + 0.073*"Q" + 0.073*"#" + 0.049*"$" + 0.041*"!" + 0.033*"\" + 0.017*"~" + 0.017*"`"
2020-09-23 11:29:34,466 : INFO : topic #8 (0.079): 0.625*"#" + 0.192*"!" + 0.086*"Q" + 0.036*"@" + 0.029*"$" + 0.029*"^" + 

In [None]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2020-09-23 11:29:40,294 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -2.1677.
[([(0.6268966, '~'),
   (0.27164382, '`'),
   (0.09518634, '$'),
   (0.0018629064, '@'),
   (0.0016628734, '!'),
   (0.0009513289, 'Q'),
   (0.0009442948, '#'),
   (0.0006141935, '\\'),
   (0.0002376064, '^')],
  -1.6473896068873706),
 ([(0.9857895, '!'),
   (0.010243523, '$'),
   (0.0015068037, '\\'),
   (0.0013442617, '`'),
   (0.00069610035, 'Q'),
   (0.00014837748, '@'),
   (0.00013696036, '#'),
   (7.871825e-05, '^'),
   (5.576831e-05, '~')],
  -1.944851946168662),
 ([(0.98330337, '^'),
   (0.007277961, 'Q'),
   (0.0028545922, '!'),
   (0.002443288, '$'),
   (0.0020506608, '@'),
   (0.00094967434, '#'),
   (0.0005177544, '\\'),
   (0.0003117447, '~'),
   (0.0002909759, '`')],
  -2.1400889916602437),
 ([(0.91968215, '\\'),
   (0.032944962, '!'),
   (0.031398878, '@'),
   (0.01228923, '$'),
   (0.0023316042, 'Q'),
   (0.0005094729, '#'),
   (0.0003296285, '^'),
   (0.00028613838, '~'),
   (0.00022803883, '`')],
  -2.1725458691945105),
 ([(0.9731228,

## Word Movers' Distance

Demonstrates using Gensim's implemenation of the WMD.

In [None]:
# Initialize logging.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentence_obama = 'Obama speaks to the media in Illinois'
sentence_president = 'The president greets the press in Chicago'

These sentences have very similar content, and as such the WMD should be low.
Before we compute the WMD, we want to remove stopwords ("the", "to", etc.),
as these do not contribute a lot to the information in the sentences.




In [None]:
# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Download stopwords list.
stop_words = stopwords.words('english')

def preprocess(sentence):
    return [w for w in sentence.lower().split() if w not in stop_words]

sentence_obama = preprocess(sentence_obama)
sentence_president = preprocess(sentence_president)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Now, as mentioned earlier, we will be using some downloaded pre-trained
embeddings. We load these into a Gensim Word2Vec model class.

.. Important::
  The embeddings we have chosen here require a lot of memory.




In [None]:
import gensim.downloader as api
model = api.load('word2vec-google-news-300')

2020-09-23 11:22:21,731 : INFO : 'pattern' package not found; tag filters are not available for English
2020-09-23 11:22:21,823 : INFO : Creating /root/gensim-data




So let's compute WMD using the ``wmdistance`` method.




In [None]:
distance = model.wmdistance(sentence_obama, sentence_president)
print('distance = %.4f' % distance)

2020-09-23 11:28:25,225 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-23 11:28:25,226 : INFO : built Dictionary(8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...) from 2 documents (total 8 corpus positions)


distance = 3.3741


Let's try the same thing with two completely unrelated sentences. Notice that the distance is larger.




In [None]:
sentence_orange = preprocess('Oranges are my favorite fruit')
distance = model.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)

2020-09-23 11:28:25,236 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-23 11:28:25,237 : INFO : built Dictionary(7 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'favorite']...) from 2 documents (total 7 corpus positions)


distance = 4.3802


In [None]:
model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.

distance = model.wmdistance(sentence_obama, sentence_president)  # Compute WMD as normal.
print('distance: %r' % distance)

distance = model.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)

2020-09-23 11:28:25,247 : INFO : precomputing L2-norms of word weight vectors
2020-09-23 11:28:49,398 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-23 11:28:49,400 : INFO : built Dictionary(8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...) from 2 documents (total 8 corpus positions)
2020-09-23 11:28:49,402 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-23 11:28:49,403 : INFO : built Dictionary(7 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'favorite']...) from 2 documents (total 7 corpus positions)


distance: 1.0174646259300113
distance = 1.3663


In [None]:

Word Movers' Distance
=====================

Demonstrates using Gensim's implemenation of the WMD.




# Initialize logging.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentence_obama = 'Obama speaks to the media in Illinois'
sentence_president = 'The president greets the press in Chicago'

These sentences have very similar content, and as such the WMD should be low.
Before we compute the WMD, we want to remove stopwords ("the", "to", etc.),
as these do not contribute a lot to the information in the sentences.




# Import and download stopwords from NLTK.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')  # Download stopwords list.
stop_words = stopwords.words('english')

def preprocess(sentence):
    return [w for w in sentence.lower().split() if w not in stop_words]

sentence_obama = preprocess(sentence_obama)
sentence_president = preprocess(sentence_president)

Now, as mentioned earlier, we will be using some downloaded pre-trained
embeddings. We load these into a Gensim Word2Vec model class.

.. Important::
  The embeddings we have chosen here require a lot of memory.




import gensim.downloader as api
model = api.load('word2vec-google-news-300')

So let's compute WMD using the ``wmdistance`` method.




distance = model.wmdistance(sentence_obama, sentence_president)
print('distance = %.4f' % distance)

Let's try the same thing with two completely unrelated sentences. Notice that the distance is larger.




sentence_orange = preprocess('Oranges are my favorite fruit')
distance = model.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)

model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.

distance = model.wmdistance(sentence_obama, sentence_president)  # Compute WMD as normal.
print('distance: %r' % distance)

distance = model.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)

## DistillBERT sentiment analysis

In [None]:
!pip install transformers



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import urllib.request
import os
import zipfile
import warnings
warnings.filterwarnings('ignore')

In [None]:
print(torch.__version__)

1.5.1+cu101


In [None]:
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [None]:
batch_1 = df

In [None]:
pd.set_option('display.max_colwidth',None)

In [None]:
batch_1.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re imagining of beauty and the beast and 1930s horror films",1
1,apparently reassembled from the cutting room floor of any given daytime soap,0
2,"they presume their audience wo n't sit still for a sociology lesson , however entertainingly presented , so they trot out the conventional science fiction elements of bug eyed monsters and futuristic women in skimpy clothes",0
3,"this is a visually stunning rumination on love , memory , history and the war between art and commerce",1
4,jonathan parker 's bartleby should have been the be all end all of the modern office anomie films,1


In [None]:
batch_1[1].value_counts()

1    3610
0    3310
Name: 1, dtype: int64

In [None]:
# DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Bert
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# chargement modèle et tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Tokenization de nos phrases pour distillBERT
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
# Padding afin que chaque phrase fasse la même taille

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [None]:
np.array(padded).shape

(6920, 67)

In [None]:
# On cache ce padding
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(6920, 67)

In [None]:
# On applique le modèle sur nos token avec le masque
model.eval()
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
np.shape(last_hidden_states[0])

torch.Size([6920, 67, 768])

In [None]:
# Chaque token obtient un vecteur, ici, seul le token special 'cls' en position 1 nous intéresse
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
labels = batch_1[1]

### Suite

#### Entrainnement d'une reg log

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=11)

In [None]:
lr_clf = LogisticRegression(C=1)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
lr_clf.score(test_features, test_labels)

0.8601156069364162

# Comparaison des modèles

In [11]:
# ...