# TF-IDF

In [None]:
!pip install spacy
!pip install numpy
!pip install pandas
!pip install spacy
!pip install sklearn

In [None]:
import spacy
import numpy as np
import pandas as pd
from spacy.lang.fr import French
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
texteNLP = """Le traitement automatique du langage naturel (abr. TALN), ou traitement automatique de la langue naturelle, 
 ou encore traitement automatique des langues (abr. TAL) est un domaine multidisciplinaire impliquant la linguistique,
 l'informatique et l'intelligence artificielle, qui vise à créer des outils de traitement de la langue naturelle pour diverses applications.
 Il ne doit pas être confondu avec la linguistique informatique, qui vise à comprendre les langues au moyen d'outils informatiques.
 Le TALN est sorti des laboratoires de recherche pour être progressivement mis en œuvre dans des applications informatiques nécessitant
 l'intégration du langage humain à la machine. Aussi le TALN est-il parfois appelé ingénierie linguistique.
 En France, le traitement automatique de la langue naturelle a sa revue, Traitement automatique des langues,
 publiée par l’Association pour le traitement automatique des langues (ATALA)."""

texteDataScience = """En termes généraux, la science des données est l'extraction de connaissance d'ensembles de données.
 Elle emploie des techniques et des théories tirées de plusieurs autres domaines plus larges des mathématiques, analyse,
 optimisation et statistique principalement, la théorie de l'information et la technologie de l'information, notamment le traitement de signal,
 des modèles probabilistes, l'apprentissage automatique, l'apprentissage statistique, la programmation informatique, l'ingénierie de données,
 la reconnaissance de formes et l'apprentissage, la visualisation, l'analytique prophétique, la modélisation d'incertitude, le stockage de données,
 la géo-visualisation, la compression de données et le calcul à haute performance.
 Les méthodes qui s'adaptent aux données de masse sont particulièrement intéressantes dans la science des données,
 bien que la discipline ne soit généralement pas considérée comme limitée à ces données.

La science des données (en anglais data science) est une discipline qui s'appuie sur des outils mathématiques, de statistiques,
 d'informatique (cette science est principalement une « science des données numériques ») et de visualisation des données.
 Elle est en plein développement, dans le monde universitaire ainsi que dans le secteur privé et le secteur public.
 Moore en 1991 a défini la statistique comme la science des données6 (définition reprise par d'autres dont James T. McClave et al. en 1997)
 et U. Beck en 2001 oppose la science des données à la science de l'expérience, voyant une dissociation croissante entre ces deux types de science,
 que tendrait selon lui à encourager une société de la gestion du risque au sein d'une « civilisation du danger »."""

texteEconometrie = """L'économétrie est une branche de la science économique qui a pour objectif d'estimer et de tester les modèles économiques.
 L'économétrie en tant que discipline naît dans les années 1930 avec la création de la société d'économétrie par Irving Fisher et Ragnar Frisch (1930)
 et la création de la revue Econometrica (1933). Depuis lors, l'économétrie n'a cessé de se développer et de prendre une importance croissante
 au sein de la science économique.

L'économétrie théorique se focalise essentiellement sur deux questions, l'identification et l'estimation statistique. L'économétrie appliquée
 utilise les méthodes économétriques pour comprendre des domaines de l'économie comme l'analyse du marché du travail,
 l'économie de l'éducation ou encore tester la pertinence empirique des modèles de croissance.

L'économétrie appliquée utilise aussi bien des données issues d'un protocole expérimental, que ce soit une expérience de laboratoire
 ou une expérience de terrain, que des données issues directement de l'observation du réel sans manipulation du chercheur.
 Lorsque l'économètre utilise des données issues directement de l'observation du réel, il est fréquent d'identifier des expériences naturelles
 pour retrouver une situation quasi-expérimentale. On parle parfois de révolution de crédibilité, terme controversé, pour désigner l'essor fulgurant
 de ces méthodes de recherche dans la discipline, et en économie en général."""

texteHistoire = """L’histoire, souvent écrit avec la première lettre majuscule,
 est à la fois l’étude et l'écriture des faits et des événements passés quelles que soient leur variété et leur complexité.
 L'histoire est également une science humaine et sociale. On désigne aussi couramment sous le terme d’histoire (par synecdoque) le passé lui-même,
 comme dans les leçons de l'histoire. L'histoire est un récit écrit par lequel des hommes et des femmes (les historiens et historiennes)
 s'efforcent de faire connaître les temps révolus. Ces tentatives ne sont jamais entièrement indépendantes de conditionnements étrangers au domaine
 telle que la vision du monde de leur auteur ou de sa culture, mais elles sont censées être élaborées à partir de sources plutôt que guidées
 par la spéculation ou l'idéologie.

Au cours des siècles, les historiens ont façonné leurs méthodes ainsi que les champs d'intervention, tout en réévaluant leurs sources,
 leur origine et leur exploitation. La discipline universitaire d'étude et écriture de l'histoire, y comprise la critique des méthodes,
 est l'historiographie. Elle s'appuie sur diverses sciences auxiliaires complétant selon les travaux menés la compétence générale de l'historien.
 Elle reste malgré tout une construction humaine, inévitablement inscrite dans son époque, susceptible d'être utilisée en dehors de son domaine,
 notamment à des fins d'ordre politique. 
"""


In [None]:
# Build the French pipeline and run each raw text through it.
nlp = French()

documentNLP = nlp(texteNLP)
documentDataScience = nlp(texteDataScience)
documentEconometrie = nlp(texteEconometrie)
documentHistoire = nlp(texteHistoire)

# One bag of words per document: the surface text of every token, in document order.
bagOfWordsNLP = [token.text for token in documentNLP]
bagOfWordsDataScience = [token.text for token in documentDataScience]
bagOfWordsEconometrie = [token.text for token in documentEconometrie]
bagOfWordsHistoire = [token.text for token in documentHistoire]

In [None]:
# Shared vocabulary: every distinct token text seen in any of the four documents.
uniqueWords = set().union(bagOfWordsNLP, bagOfWordsDataScience, bagOfWordsEconometrie, bagOfWordsHistoire)

In [None]:
def _countWords(vocabulary, bagOfWords):
    """Return a {word: occurrences} dict holding one entry per word of `vocabulary`.

    Words of the vocabulary that never occur in `bagOfWords` keep a count of 0.
    """
    counts = {entry: 0 for entry in vocabulary}
    for occurrence in bagOfWords:
        counts[occurrence] += 1
    return counts


# Per-document raw counts over the shared vocabulary.
numOfWordsNLP = _countWords(uniqueWords, bagOfWordsNLP)
numOfWordsDataScience = _countWords(uniqueWords, bagOfWordsDataScience)
numOfWordsEconometrie = _countWords(uniqueWords, bagOfWordsEconometrie)
numOfWordsHistoire = _countWords(uniqueWords, bagOfWordsHistoire)

In [None]:
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word of a document.

    Args:
        wordDict: mapping word -> raw count of that word in the document.
        bagOfWords: list of all tokens of the document; only its length is used.

    Returns:
        dict mapping each key of `wordDict` to ``count / len(bagOfWords)``.
        For an empty document every frequency is 0.0 instead of raising
        ZeroDivisionError.
    """
    bagOfWordsCount = len(bagOfWords)
    if bagOfWordsCount == 0:
        # no tokens at all: every term frequency is zero by definition
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bagOfWordsCount) for word, count in wordDict.items()}

In [None]:
tfNLP = computeTF(numOfWordsNLP, bagOfWordsNLP)
tfDataScience = computeTF(numOfWordsDataScience, bagOfWordsDataScience)
tfEconometrie = computeTF(numOfWordsEconometrie, bagOfWordsEconometrie)
tfHistoire = computeTF(numOfWordsHistoire, bagOfWordsHistoire)

In [None]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word.

    Args:
        documents: list of dicts mapping word -> raw count, one dict per document.

    Returns:
        dict mapping each word found in any document dict to ``log(N / df)``,
        where N is the number of documents and df the number of documents in
        which the word has a count > 0. A word that occurs in no document
        (df == 0) gets 0.0 instead of raising ZeroDivisionError. An empty
        `documents` list yields an empty dict.
    """
    import math

    N = len(documents)

    # Document frequency per word. Keys are collected from every document, not only the
    # first one, so a word missing from documents[0] no longer raises KeyError.
    docFrequency = {}
    for document in documents:
        for word, val in document.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    return {
        word: math.log(N / float(df)) if df > 0 else 0.0
        for word, df in docFrequency.items()
    }

In [None]:
idfs = computeIDF([numOfWordsNLP, numOfWordsDataScience, numOfWordsEconometrie, numOfWordsHistoire])

In [None]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight each term frequency by the inverse document frequency of its word.

    Args:
        tfBagOfWords: mapping word -> term frequency for one document.
        idfs: mapping word -> inverse document frequency; must contain every
            word of `tfBagOfWords`.

    Returns:
        dict mapping each word of `tfBagOfWords` to ``tf * idf``.
    """
    return {term: frequency * idfs[term] for term, frequency in tfBagOfWords.items()}

In [None]:
tfidfNLP = computeTFIDF(tfNLP, idfs)
tfidfDataScience = computeTFIDF(tfDataScience, idfs)
tfidfEconometrie = computeTFIDF(tfEconometrie, idfs)
tfidfHistoire = computeTFIDF(tfHistoire, idfs)
df = pd.DataFrame([tfidfNLP, tfidfDataScience, tfidfEconometrie, tfidfHistoire])

In [None]:
print(df)

In [None]:
# Same TF-IDF computation, this time delegated to scikit-learn.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([texteNLP, texteDataScience, texteEconometrie, texteHistoire])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old name on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per document, one column per vocabulary term.
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [None]:
print(df)

In [None]:
df.loc[0]

In [None]:
# Pairwise distances between the four documents' rows of `df`.
# np.linalg.norm of a vector difference gives the Euclidean (L2) distance.
# Row order follows the fit_transform() input list: 0 = NLP, 1 = data science,
# 2 = econometrics, 3 = history.
dist_NLP_DataScience = np.linalg.norm(df.loc[0] - df.loc[1])
dist_NLP_Econometrie = np.linalg.norm(df.loc[0] - df.loc[2])
dist_NLP_Histoire = np.linalg.norm(df.loc[0] - df.loc[3])
dist_DataScience_Econometrie = np.linalg.norm(df.loc[1] - df.loc[2])
dist_DataScience_Histoire = np.linalg.norm(df.loc[1] - df.loc[3])
dist_Econometrie_Histoire = np.linalg.norm(df.loc[2] - df.loc[3])

In [None]:
print(dist_NLP_DataScience)
print(dist_NLP_Econometrie)
print(dist_NLP_Histoire)
print(dist_DataScience_Econometrie)
print(dist_DataScience_Histoire)
print(dist_Econometrie_Histoire)

# Embeddings de mots

## Word2Vec

In [None]:
!pip install gensim

In [None]:
import gensim 
from gensim.models import Word2Vec 

In [None]:
texte1 = """L’exploration de données, connue aussi sous l'expression de fouille de données, forage de données, prospection de données, data mining,
 ou encore extraction de connaissances à partir de données, a pour objet l’extraction d'un savoir ou d'une connaissance à partir de grandes quantités
 de données, par des méthodes automatiques ou semi-automatiques.
Elle se propose d'utiliser un ensemble d'algorithmes issus de disciplines scientifiques diverses telles que les statistiques,
 l'intelligence artificielle ou l'informatique, pour construire des modèles à partir des données,
 c'est-à-dire trouver des structures intéressantes ou des motifs selon des critères fixés au préalable,
 et d'en extraire un maximum de connaissances.
L'utilisation industrielle ou opérationnelle de ce savoir dans le monde professionnel permet de résoudre des problèmes très divers,
 allant de la gestion de la relation client à la maintenance préventive, en passant par la détection de fraudes ou encore l'optimisation de sites web.
C'est aussi le mode de travail du journalisme de données.
L'exploration de données fait suite, dans l'escalade de l'exploitation des données de l'entreprise, à l'informatique décisionnelle.
Celle-ci permet de constater un fait, tel que le chiffre d'affaires, et de l'expliquer comme le chiffre d'affaires décliné par produits,
 tandis que l'exploration de données permet de classer les faits et de les prévoir dans une certaine mesure ou encore de les éclairer en révélant
 par exemple les variables ou paramètres qui pourraient faire comprendre pourquoi le chiffre d'affaires de tel point de vente est supérieur
 à celui de tel autre. """

texte2 = """En statistique, les analyses multivariées ont pour caractéristique de s'intéresser à des lois de probabilité à plusieurs variables.
Les analyses bivariées sont des cas particuliers à deux variables.
Les analyses multivariées sont très diverses selon l'objectif recherché, la nature des variables et la mise en œuvre formelle.
On peut identifier deux grandes familles : celle des méthodes descriptives (visant à structurer et résumer l'information)
 et celle des méthodes explicatives visant à expliquer une ou des variables dites « dépendantes » (variables à expliquer) par un ensemble de variables
 dites « indépendantes » (variables explicatives).
Les méthodes appelées en français analyse des données en sont un sous-ensemble. """

In [None]:
texte = nlp(texte1 + texte2)

In [None]:
# Build the Word2Vec training corpus: a list of sentences, each a list of lower-cased tokens.
# `texte` is one tokenized document, so iterating over it yields tokens, not sentences.
# Sentences are therefore cut at sentence-final punctuation; the previous nested loop
# appended the whole document once per token.
SENTENCE_END = {".", "!", "?"}

data = []
temp = []
for token in texte:
    # skip pure-whitespace tokens (the line breaks inside the raw strings)
    if token.is_space:
        continue
    temp.append(token.text.lower())
    if token.text in SENTENCE_END:
        data.append(temp)
        temp = []

# keep a trailing sentence that has no closing punctuation
if temp:
    data.append(temp)

In [None]:
print(data)

### Continuous bag of words (CBOW)

Le modèle CBOW prédit le mot courant étant donné les mots de contexte dans une fenêtre autour du mot courant.

In [None]:
# Create CBOW model (sg = 0).
# gensim 4 renamed the embedding-dimension argument from `size` to `vector_size`.
model1 = gensim.models.Word2Vec(data, min_count = 1,
                              vector_size = 100, window = 5,
                              sg = 0)

# Print results
print("Cosine similarity between 'données' " + 
               "and 'connaissance' - CBOW : ", 
    model1.wv.similarity('données', 'connaissance'))

### Skip gram

La méthode "skip gram" fait le contraire de ce que fait la méthode "cbow" : elle prédit les mots de contexte d'un mot donné.

In [None]:
# Create Skip Gram model (sg = 1).
# gensim 4 renamed the embedding-dimension argument from `size` to `vector_size`.
model2 = gensim.models.Word2Vec(data, min_count = 1,
                                vector_size = 100, window = 5,
                                sg = 1)

# Print results
print("Cosine similarity between 'données' " +
          "and 'connaissance' - Skip Gram : ", 
    model2.wv.similarity('données', 'connaissance'))

In [None]:
word_vectors = model1.wv

In [None]:
print(word_vectors['données'])

In [None]:
import gensim.downloader as api

# api.info("glove-wiki-gigaword-100")
word_vectors = api.load("glove-wiki-gigaword-100")  # load pre-trained word-vectors from gensim-data

In [None]:
result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
print("{}: {:.4f}".format(*result[0]))

In [None]:
result = word_vectors.most_similar(positive=['female', 'lion'], negative=['male'])
print("{}: {:.4f}".format(*result[0]))

In [None]:
print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))

In [None]:
result = word_vectors.similar_by_word("cat")
print("{}: {:.4f}".format(*result[0]))

### FastText

In [None]:
from gensim.models import FastText

# from gensim.test.utils import common_texts
# print(common_texts[0])
# ['human', 'interface', 'computer']

# Skip-gram FastText model trained on the same corpus; words seen fewer than 5 times are ignored.
# gensim 4 renamed the embedding-dimension argument from `size` to `vector_size`.
model = FastText(data, vector_size=100, window=5, min_count=5, workers=4, sg=1)

In [None]:
model.wv.most_similar("données")

In [None]:
import fasttext.util

fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

In [None]:
ft.get_dimension()

In [None]:
# fasttext.util.reduce_model(ft, 100)
# ft.get_dimension()

In [None]:
ft.get_word_vector('hello').shape

In [None]:
ft.get_nearest_neighbors('hello')

## Embeddings keras

In [None]:
import keras
import spacy
from keras.preprocessing.text import one_hot

Création d'un corpus d'exemple

In [None]:
sample_text_1="bitty bought a bit of butter"
sample_text_2="but the bit of butter was a bit bitter"
sample_text_3="so she bought some better butter to make the bitter butter better"

corp=[sample_text_1,sample_text_2,sample_text_3]
no_docs=len(corp)

Encodage du corpus en one-hot à l'aide de la fonction keras.

In [None]:
vocab_size=50 
encod_corp=[]
for i,doc in enumerate(corp):
    # one_hot hashes each word into an integer id bounded by vocab_size. A vocabulary size
    # well above the number of distinct words makes collisions unlikely, but hashing does
    # not guarantee that every word gets a unique id.
    encoded_doc = one_hot(doc,vocab_size)
    encod_corp.append(encoded_doc)
    print("The encoding for document",i+1," is : ",encoded_doc)

Padding des documents : la couche d'embedding de keras nécessite des entrées de la même longueur.

In [None]:
#!python -m spacy download fr_core_news_md

In [None]:
# nlp = spacy.load('fr_core_news_md')

In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download('punkt')

In [None]:
# Token count of the longest document; the padding step below pads every document to it.
maxlen = max((len(nltk.word_tokenize(doc)) for doc in corp), default=-1)
print("The maximum number of words in any document is : ",maxlen)

In [None]:
# The embedding layer needs inputs of equal length, so pad every encoded document with
# trailing zeros up to maxlen.
# Imported here because this cell runs before the later cell that imports the same helper;
# without it a fresh kernel raises NameError.
from keras.preprocessing.sequence import pad_sequences

pad_corp=pad_sequences(encod_corp,maxlen=maxlen,padding='post',value=0.0)
print("No of padded documents: ",len(pad_corp))

In [None]:
for i,doc in enumerate(pad_corp):
     print("The padded encoding for document",i+1," is : ",doc)

## Transformers

In [None]:
import numpy as np
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertConfig
from transformers import CamembertForTokenClassification
from tqdm import tqdm, trange
import tensorflow as tf

In [None]:
# Read the POS-tagged corpus: one sentence per line, tokens separated by spaces,
# each token written as "word_TAG".
sentences, labels = [], []
with open("frwikinews-20130110-pages-articles.txt.tok.stanford-pos", "r", encoding="utf-8") as f:
    # Iterate over the file lazily instead of loading every line with readlines();
    # the `with` block closes the file, so no explicit close() is needed.
    for line in f:
        sentence = []
        sent_tag = []
        tokens = line.replace("\n", "").split(" ")
        for token in tokens:
            splits = token.split("_")
            # skip anything that is not exactly "word_TAG" (empty strings, words containing "_")
            if len(splits) != 2: continue
            word, tag = splits
            sentence.append(word)
            sent_tag.append(tag)
        # sentences are stored as space-joined strings, labels as per-word tag lists
        sentences.append(" ".join(sentence))
        labels.append(sent_tag)

In [None]:
# Distinct tags of the corpus, sorted so that the tag -> id mapping is identical on every run.
# Iterating a set of strings directly gives an order that can change between interpreter
# sessions, which would silently change the label ids.
tags_val = sorted(set().union(*labels))
tag2idx = {t:i for i,t in enumerate(tags_val)}
# tag2idx["<PAD>"] = len(tag2idx)

In [None]:
# Each entry of `sentences` is a space-joined string, so these lengths are in characters.
lens = np.array([len(sentence) for sentence in sentences])
lens.min(), lens.max(), lens.mean()

In [None]:
MAX_LEN = 150
batch_size = 64

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
device

In [None]:
tokenizer = CamembertTokenizer.from_pretrained('camembert-base/', do_lower_case=True)

In [None]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print(tokenized_texts[0])

In [None]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
# Map each sentence's tag strings to ids, then pad/truncate every sequence to MAX_LEN.
# NOTE(review): padded positions receive the id of the real "PONCT" tag, so they cannot be
# told apart from genuine punctuation labels — confirm this is intended.
# NOTE(review): `labels` holds one tag per space-separated word, while `input_ids` holds
# tokenizer pieces; if the tokenizer splits a word into several pieces the two sequences
# will not line up position by position — TODO confirm.
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PONCT"], padding="post",
                     dtype="long", truncating="post")

In [None]:
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [None]:
# Hold out 10% of the sentences for validation. Splitting the three aligned collections in
# one call with a fixed seed keeps inputs, tags and masks on the same rows.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.1)

In [None]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [None]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [None]:
# Token-classification head with one output per tag.
model = CamembertForTokenClassification.from_pretrained("camembert-base/", num_labels=len(tag2idx))
# The training and evaluation loops move every batch to `device`, so the model must be on
# the same device; otherwise the forward pass fails on a GPU machine.
model = model.to(device)

In [None]:
# FULL_FINETUNING = True trains every weight; False trains only the classification head.
FULL_FINETUNING = True #True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these substrings are excluded from weight decay.
    # 'LayerNorm.weight' is added because 'gamma'/'beta' only match older parameter naming
    # — NOTE(review): check against model.named_parameters().
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # torch.optim.Adam reads the per-group option `weight_decay`; the former key
    # `weight_decay_rate` is not an Adam option, so no decay was applied.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)



In [None]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    """Fraction of positions whose highest-scoring class equals the label.

    Args:
        preds: 3-D array of scores; the class is taken as the argmax over axis 2.
        labels: array of label ids with one entry per (row, position) of `preds`.

    Returns:
        Number of matching positions divided by the total number of labels.
    """
    predicted_ids = np.argmax(preds, axis=2).reshape(-1)
    gold_ids = labels.reshape(-1)
    n_correct = (predicted_ids == gold_ids).sum()
    return n_correct / len(gold_ids)

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The duration is rounded to a whole number of seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return "{}".format(duration)

In [None]:
epochs = 10
max_grad_norm = 1.0
total_t0 = time.time()

for _ in trange(epochs, desc="Epoch"):
    t0 = time.time()

    # ---------------- training pass ----------------
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):

        # Report progress (elapsed time as h:mm:ss) every 10 batches.
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # move the batch tensors to the training device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Forward pass. The output is indexed rather than tuple-unpacked, which works for
        # plain-tuple outputs as well as dict-like output objects; with labels supplied,
        # element 0 is the loss.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters, then reset the gradients for the next batch
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---------------- validation pass ----------------
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # A single forward pass with labels returns both the loss (element 0) and the
        # logits (element 1); running the model a second time without labels is unnecessary.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        tmp_eval_loss, logits = outputs[0], outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    # One tag list per sentence, the same shape the final evaluation cell passes to f1_score.
    pred_tags = [[tags_val[p_i] for p_i in p] for p in predictions]
    valid_tags = [[tags_val[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
    # seqeval's f1_score takes (y_true, y_pred), in that order
    print("F1-Score: {}".format(f1_score(valid_tags, pred_tags)))

In [None]:
# Final evaluation of the fine-tuned model on the validation set.
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    # A single forward pass with labels returns both the loss (element 0) and the
    # logits (element 1); running the model a second time without labels is unnecessary.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
    tmp_eval_loss, logits = outputs[0], outputs[1]

    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
#     tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
#     eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

# One tag list per sentence, for predictions and for reference labels.
pred_tags = [[tags_val[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags_val[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
# print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
# seqeval's f1_score takes (y_true, y_pred), in that order
print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags)))

In [None]:
import pickle
# Serialize the fine-tuned model. The context manager guarantees that the file is flushed
# and closed even if pickling fails.
# NOTE(review): a pickle can only be loaded safely from a trusted source.
with open('CamemBERT_POS', 'wb') as model_file:
    pickle.dump(model, model_file)

# Embeddings de doc

## Doc2Vec

Introduces Gensim's Doc2Vec model and demonstrates its use on the Lee Corpus.

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import os
import gensim
# Set file names for train and test data
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [None]:
import smart_open

def read_corpus(fname, tokens_only=False):
    """Yield one preprocessed document per line of `fname`.

    Args:
        fname: path of the corpus file, read as ISO-8859-1 text.
        tokens_only: when True, yield the bare token list of each line; when
            False, yield a TaggedDocument tagged with the line's index.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as corpus_file:
        for line_no, raw_line in enumerate(corpus_file):
            words = gensim.utils.simple_preprocess(raw_line)
            # training documents carry their line number as their only tag
            yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [line_no])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

Let's take a look at the training corpus




In [None]:
print(train_corpus[:2])

And the testing corpus looks like this:




In [None]:
print(test_corpus[:2])

Notice that the testing corpus is just a list of lists and does not contain
any tags.




Training the Model
------------------






In [None]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

Build a vocabulary



In [None]:
model.build_vocab(train_corpus)

Essentially, the vocabulary is a dictionary (accessible via
``model.wv.vocab``\ ) of all of the unique words extracted from the training
corpus along with the count (e.g., ``model.wv.vocab['penalty'].count`` for
counts for the word ``penalty``\ ).




In [None]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Now, we can use the trained model to infer a vector for any piece of text
by passing a list of words to the ``model.infer_vector`` function. This
vector can then be compared with other vectors via cosine similarity.




In [None]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

Assessing the Model
-------------------






In [None]:
# For every training document: infer a fresh vector from its words, rank all stored
# document vectors by similarity to it, and record the position of the document itself.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    ranked_ids = [docid for docid, sim in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # keep the runner-up (id, similarity) pair for the inspection cells below
    second_ranks.append(sims[1])

Let's count how each document ranks with respect to the training corpus

NB. Results vary between runs due to random seeding and very small corpus



In [None]:
import collections

counter = collections.Counter(ranks)
print(counter)

Basically, more than 95% of the inferred documents are found to be most
similar to themselves, and about 5% of the time a document is mistakenly most
similar to another document. Checking the inferred vector against a
training vector is a sort of 'sanity check' as to whether the model is
behaving in a usefully consistent manner, though not a real 'accuracy' value.

This is great and not entirely surprising. We can take a look at an example:




In [None]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Notice above that the most similar document (usually the same text) has a
similarity score approaching 1.0. However, the similarity score for the
second-ranked documents should be significantly lower (assuming the documents
are in fact different) and the reasoning becomes obvious when we examine the
text itself.

We can run the next cell repeatedly to see a sampling of other target-document
comparisons.




In [None]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Testing the Model
-----------------

Using the same approach above, we'll infer the vector for a randomly chosen
test document, and compare the document to our model by eye.




In [None]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

## LDA Model

Introduces Gensim's LDA model and demonstrates its use on the NIPS corpus.

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import io
import os.path
import re
import tarfile
import nltk

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the text of every paper contained in the NIPS archive at `url`.

    The archive is downloaded next to the notebook on first use and reused
    afterwards. Papers are yielded in order of their path inside the archive,
    decoded as UTF-8 with undecodable bytes replaced.
    """
    archive_name = url.rsplit('/', 1)[-1]

    # Download the file to local storage first.
    # We can't read it on the fly because of
    # https://github.com/RaRe-Technologies/smart_open/issues/331
    if not os.path.isfile(archive_name):
        with smart_open.open(url, "rb") as fin, smart_open.open(archive_name, 'wb') as fout:
            # copy in buffer-sized chunks until read() returns an empty bytes object
            for chunk in iter(lambda: fin.read(io.DEFAULT_BUFFER_SIZE), b''):
                fout.write(chunk)

    paper_pattern = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    with tarfile.open(archive_name, mode='r:gz') as tar:
        # Ignore directory entries, as well as files like README, etc.
        papers = [
            entry for entry in tar.getmembers()
            if entry.isfile() and paper_pattern.search(entry.name)
        ]
        papers.sort(key=lambda entry: entry.name)
        for entry in papers:
            raw_bytes = tar.extractfile(entry).read()
            yield raw_bytes.decode('utf-8', errors='replace')

docs = list(extract_documents())

So we have a list of 1740 documents, where each document is a Unicode string. 
If you're thinking about using your own corpus, then you need to make sure
that it's in the same format (list of Unicode strings) before proceeding
with the rest of this tutorial.




In [None]:
print(len(docs))
print(docs[0][:500])

In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Tokens are the pieces matched by the pattern r'\w+'.
tokenizer = RegexpTokenizer(r'\w+')

# One pass per document: lowercase, split into tokens, then keep a token only
# if it is longer than one character and not purely numeric (words that merely
# contain digits are kept).
docs = [
    [
        word
        for word in tokenizer.tokenize(text.lower())
        if not word.isnumeric() and len(word) > 1
    ]
    for text in docs
]

We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a
stemmer in this case because it produces more readable words. Output that is
easy to read is very desirable in topic modelling.




In [None]:
# Lemmatize the documents.
nltk.download('wordnet')  # Fetch the 'wordnet' resource through NLTK's downloader.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Replace every token by its lemma, document by document.
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

We find bigrams in the documents. Bigrams are sets of two adjacent words.
Using bigrams we can get phrases like "machine_learning" in our output
(spaces are replaced with underscores); without bigrams we would only get
"machine" and "learning".

Note that in the code below, we find bigrams and then add them to the
original data, because we would like to keep the words "machine" and
"learning" as well as the bigram "machine_learning".

**Important:** Computing n-grams of a large dataset can be very computationally
and memory intensive.




In [None]:
# Compute bigrams.
from gensim.models import Phrases

# Fit a phrase model on the corpus (min_count=20) and append every detected
# bigram to its document, so the original single words are kept as well.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # A token containing '_' is treated as a bigram. Collect the bigrams first,
    # then extend the document in one step.
    detected = [token for token in bigram[doc] if '_' in token]
    doc.extend(detected)

We remove rare words and common words based on their *document frequency*.
Below we remove words that appear in fewer than 20 documents or in more than
50% of the documents. Consider trying to remove words only based on their
frequency, or maybe combining that with this approach.




In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
# The call's return value is not used: `dictionary` itself is modified.
dictionary.filter_extremes(no_below=20, no_above=0.5)

Finally, we transform the documents to a vectorized form. We simply compute
the frequency of each word, including the bigrams.




In [None]:
# Bag-of-words representation of the documents: one doc2bow result per document.
corpus = list(map(dictionary.doc2bow, docs))

Let's see how many tokens and documents we have to train on.




In [None]:
# Size of the filtered vocabulary and of the vectorised corpus.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed so that a fresh run of the notebook yields the same topics.

# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# alpha and eta are both set to 'auto'; every other setting comes from the
# variables defined above.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)

In [None]:
from pprint import pprint

top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The coherence is the second element of each entry returned by top_topics().
coherence_sum = sum(entry[1] for entry in top_topics)
avg_topic_coherence = coherence_sum / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

## Word Mover's Distance

Demonstrates using Gensim's implementation of the WMD.

In [None]:
# Initialize logging.
# Same root-logger configuration as the one used in the LDA section.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Two sentences that share no content words; they serve as the "similar" pair below.
sentence_obama = 'Obama speaks to the media in Illinois'
sentence_president = 'The president greets the press in Chicago'

These sentences have very similar content, and as such the WMD should be low.
Before we compute the WMD, we want to remove stopwords ("the", "to", etc.),
as these do not contribute a lot to the information in the sentences.




In [None]:
# Import and download stopwords from NLTK.
from nltk import download
from nltk.corpus import stopwords

download('stopwords')  # Download stopwords list.
stop_words = stopwords.words('english')


def preprocess(sentence):
    """Lowercase ``sentence``, split it on whitespace and drop stop words.

    Returns the remaining words as a list, in their original order.
    """
    kept = []
    for word in sentence.lower().split():
        if word in stop_words:
            continue
        kept.append(word)
    return kept


# From here on both names hold lists of words instead of raw strings.
sentence_obama = preprocess(sentence_obama)
sentence_president = preprocess(sentence_president)

Now, as mentioned earlier, we will be using some downloaded pre-trained
embeddings. We load these into a Gensim Word2Vec model class.

**Important:** The embeddings we have chosen here require a lot of memory.




In [None]:
import gensim.downloader as api
# Load the 'word2vec-google-news-300' embeddings through the gensim downloader.
# NOTE: this rebinds `model`, which held the trained LDA model up to this point.
model = api.load('word2vec-google-news-300')

So let's compute WMD using the ``wmdistance`` method.




In [None]:
# Distance between the two preprocessed sentences about the president.
distance = model.wmdistance(sentence_obama, sentence_president)
print('distance = %.4f' % distance)

Let's try the same thing with two completely unrelated sentences. Notice that the distance is larger.




In [None]:
# Same measurement against an unrelated sentence, preprocessed the same way.
sentence_orange = preprocess('Oranges are my favorite fruit')
distance = model.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)

In [None]:
# Recompute both distances after normalising the word vectors.
# NOTE(review): init_sims() appears to be deprecated in gensim 4.x (fill_norms()
# is the suggested replacement) — confirm against the installed gensim version.
# NOTE(review): replace=True presumably overwrites the original vectors in
# place, so the un-normalised distances cannot be recomputed afterwards — verify.
model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.

distance = model.wmdistance(sentence_obama, sentence_president)  # Compute WMD as normal.
print('distance: %r' % distance)

distance = model.wmdistance(sentence_obama, sentence_orange)
print('distance = %.4f' % distance)

## DistilBERT sentiment analysis

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import urllib.request
import os
import zipfile
import warnings
# Ignore all warnings from here on. This also hides any deprecation or
# convergence warnings the libraries used below might emit.
warnings.filterwarnings('ignore')

In [None]:
# Record the installed PyTorch version.
print(torch.__version__)

In [None]:
# Load the SST2 train.tsv file as a headerless, tab-separated table.
# Column 0 is used below as the sentence text and column 1 as the label.
df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)

In [None]:
# `batch_1` is another name for the full frame (no copy, no subsampling),
# so every row is processed below.
batch_1 = df

In [None]:
# Remove the column-width display limit so long sentences are shown in full.
pd.set_option('display.max_colwidth',None)

In [None]:
# Preview the first rows of the data.
batch_1.head()

In [None]:
# Number of rows per value of column 1 (the label column).
batch_1[1].value_counts()

In [None]:
# DistilBERT: checkpoint name, model class and tokenizer class.
pretrained_weights = 'distilbert-base-uncased'
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

## BERT alternative (kept for reference):
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load the tokenizer and the model from the same pretrained checkpoint.
# Both `tokenizer` and `model` are rebound here.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Encode every sentence (column 0) into token ids for DistilBERT,
# with the tokenizer's special tokens added.
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [None]:
# Pad every sentence to the same length.

# Length of the longest encoded sentence (0 when there are no sentences).
max_len = max(map(len, tokenized.values), default=0)

# Right-pad each list of token ids with 0 up to max_len.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [None]:
# (number of sentences, padded length); `padded` is already a NumPy array.
padded.shape

In [None]:
# Hide the padding from the model: 1 where the token id is non-zero, 0 elsewhere.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

In [None]:
# Apply the model to the token ids, using the mask built above.
# NOTE(review): all rows go through the model in a single call; with the full
# dataset this may need a lot of memory — consider batching if it fails.
model.eval()
input_ids = torch.tensor(padded)  
# `attention_mask` is rebound from a NumPy array to a tensor here.
attention_mask = torch.tensor(attention_mask)

# Gradients are not needed for feature extraction.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
# Shape of the first element of the model output.
np.shape(last_hidden_states[0])

In [None]:
# Every token gets its own vector; only the vector at sequence index 0 is kept
# as the sentence feature (the special 'cls' token, i.e. the first position).
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
# Column 1 of the data holds the label of each sentence.
labels = batch_1[1]

### Suite

#### Entraînement d'une régression logistique

In [None]:
# Split features and labels into train and test parts. No split size is given,
# so scikit-learn's default applies; random_state=11 fixes the shuffle.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, random_state=11)

In [None]:
# Fit a logistic regression on the training features.
# NOTE(review): warnings are silenced earlier in the notebook
# (warnings.filterwarnings('ignore')), so a non-convergence warning would not
# be shown here — verify convergence if the score looks off.
lr_clf = LogisticRegression(C=1)
lr_clf.fit(train_features, train_labels)

In [None]:
# Score of the fitted classifier on the held-out test split.
lr_clf.score(test_features, test_labels)

# Comparaison des modèles

In [None]:
# ...