# Word Embedding for Sequence Processing

**The goal of this practical is to use pre-trained word embedding for adressing the sequence prediction tasks studied in week 2: PoS and chunking.**

In [1]:
import numpy as np
import gensim.downloader as api
from gensim.models import KeyedVectors

## 0) Loading PoS (or chunking) datasets (small or large)

In [2]:
def load(filename):
    listeDoc = list()
    with open(filename, "r") as f:
        doc = list()
        for ligne in f:
            #print "l : ",len(ligne)," ",ligne
            if len(ligne) < 2: # fin de doc
                listeDoc.append(doc)
                doc = list()
                continue
            mots = ligne.replace("\n","").split(" ")
            doc.append((mots[0],mots[2])) # mettre mots[2] à la place de mots[1] pour le chuncking
    return listeDoc

In [3]:
bSmall = False

if(bSmall==True):
    filename = "../tme2/conll2000/conll2000/chtrain.txt" 
    filenameT = "../tme2/conll2000/conll2000/chtest.txt" 

else:
    # Larger corpus .
    filename = "../tme2/conll2000/conll2000/train.txt" 
    filenameT = "../tme2/conll2000/conll2000/test.txt" 

alldocs = load(filename)
alldocsT = load(filenameT)

print(len(alldocs)," docs read")
print(len(alldocsT)," docs (T) read")

8936  docs read
2012  docs (T) read


# 1) Word embedding for classifying each word

### Pre-trained word2vec

In [4]:
import gensim.downloader as api
bload = True
fname = "word2vec-google-news-300"
sdir = "" # Change

if(bload==True):
    wv_pre_trained = KeyedVectors.load(sdir+fname+".dat")
else:    
    wv_pre_trained = api.load(fname)
    wv_pre_trained.save(sdir+fname+".dat")

### Some token on the dataset are missing, we will encode them with a random vector
This is sub-optimal, but we need to do something

In [5]:
def randomvec():
    default = np.random.randn(300)
    default = default  / np.linalg.norm(default)
    return default

In [4]:
np.random.seed(seed=10) # seed the randomness

dictadd = dict()
cpt=0
for d in alldocs:
    cpt+=1
    print(" ****** Document ******",cpt)
    for (x,pos) in d:
        if (not (x in wv_pre_trained) and not (x in dictadd)):
            print(x," not in WE, adding it with random vector")
            dictadd[x] = randomvec()
            
for d in alldocsT:
    cpt+=1
    print(" ****** TEST Document ******",cpt)
    for (x,pos) in d:
        if (not (x in wv_pre_trained) and not (x in dictadd)):
            print(x," not in WE, adding it with random vector")
            dictadd[x] = randomvec()
            #wv_pre_trained.add_vector(x,randomvec())
            

 ****** Document ****** 1


NameError: name 'wv_pre_trained' is not defined

### Add the (key-value) 'random' word embeddings for missing inputs

In [7]:
## YOUR CODE HERE
wv_pre_trained["random"]=randomvec()
print(wv_pre_trained["random"])


[-7.87200464e-04  1.60092395e-02 -3.36059928e-02  6.48825467e-02
  4.70259460e-03  3.46915461e-02 -7.11487383e-02  1.18286544e-02
 -3.79935987e-02 -8.77709538e-02 -7.19027501e-03  2.53977217e-02
  2.14974508e-02  1.24082128e-02 -1.23100895e-02 -7.33138546e-02
  3.34172463e-03 -3.28111798e-02  5.44306152e-02  1.42146405e-02
  2.09723879e-02 -6.77637476e-03 -5.07333167e-02 -1.18474744e-03
  1.81669518e-02  4.83812466e-02 -3.42427902e-02  9.46366563e-02
  5.89774661e-02 -9.17315017e-03 -6.52074888e-02 -1.49930696e-04
  3.89650627e-03 -8.03749785e-02 -4.32538353e-02 -4.38350737e-02
  1.22720627e-02  4.99978438e-02 -1.46361943e-02  1.35591654e-02
  4.98693921e-02  5.20003475e-02  1.77748632e-02  8.11136961e-02
 -6.68863812e-03  4.47420068e-02 -1.06641511e-02 -2.59523164e-03
  7.36436173e-02 -2.12562177e-02  3.43948156e-02  1.47454515e-02
 -4.00881022e-02  1.06461179e-02 -6.62997365e-02 -4.30645980e-02
  4.07102779e-02 -4.26423103e-02  2.32613310e-02 -5.80925457e-02
 -1.31760293e-03 -3.71659

### Store the train and test datasets: a word embedding for each token in the sequences

In [8]:
wvectors = [wv_pre_trained[word] if word not in dictadd.keys() else wv_pre_trained["random"] for d in alldocs for word, pos in d]
wvectorsT = [wv_pre_trained[word] if word not in dictadd.keys() and word!="Good-bye" else wv_pre_trained["random"] for d in alldocsT for word, pos in d]

# On s'est rendu compte que Good-bye posait problème, d'où l'exception..

### Check the size of your train/test datasets

In [9]:
## YOUR CODE HERE
print(len(wvectors),len(wvectorsT))

211727 47377


### Collecting train/test labels

In [6]:
# Labels train/test

buf2 = [[pos for m,pos in d ] for d in alldocs]
cles = []
[cles.extend(b) for b in buf2]
cles = np.unique(np.array(cles))
cles2ind = dict(zip(cles,range(len(cles))))
nCles = len(cles)
print(nCles," keys in the dictionary")
labels  = np.array([cles2ind[pos] for d in alldocs for (m,pos) in d ])
#np.array([cles2ind[pos] for (m,pos) in d for d in alldocs])
labelsT  = np.array([cles2ind.setdefault(pos,len(cles)) for d in alldocsT for (m,pos) in d ])

print(len(cles2ind)," keys in the dictionary")
print(labels[:10],labelsT[:10])


22  keys in the dictionary
23  keys in the dictionary
[ 5  6  5 15 10 20 20 20 20  5] [ 5 15 15  5 15 15 10  5 10  5]


In [12]:
print(labels.shape)
print(labelsT.shape)
print(labels)

(211727,)
(47377,)
[ 5  6  5 ... 15  1 21]


### Train a Logistic Regression Model! 
**An compare performances to the baseline and sequence models (HMM/CRF) or practical 2a**

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

wvectors_scaled = scaler.fit_transform(wvectors)
wvectorsT_scaled = scaler.transform(wvectorsT)

classifier = LogisticRegression(max_iter=1000)

classifier.fit(wvectors_scaled, labels)

pred = classifier.predict(wvectorsT_scaled)

# Calculate accuracy
accuracy = accuracy_score(pred, labelsT)

print("Accuracy:", accuracy)

Accuracy: 0.6914536589484349


# 2) Using word embedding with CRF

## We will define the following features functions for CRF

In [23]:
def features_wv(sentence, index):
    v = wv_pre_trained.get_vector(sentence[index])
    d = {'f'+str(i):v[i] for i in range(300)}
    return d

def features_structural(sentence, index):
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
     ## We will define the following features functions for CRF## We will define the following features functions for CRF   'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
def features_wv_plus_structural(sentence, index):
    v = wv_pre_trained.get_vector(sentence[index]) 
    d = {'f'+str(i):v[i] for i in range(300)}

    return {**d, **features_structural(sentence, index)}

## [Question]: explain what the 3 feature functions encode and what their differences are

Première fonction : capture l'information sémantique 
<br>Deuxième fonction : capture l'information structurelle et syntaxique
<br>Troisème fonction : combine les deux fonctions définies auparavant.

### You can now train a CRF with the 3 features and analyse the results

In [24]:
from nltk.tag.crf import CRFTagger

tagger = ## YOUR CODE HERE
## Train the model                  
## Evaluate performances

SyntaxError: invalid syntax (1961192166.py, line 3)