# Project

In [1]:
import json
from collections import defaultdict

# Load data

In [2]:
train_set = json.load(open('corpus/fr/fr.gsd.train.json'))
dev_set = json.load(open('corpus/fr/fr.gsd.dev.json'))
test_set = json.load(open('corpus/fr/fr.gsd.test.json'))

# Take a vue on data_set

In [3]:
for k,v in train_set[:20]:
    print(k)
    print(v)
    break

['Les', 'commotions', 'cérébrales', 'sont', 'devenu', 'si', 'courantes', 'dans', 'ce', 'sport', "qu'", 'on', 'les', 'considére', 'presque', 'comme', 'la', 'routine', '.']
['DET', 'NOUN', 'ADJ', 'AUX', 'VERB', 'ADV', 'ADJ', 'ADP', 'DET', 'NOUN', 'SCONJ', 'PRON', 'PRON', 'VERB', 'ADV', 'ADP', 'DET', 'NOUN', 'PUNCT']


# Describing shortly the different data sets

In [4]:
# the data may be come from the newspaper

# the number of sentences in each data_set
print('nbr of sentences in train_set : %d'% len(train_set))
print('nbr of sentences in dev_set : %d'% len(dev_set))
print('nbr of sentences in test_set : %d'% len(test_set))

nbr of sentences in train_set : 14450
nbr of sentences in dev_set : 1476
nbr of sentences in test_set : 416


# Three measures of the noisiness of a corpus

- The percentage of **Out-of-Vocabulary (OOV) words**, i.e. words appearing in the test set that are not contained on the train set;
- **[The KL divergence of 3-grams characters](https://aclweb.org/anthology/W16-3905)** distributions estimated on the train and test sets
- perplexity on the test set of a (word level) Language Model estimated on the test
set. The language model can be estimated by KenLM (this tools can also be used
to compute the perpexlity).

In [5]:
def extract_words(data_set):
    '''
    data_set : input data in dimension of (N,M)
    
    return : the set of words appearing the data_set
    '''
    words = set()
    for k,v in data_set:
        words = words.union(k)
    return words
    

### The percentage of Out-of-Vocabulary (OOV) words

In [None]:
words_train_set = extract_words(train_set)
words_test_set = extract_words(test_set)
oov = words_test_set.difference(words_train_set)

In [None]:
print('percentage of oov in train_set : %.2f%%'%(len(oov)/len(words_train_set)*100))
print('percentage of oov in test_set : %.2f%%'%(len(oov)/len(words_test_set)*100))

### TODO:// The KL divergence of 3-grams characters distributions

- we try to mesure the noisiness of a corpus, because the noisiness can do large impact on the performance of model. And a good knowledge can help us to build and train a better model.
- here the metric gives a low value means that there are few noisiness in the corpus, else much noisiness

### TODO:// compute the value of the different metric for the different combination of train and test sets

# Model

### Considering the features

- **the word**  *explain : this feature can directly get the labels which are related to the word*
#### *Windows*
- **a window of 5 words around the word** of interest (i.e. the word we want to predict a label for, the two previous words and the two following words) *explain: these features can make the label more accurate*
#### *Word features*
In this section, we consider these sources of information equally important and normalize each of the four component vectors to unit length
- **counts of left neighbors**
- **counts of right neighbors**
- **binary suffix features**
- **binary shape features**
#### *Distributional features*
- 

In [6]:
def feature_window(i, context, l=2):
    '''
    i : the index of the word in the context
    context : the sentence
    l : a window of size is 2*l+1
    
    return : list of features which are tuple (feature_name, value)
    '''
    # the result of features
    res = []
    # the word
    word = context[i]
    # add the word to the list
    res.append(word)
    punct = [',','.','(',')',':',';','/','?','«','"', '»']
    if word in punct:
        return res
    for k in range(1,l+1):
        # the word of index(word) - k
        res.append('win_i-%d'%k + context[i-k] if i-k>=0 else 'none')
        # the word of index(word) + k
        res.append('win_i+%d'%k+context[i+k] if i+k<len(context) else 'none')
    return res

In [7]:
def feature_suffix(i, context, s=3):
    '''
    i : the index of the word in the context
    context : the sentence
    s : the 1,2,...,s suffix lettre of the word
    
    return : list of features which are tuple (feature_name, value)
    '''
    
    # the result of features
    res = []
    # the word
    word = context[i]
    for k in range(-1, -(s+1), -1):
        # the feature of k-th suffix of the word
        res.append('%d-th_suffix_'%k + word[k:])
    return res

In [8]:
def feature_shape(i, context):
    '''
    i : the index of the word in the context
    context : the sentence
    
    return : list of features which are tuple (feature_name, value)
    '''
    def has_digit(s):
        '''
        check if a string has digit or nor
        '''
        return any(c.isdigit() for c in s)
    
    # the result of features
    res = []
    # the word
    word = context[i]
    
    punct = [',','.','(',')',':',';','/','?','«','"', '»']
    if word in punct:
        res.append('punct')
        return res
    
    ## different orthographic
    # banary feature indicating whether the word starts with a capital letter or not, 1:yes, 0:not
    if word.istitle():
        res.append('start_capital')
    # banary feature indicating whether the word is made of all capital letters or not, 1:yes, 0:not
    if word.isupper():
        res.append('only_capital')
    # banary feature indicating whether the word has a digit or not, 1:yes, 0:not
    if has_digit(word):
        res.append('has_digit')
    # banary feature indicating whether the word has a hyphen or not, 1:yes, 0:not
    if '-' in word:
        res.append('has_hyphen')
    # banary feature indicating whether the word has a low hyphen or not, 1:yes, 0:not
    if '_' in word:
        res.append('has_hyphen_low')
    # banary feature indicating whether the letters in the word are all alphanumeric or not, 1:yes, 0:not
    if not word.isalnum():
        res.append('not_alnum')
    
    if '\'' in word:
        res.append('abbr')
    
    ## different morphological
    # aient 
    
    ## 
    # son sa ser ton 
    return res

In [9]:
def feature_distribution(data, freq = 100):
    freq_bigram_left = defaultdict(lambda: defaultdict(int))
    freq_bigram_right = defaultdict(lambda: defaultdict(int))
    for words, labels in data:
        for i in range(len(words)):
            if i > 0:
                freq_bigram_left[words[i]][words[i-1]] += 1
            if i < len(words)-1:
                freq_bigram_right[words[i]][words[i+1]] += 1
    for word, counts in freq_bigram_left.items():
        freq_bigram_left[word] = list(sorted(counts.items(), key=lambda x : x[1], reverse=True))[:freq]
    for word, counts in freq_bigram_right.items():
        freq_bigram_right[word] = list(sorted(counts.items(), key=lambda x : x[1], reverse=True))[:freq]
    return freq_bigram_left, freq_bigram_right

In [19]:
def extract_features(i, context, bigram_left, bigram_right):
    res = feature_window(i, context)
    res += feature_suffix(i, context)
    res += feature_shape(i, context)
#     for i in range(len(bigram_left)):
#         res.append('%d-th_freq_left_'%i + bigram_left[i][0])
#     for i in range(len(bigram_right)):
#         res.append('%d-th_freq_right_'%i + bigram_right[i][0])
    return res

In [20]:
def build_dataset(data):
    dataset = []
    labelset = []
    freq_bigram_left, freq_bigram_right = feature_distribution(data)
    for words, labels in data:
        for i in range(len(words)):
            dataset.append(extract_features(i, words, freq_bigram_left[words[i]], freq_bigram_right[words[i]]))
            labelset.append(labels[i])
    return dataset, labelset

# Define the model

In [21]:
class Perceptron:
    
    def __init__(self, labels):
        
        self.labels = labels
        # Each feature gets its own weight vector, with one weight for
        # each possible label
        self.weights = defaultdict(lambda: defaultdict(float))
        # The accumulated values of the weight vector at the t-th
        # iteration: sum_{i=1}^{n - 1} w_i
        #
        # The current value (w_t) is not yet added. The key of this
        # dictionary is a pair (feature, label)
        self._accum = defaultdict(int)
        # The last time the feature was changed, for the averaging.
        self._last_update = defaultdict(int)
        # Number of examples seen
        self.n_updates = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return
        the best class.'''
        
        # get the scores of all the labels based on the features
        labels, labels_score = self.score(features)
        # get the label whose socre is max
        return labels[max(enumerate(labels_score),key=lambda x: x[1])[0]]
    
    def predict_all(self, features):
        '''
        predict the labels based on the 
        
        Parameters
        ----------
            - features, an iterable
              WARNING: the `value' of the feature is always assumed to be 1.
        
        return : a list of babels predicted
        '''
        predicts = []
        for f in features:
            predicts.append(self.predict(f))
        return predicts
    
    def fit(self, train_set, train_labels):
        from ipywidgets import FloatProgress
        from IPython.display import display
        '''
        Parameters
            - train_set: an iterable of the features of all data
            - train_labels : an iterable of labels 
        '''
        f = FloatProgress(min=0, max=len(train_labels))
        display(f)
        for features, true_label in zip(train_set, train_labels):
            f.value += 1
            #self.average_weights()
            self.update(true_label, self.predict(features), features) 
    
    def score(self, features, labels=None):
        """
        Parameters
        ----------

        - features, an iterable
             a sequence of binary features. Each feature must be
             hashable. WARNING: the `value' of the feature is always
             assumed to be 1.
        - labels, a subset of self.labels
             if not None, the score is computed only for these labels
        """ 
        if not labels:
            # list of scores of the sum of weights associated with each label of all features
            # where the index of this list is the index of list of all labels
            labels_score = [0 for i in range(len(self.labels))]
            for f in features:
                for label in self.labels:
                    # get the weight associated by the feature and label
                    # then add to the list of scores 
                    labels_score[self.labels.index(label)] += self.weights[f][label]
            return self.labels, labels_score
        else :
            labels_score = [0 for i in range(len(labels))]
            for f in features:
                for label in labels:
                    labels_score[labels.index(label)] += self.weights[f][label]
            return labels, labels_score
        

    def update(self, truth, guess, features):
        '''
        if the true label == predicted label, then do nothing 
        else for each feature, update the associated weights of all labels 
        '''
        def upd_feat(label, feature, v):
            param = (label, feature)
            self._accum[param] += (self.n_updates -
                                   self._last_update[param]) * self.weights[feature][label]
            self._last_update[param] = self.n_updates
            self.weights[feature][label] += v
            
        self.n_updates += 1

        if truth == guess:
            return

        for f in features:
            upd_feat(truth, f, 1.0)
            upd_feat(guess, f, -1.0)

    def average_weights(self):
        """
        Average weights of the perceptron

        Training can no longer be resumed.
        """
        for feat, weights in self.weights.items():
            new_feat_weights = defaultdict(float)
            for label, w in weights.items():
                param = (label, feat)
                # Be careful not to add 1 to take into account the
                # last weight vector (without increasing the number of
                # iterations in the averaging)
                total = self._accum[param] + \
                    (self.n_updates + 1 - self._last_update[param]) * w
                averaged = round(total / self.n_updates, 3)
                if averaged:
                    new_feat_weights[label] = averaged
            self.weights[feat] = new_feat_weights
    
    def evaluate(self, test_set, test_labels):
        import numpy as np
        predict_labels = self.predict_all(test_set)
        num_true = np.sum(np.array(predict_labels) == np.array(test_labels))
        num_tatol = len(test_labels)
        accuracy = num_true/num_tatol
        print('true_num: %d    total_num: %d ======> accuracy : %.4f%%'%(num_true, num_tatol, accuracy*100))
        return predict_labels
    

        

    def __getstate__(self):
        """
        Serialization of a perceptron

        We are only serializing the weight vector as a dictionnary
        because defaultdict with lambda can not be serialized.
        """
        # should we also serialize the other attributes to allow
        # learning to continue?
        return {"weights": {k: v for k, v in self.weights.items()}}

    def __setstate__(self, data):
        """
        De-serialization of a perceptron
        """

        self.weights = defaultdict(lambda: defaultdict(float), data["weights"])
        # ensure we are no longer able to continue training
        self._accum = None
        self._last_update = None

In [22]:
def all_lables(data_set):
    res = set()
    for words, labels in data_set:
        res = res | set(labels)
    return list(res)

# Build the model

In [23]:
labels = all_lables(train_set)
train_dataset, train_labels = build_dataset(train_set)
test_dataset, test_labels = build_dataset(test_set)

In [24]:
train_dataset[:10]

[['Les',
  'none',
  'win_i+1commotions',
  'none',
  'win_i+2cérébrales',
  '-1-th_suffix_s',
  '-2-th_suffix_es',
  '-3-th_suffix_Les',
  'start_capital'],
 ['commotions',
  'win_i-1Les',
  'win_i+1cérébrales',
  'none',
  'win_i+2sont',
  '-1-th_suffix_s',
  '-2-th_suffix_ns',
  '-3-th_suffix_ons'],
 ['cérébrales',
  'win_i-1commotions',
  'win_i+1sont',
  'win_i-2Les',
  'win_i+2devenu',
  '-1-th_suffix_s',
  '-2-th_suffix_es',
  '-3-th_suffix_les'],
 ['sont',
  'win_i-1cérébrales',
  'win_i+1devenu',
  'win_i-2commotions',
  'win_i+2si',
  '-1-th_suffix_t',
  '-2-th_suffix_nt',
  '-3-th_suffix_ont'],
 ['devenu',
  'win_i-1sont',
  'win_i+1si',
  'win_i-2cérébrales',
  'win_i+2courantes',
  '-1-th_suffix_u',
  '-2-th_suffix_nu',
  '-3-th_suffix_enu'],
 ['si',
  'win_i-1devenu',
  'win_i+1courantes',
  'win_i-2sont',
  'win_i+2dans',
  '-1-th_suffix_i',
  '-2-th_suffix_si',
  '-3-th_suffix_si'],
 ['courantes',
  'win_i-1si',
  'win_i+1dans',
  'win_i-2devenu',
  'win_i+2ce',
  '-1-t

In [25]:
p = Perceptron(labels)

In [26]:
p.fit(train_dataset, train_labels)

FloatProgress(value=0.0, max=345009.0)

In [27]:
predicts = p.evaluate(test_dataset, test_labels)



In [29]:
freq_bigram_left, freq_bigram_right = feature_distribution(train_set, freq=500)

In [31]:
freq_bigram_left
# for word_left in freq_bigram_left['si']:
#     print(word_left)

defaultdict(<function __main__.feature_distribution.<locals>.<lambda>()>,
            {'commotions': [('Les', 1)],
             'cérébrales': [('commotions', 1)],
             'sont': [('ne', 50),
              ('se', 48),
              ('ils', 37),
              (',', 32),
              ('qui', 27),
              ('Ils', 22),
              ('et', 16),
              ('elles', 14),
              (')', 13),
              ('ce', 13),
              ('prix', 13),
              ('y', 13),
              ('Elles', 12),
              ('plats', 11),
              ('habitants', 8),
              ('Ce', 7),
              ('chambres', 5),
              ('derniers', 5),
              ('mais', 5),
              ('»', 5),
              ('produits', 4),
              ('__DIGIT__', 4),
              ('certains', 3),
              ('cours', 3),
              ('noms', 3),
              ('en', 3),
              ('groupe', 3),
              ('eux', 3),
              ('principaux', 3),
              ('quelle

In [None]:
def save_model(data, filename):
    import pickle 
    with open(filename, 'w')as fp:
        json.dump(data, fp)

weights = p.__getstate__()
save_model(weights, 'mymodel.model')

In [None]:
import numpy as np
wrong = np.array(test_dataset)[np.array(predicts_labels) != np.array(test_labels)]
wrong_label = np.array(test_labels) [np.array(predicts_labels) != np.array(test_labels)]

In [None]:
## different orthographic
    # banary feature indicating whether the word starts with a capital letter or not, 1:yes, 0:not
    res.append(('start_capital',1 if word.istitle() else 0))
    # banary feature indicating whether the word is made of all capital letters or not, 1:yes, 0:not
    res.append(('only_capital',1 if word.isupper() else 0))
    # banary feature indicating whether the word has a digit or not, 1:yes, 0:not
    res.append(('has_digit', 1 if has_digit(word) else 0))
    # banary feature indicating whether the word has a hyphen or not, 1:yes, 0:not
    res.append(('has_hyphen', 1 if '-' in word else 0))
    # banary feature indicating whether the word has a low hyphen or not, 1:yes, 0:not
    res.append(('has_hyphen_low', 1 if '_' in word else 0))
    # banary feature indicating whether the letters in the word are all alphanumeric or not, 1:yes, 0:not
    res.append(('isalnum', 1 if word.isalnum() else 0))