In [1]:
import json
from collections import defaultdict

## Load data

and tack a look on the datas and labels

In [2]:
train_set = json.load(open('fr.ud.train.json'))
dev_set = json.load(open('fr.ud.dev.json'))
test_set = json.load(open('fr.ud.test.json'))
minecraft_set = json.load(open('minecraft.json'))
for words, labels in train_set:
    print(words)
    print(labels)
    break
for words, labels in minecraft_set:
    print(words)
    print(labels)
    break

['Les', 'commotions', 'cérébrales', 'sont', 'devenu', 'si', 'courantes', 'dans', 'ce', 'sport', "qu'", 'on', 'les', 'considére', 'presque', 'comme', 'la', 'routine', '.']
['DET', 'NOUN', 'ADJ', 'VERB', 'VERB', 'ADV', 'ADJ', 'ADP', 'DET', 'NOUN', 'SCONJ', 'PRON', 'PRON', 'VERB', 'ADV', 'ADP', 'DET', 'NOUN', 'PUNCT']
['HG3', ';)', 'si', 'ous', 'voulez', 'x))))']
['PROPN', 'INTJ', 'SCONJ', 'PRON', 'VERB', 'INTJ']


## Get all possible labels

In [3]:
def all_lables(data_set):
    res = set()
    for words, labels in data_set:
        res = res | set(labels)
    return list(res)
labels = all_lables(train_set)
labels

['PRON',
 'NOUN',
 'SYM',
 'NUM',
 'X',
 'ADJ',
 'DET',
 'VERB',
 'PUNCT',
 'ADV',
 'ADP',
 'PROPN',
 'ADP+DET',
 'CCONJ',
 'ADP+PRON',
 'INTJ',
 'SCONJ',
 'PART']

## Feature extraction

Each example (word) of the dataset is going to be represented by a feature vector. Now we are considering the following *feature patterns*
 
- the 3 last characters of the word
- the last character of the word
- the first character of the word
- the word
- a binary feature indicating whether the word starts with a capital letter or not
- a binary feature indicating whether the word is made only of capital letters or not
- the word at position i − 1 and i − 2
- the word at position i + 1 and i + 2

where i is the index of the word in its sentence

In [4]:
def sparse(i, context):
    word = context[i]
    res = []
    res.append(('3_last',word[-3:]))
    res.append(('last_c',word[-1]))
    res.append(('firt_c',word[0]))
    res.append(('word',word))
    res.append(('start_capital',1 if word.istitle() else 0))
    res.append(('only_capital',1 if word.isupper() else 0))
    res.append(('i-1',context[i-1] if i>0 else None))
    res.append(('i-2',context[i-2] if i>1 else None))
    res.append(('i+1',context[i+1] if len(context)>i+1 else None))
    res.append(('i+2',context[i+2] if len(context)>i+2 else None))
    return res

## Define the model

In [5]:
class Perceptron:

    def __init__(self, labels):
        self.labels = labels
        # Each feature gets its own weight vector, with one weight for
        # each possible label
        self.weights = defaultdict(lambda: defaultdict(float))
        # The accumulated values of the weight vector at the t-th
        # iteration: sum_{i=1}^{n - 1} w_i
        #
        # The current value (w_t) is not yet added. The key of this
        # dictionary is a pair (feature, label)
        self._accum = defaultdict(int)
        # The last time the feature was changed, for the averaging.
        self._last_update = defaultdict(int)
        # Number of examples seen
        self.n_updates = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return
        the best class.'''
        labels, labels_score = self.score(features)
        return labels[max(enumerate(labels_score),key=lambda x: x[1])[0]]
    
    def predict_all(self, features):
        predicts = []
        for f in features:
            predicts.append(self.predict(f))
        return predicts
    
    def fit(self, train_set, train_labels):
        for features, true_label in zip(train_set, train_labels):
            self.update(true_label, self.predict(features), features) 
    
    def score(self, features, labels=None):
        """
        Parameters
        ----------

        - features, an iterable
             a sequence of binary features. Each feature must be
             hashable. WARNING: the `value' of the feature is always
             assumed to be 1.
        - labels, a subset of self.labels
             if not None, the score is computed only for these labels
        """ 
        if not labels:
            labels_score = [0 for i in range(len(self.labels))]
            for f in features:
                for label in self.labels:
                    labels_score[self.labels.index(label)] += self.weights[f][label]
            return self.labels, labels_score
        else :
            labels_score = [0 for i in range(len(labels))]
            for f in features:
                for label in labels:
                    labels_score[labels.index(label)] += self.weights[f][label]
            return labels, labels_score
        

    def update(self, truth, guess, features):
        def upd_feat(label, feature, v):
            param = (label, feature)
            self._accum[param] += (self.n_updates -
                                   self._last_update[param]) * self.weights[feature][label]
            self._last_update[param] = self.n_updates
            self.weights[feature][label] += v
            
        self.n_updates += 1

        if truth == guess:
            return

        for f in features:
            upd_feat(truth, f, 1.0)
            upd_feat(guess, f, -1.0)

    def average_weights(self):
        """
        Average weights of the perceptron

        Training can no longer be resumed.
        """
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for label, w in weights.items():
                param = (label, feat)
                # Be careful not to add 1 to take into account the
                # last weight vector (without increasing the number of
                # iterations in the averaging)
                total = self._accum[param] + \
                    (self.n_updates + 1 - self._last_update[param]) * w
                averaged = round(total / self.n_updates, 3)
                if averaged:
                    new_feat_weights[label] = averaged
            self.weights[feat] = new_feat_weights

    def __getstate__(self):
        """
        Serialization of a perceptron

        We are only serializing the weight vector as a dictionnary
        because defaultdict with lambda can not be serialized.
        """
        # should we also serialize the other attributes to allow
        # learning to continue?
        return {"weights": {k: v for k, v in self.weights.items()}}

    def __setstate__(self, data):
        """
        De-serialization of a perceptron
        """

        self.weights = defaultdict(lambda: defaultdict(float), data["weights"])
        # ensure we are no longer able to continue training
        self._accum = None
        self._last_update = None

Build the model with all the labels possible

In [6]:
perceptron = Perceptron(labels)

Train the model

In [7]:
for words, true_labels in train_set:
    for i in range(len(words)):
        features = sparse(i, words)
        label_predict = perceptron.predict(features)
        perceptron.update(true_labels[i], label_predict, features)

Predict on by one

In [8]:
predicts_test = []
all_true_labels = []
for words, true_labels in test_set:
    for i in range(len(words)):
        features = sparse(i, words)
        label_predict = perceptron.predict(features)
        predicts_test.append(label_predict)
        all_true_labels.append(true_labels[i])

Define the function of *accuracy*

In [9]:
import numpy as np
def accuracy(predicts_test, all_true_labels):
    return np.sum(np.array(predicts_test) == np.array(all_true_labels)) / len(predicts_test)

In [10]:
accuracy(predicts_test, all_true_labels)

0.9163416136316977

Build the matrix of dataset and the target from the datas origin and then train the model

In [11]:
def build_dataset(data):
    dataset = []
    labelset = []
    for words, labels in data:
        for i in range(len(words)):
            dataset.append(sparse(i, words))
            labelset.append(labels[i])
    return dataset, labelset

In [12]:
train_dataset, train_labels = build_dataset(train_set)

In [13]:
p = Perceptron(labels)
p.fit(train_dataset, train_labels)

In [14]:
test_dataset, test_labels = build_dataset(test_set)
accuracy(p.predict_all(test_dataset), test_labels)

0.9163416136316977