# Project

In [4]:
import json

# Load data

In [5]:
# Each corpus file is read inside a `with` block so the file handle is closed
# deterministically, and decoded explicitly as UTF-8 so the accented French
# text does not depend on the platform's default encoding.
with open('corpus/fr/fr.gsd.train.json', encoding='utf-8') as f:
    train_set = json.load(f)
with open('corpus/fr/fr.gsd.dev.json', encoding='utf-8') as f:
    dev_set = json.load(f)
with open('corpus/fr/fr.gsd.test.json', encoding='utf-8') as f:
    test_set = json.load(f)

# Take a look at the data set

In [6]:
# Show the first 20 training examples: the word list, then its tag list,
# each on its own line.
for k, v in train_set[:20]:
    print(k, v, sep='\n')


['Les', 'commotions', 'cérébrales', 'sont', 'devenu', 'si', 'courantes', 'dans', 'ce', 'sport', "qu'", 'on', 'les', 'considére', 'presque', 'comme', 'la', 'routine', '.']
['DET', 'NOUN', 'ADJ', 'AUX', 'VERB', 'ADV', 'ADJ', 'ADP', 'DET', 'NOUN', 'SCONJ', 'PRON', 'PRON', 'VERB', 'ADV', 'ADP', 'DET', 'NOUN', 'PUNCT']
["L'", 'œuvre', 'est', 'située', 'dans', 'la', 'galerie', 'des', 'batailles', ',', 'dans', 'le', 'château', 'de', 'Versailles', '.']
['DET', 'NOUN', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP+DET', 'NOUN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT']
['Le', 'comportement', 'de', 'la', 'Turquie', 'vis-à-vis', 'du', 'problème', 'palestinien', 'a', 'fait', "qu'", 'elle', "n'", 'est', 'plus', 'en', 'odeur', 'de', 'sainteté', 'auprès', 'de', 'la', 'communauté', 'juive', 'en', 'générale', ',', 'et', 'américaine', 'en', 'particulier', '.']
['DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'NOUN', 'ADP+DET', 'NOUN', 'ADJ', 'AUX', 'VERB', 'SCONJ', 'PRON', 'ADV', 'VERB', 'ADV', 'ADP', '

# Short description of the different data sets

In [8]:
# the data probably comes from newspaper text (author's guess, not verified here)

# number of sentences in each data set (each element of a set is one sentence)
print('nbr of sentences in train_set : %d'% len(train_set))
print('nbr of sentences in dev_set : %d'% len(dev_set))
print('nbr of sentences in test_set : %d'% len(test_set))

nbr of train_set : 14450
nbr of dev_set : 1476
nbr of test_set : 416


# Three measures of the noisiness of a corpus

- The percentage of **Out-of-Vocabulary (OOV) words**, i.e. words appearing in the test set that are not contained in the train set;
- **[The KL divergence of 3-grams characters](https://aclweb.org/anthology/W16-3905)** distributions estimated on the train and test sets;
- The perplexity on the test set of a (word-level) Language Model estimated on the train
set. The language model can be estimated with KenLM (this tool can also be used
to compute the perplexity).

In [25]:
def extract_words(data_set):
    '''
    Collect the vocabulary of a data set.

    data_set : iterable of 2-element items (words, labels); `words` is an
               iterable of word strings, `labels` is ignored

    return : the set of distinct words appearing in data_set
             (an empty set when data_set is empty)
    '''
    words = set()
    for sentence, _labels in data_set:
        # Update in place: `words = words.union(sentence)` would copy the
        # whole vocabulary once per sentence (quadratic in corpus size).
        words.update(sentence)
    return words
    

### The percentage of Out-of-Vocabulary (OOV) words

In [29]:
# Vocabulary (set of distinct word forms) of the train and test sets
words_train_set, words_test_set = extract_words(train_set), extract_words(test_set)
# OOV words: in the test vocabulary but absent from the train vocabulary
oov = words_test_set - words_train_set

In [39]:
# Both ratios are computed over vocabularies (sets of distinct words), not over
# running tokens.
# NOTE(review): `oov` holds test words that are absent from the train
# vocabulary, so this first ratio is only the OOV count relative to the size of
# the train vocabulary, not a share of train words that are OOV.
print('percentage of oov in train_set : %.2f%%'%(len(oov)/len(words_train_set)*100))
# Share of the test vocabulary that never appears in the train set.
print('percentage of oov in test_set : %.2f%%'%(len(oov)/len(words_test_set)*100))

percentage of oov in train_set : 1.38%
percentage of oov in test_set : 17.84%


### TODO:// The KL divergence of 3-grams characters distributions

- We try to measure the noisiness of a corpus because noise can have a large impact on the performance of a model, and a good knowledge of the data helps us build and train a better model.
- Here, a low value of the metric means that there is little noise in the corpus; a high value means that there is a lot of noise.

### TODO:// compute the value of the different metrics for the different combinations of train and test sets

# Model

### Considering the features

- **the word**  *explanation: the word form itself is directly informative about which labels it can take*
#### *Windows*
- **a window of 5 words around the word** of interest (i.e. the word we want to predict a label for, the two previous words and the two following words) *explanation: the surrounding context helps to disambiguate the label and makes the prediction more accurate*
#### *Word features*
In this section, we consider these sources of information equally important and normalize each of the four component vectors to unit length
- **counts of left neighbors**
- **counts of right neighbors**
- **binary suffix features**
- **binary shape features**
#### *Distributional features*
- 

In [3]:
def feature_window(i, context, l=2):
    '''
    Build the word-window features for the word at position i.

    i : position of the word of interest inside context
    context : the sentence (sequence of words)
    l : number of neighbours taken on each side, i.e. a window of 2*l+1 words

    return : list of (feature_name, value) tuples. The word itself comes
             first; then, for each distance 1..l, the left neighbour followed
             by the right neighbour. A neighbour that falls outside the
             sentence has the value None.
    '''
    features = [('word', context[i])]
    for offset in range(1, l + 1):
        left, right = i - offset, i + offset
        # neighbours beyond the sentence boundaries are encoded as None
        before = context[left] if left >= 0 else None
        after = context[right] if right < len(context) else None
        features.extend([
            ('win_i-%d' % offset, before),
            ('win_i+%d' % offset, after),
        ])
    return features

In [16]:
def feature_suffix(i, context, s=3):
    '''
    Build the suffix features for the word at position i.

    i : position of the word inside context
    context : the sentence (sequence of words)
    s : longest suffix length; suffixes of 1, 2, ..., s letters are produced

    return : list of (feature_name, value) tuples named '-1-th_suffix',
             '-2-th_suffix', ...; when the word has fewer letters than the
             requested suffix length, the value is the whole word
    '''
    word = context[i]
    # '%d' % -n renders the negative index used in the feature name
    return [('%d-th_suffix' % -n, word[-n:]) for n in range(1, s + 1)]

In [17]:
# Quick check: suffix features of the third word of the first training sentence
feature_suffix(2, train_set[0][0])

[('-1-th_suffix', 's'), ('-2-th_suffix', 'es'), ('-3-th_suffix', 'les')]

In [44]:
def feature_shape(i, context):
    '''
    Build the binary shape (orthographic) features for the word at position i.

    i : position of the word inside context
    context : the sentence (sequence of words)

    return : list of (feature_name, value) tuples, each value being 1 or 0:
             start_capital  - the first character is an uppercase letter
             only_capital   - every cased character is uppercase
             has_digit      - the word contains at least one digit
             has_hyphen     - the word contains '-'
             has_hyphen_low - the word contains '_'
    '''
    word = context[i]

    return [
        # Test only the first character: str.istitle() is False for words such
        # as "USA" or "McDonald" and True for "1Abc", which is not what
        # "starts with a capital letter" means. The slice keeps '' safe.
        ('start_capital', int(word[:1].isupper())),
        ('only_capital', int(word.isupper())),
        ('has_digit', int(any(c.isdigit() for c in word))),
        ('has_hyphen', int('-' in word)),
        ('has_hyphen_low', int('_' in word)),
    ]
    # TODO: morphological features (the original notes mention the ending
    # 'aient') and features for short function words (notes mention 'son',
    # 'sa', 'ser', 'ton') -- presumably planned, not implemented yet.

In [47]:
def extract_features(i, context):
    '''
    Gather every feature of the word at position i.

    i : position of the word inside context
    context : the sentence (sequence of words)

    return : list of (feature_name, value) tuples: window features first,
             then suffix features, then shape features
    '''
    return (
        feature_window(i, context)
        + feature_suffix(i, context)
        + feature_shape(i, context)
    )

In [48]:
def build_dataset(data):
    '''
    Flatten a corpus into per-word feature lists and their labels.

    data : iterable of (words, labels) pairs, one pair per sentence

    return : (dataset, labelset) where dataset[n] is the feature list of the
             n-th word of the corpus and labelset[n] is its label
    '''
    dataset, labelset = [], []
    for sentence, tags in data:
        positions = range(len(sentence))
        dataset.extend(extract_features(pos, sentence) for pos in positions)
        labelset.extend(tags[pos] for pos in positions)
    return dataset, labelset

In [49]:
# Per-word feature lists and the matching labels for the whole training corpus
train_dataset, train_labels = build_dataset(train_set)

In [50]:
# NOTE(review): this displays the entire feature list (one entry per word of
# the training corpus); consider showing a slice such as train_dataset[:3].
train_dataset

[[('word', 'Les'),
  ('win_i-1', None),
  ('win_i+1', 'commotions'),
  ('win_i-2', None),
  ('win_i+2', 'cérébrales'),
  ('-1-th_suffix', 's'),
  ('-2-th_suffix', 'es'),
  ('-3-th_suffix', 'Les'),
  ('start_capital', 1),
  ('only_capital', 0),
  ('has_digit', 0),
  ('has_hyphen', 0),
  ('has_hyphen_low', 0)],
 [('word', 'commotions'),
  ('win_i-1', 'Les'),
  ('win_i+1', 'cérébrales'),
  ('win_i-2', None),
  ('win_i+2', 'sont'),
  ('-1-th_suffix', 's'),
  ('-2-th_suffix', 'ns'),
  ('-3-th_suffix', 'ons'),
  ('start_capital', 0),
  ('only_capital', 0),
  ('has_digit', 0),
  ('has_hyphen', 0),
  ('has_hyphen_low', 0)],
 [('word', 'cérébrales'),
  ('win_i-1', 'commotions'),
  ('win_i+1', 'sont'),
  ('win_i-2', 'Les'),
  ('win_i+2', 'devenu'),
  ('-1-th_suffix', 's'),
  ('-2-th_suffix', 'es'),
  ('-3-th_suffix', 'les'),
  ('start_capital', 0),
  ('only_capital', 0),
  ('has_digit', 0),
  ('has_hyphen', 0),
  ('has_hyphen_low', 0)],
 [('word', 'sont'),
  ('win_i-1', 'cérébrales'),
  ('win_i+