In [None]:
import csv
import os
import numpy as np
import variables

import spacy
# Load the French pipeline once. `nlp` is a notebook-level global: the cells
# below read `nlp.vocab` (embedding export, mean vectors) and `nlp.tokenizer`.
# NOTE(review): 'fr' is a shortcut model name; newer spaCy releases expect the
# full model package name -- confirm against the installed spaCy version.
nlp = spacy.load('fr')

In [None]:
def get_embeddings(vocab, path='cache/embedding_weights.npy'):
    """Export the word vectors of `vocab` as a dense matrix indexed by rank.

    Row ``r`` of the saved matrix holds the vector of the lexeme whose
    ``rank`` is ``r``; rows whose rank has no vector stay all-zero. NaN/inf
    entries are replaced by finite numbers before saving.

    Parameters
    ----------
    vocab : iterable of lexemes
        Each item must expose ``rank``, ``has_vector`` and ``vector``; the
        container itself must expose ``vectors_length``.
    path : str, optional
        Destination ``.npy`` file. Missing parent directories are created.

    Raises
    ------
    ValueError
        If no lexeme in `vocab` has a vector (nothing to export).
    """
    # Materialise once so `vocab` is only iterated a single time.
    lexemes = [lex for lex in vocab if lex.has_vector]
    if not lexemes:
        raise ValueError('vocabulary contains no lexeme with a vector')

    max_rank = max(lex.rank for lex in lexemes)
    vectors = np.zeros((max_rank+1, vocab.vectors_length), dtype='float32')
    for lex in lexemes:
        vectors[lex.rank,:] = lex.vector
    vectors = np.nan_to_num(vectors)

    # np.save does not create directories; do it here so a fresh checkout works.
    out_dir = os.path.dirname(path)
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    np.save(path, vectors)

    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print('Embeddings: {}'.format(vectors.shape))
# Export the vectors of the loaded French vocabulary; get_embeddings writes
# them to cache/embedding_weights.npy and prints the resulting matrix shape.
get_embeddings(nlp.vocab)

In [None]:
def tokens2ids(tokens):
    """Encode `tokens` as a zero-padded sequence of vocabulary ranks.

    Whitespace, punctuation and stop-word tokens are dropped. Each remaining
    token fills the next slot with its ``rank`` when it has a vector and with
    0 otherwise. At most ``variables.MAX_LEN`` slots are filled; unused slots
    stay 0. Returns a 1-D array of length ``variables.MAX_LEN``.
    """
    limit = variables.MAX_LEN
    encoded = np.zeros((limit,))

    # Lazily drop the tokens that never occupy a slot.
    kept = (tok for tok in tokens
            if not (tok.is_space or tok.is_punct or tok.is_stop))

    # Pairing with range(limit) stops as soon as every slot has been filled.
    for slot, tok in zip(range(limit), kept):
        encoded[slot] = tok.rank if tok.has_vector else 0

    return encoded

In [None]:
def tokens2vec(tokens):
    """Return the mean word vector of the informative tokens in `tokens`.

    Whitespace, punctuation and stop-word tokens are skipped, as are tokens
    without a vector. The result has length ``nlp.vocab.vectors_length``; it
    is all zeros when no token contributes, and NaN/inf entries are replaced
    by finite numbers.
    """
    contributing = [
        tok.vector for tok in tokens
        if not (tok.is_space or tok.is_punct or tok.is_stop) and tok.has_vector
    ]

    mean_vec = np.zeros((nlp.vocab.vectors_length,))
    # Accumulate one vector at a time, in token order.
    for word_vec in contributing:
        mean_vec += word_vec

    if contributing:
        mean_vec /= float(len(contributing))

    return np.nan_to_num(mean_vec)

In [None]:
def _as_text(raw):
    """Return `raw` as a text string, decoding UTF-8 bytes when necessary."""
    # Python 2's csv module yields byte strings; Python 3's yields text already.
    return raw.decode('utf-8') if isinstance(raw, bytes) else raw


def _read_csv_rows(path):
    """Return the data rows (header row dropped) of a ';'-delimited CSV file."""
    # `str is bytes` holds only on Python 2, where csv wants a byte stream and
    # 'rU' enables universal newlines. Python 3 wants text mode with newline=''.
    if str is bytes:
        handle = open(path, 'rU')
    else:
        handle = open(path, 'r', newline='', encoding='utf-8')
    with handle as csvfile:
        rows = list(csv.reader(csvfile, delimiter=';'))
    return rows[1:]


def _encode_column(column, label):
    """Tokenise each string of `column`; return (mean vectors, id sequences)."""
    vecs = np.zeros((len(column), nlp.vocab.vectors_length))
    ids = np.zeros((len(column), variables.MAX_LEN))
    print('### Processing review {} ###'.format(label))
    for idx, raw in enumerate(column):
        if (idx+1) % 10000 == 0:
            print('Index {}'.format(idx+1))

        doc = nlp.tokenizer(_as_text(raw).lower())
        vecs[idx,:] = tokens2vec(doc)
        ids[idx,:] = tokens2ids(doc)
    return vecs, ids


def read_data(datafile, labelfile, savedir, phase, save=False):
    """Encode a review CSV as NumPy feature arrays and optionally save them.

    `datafile` is a ';'-delimited CSV with one header row; column 1 is read
    as the review text, column 2 as the title and column 3 as the rating.
    Text and title are each encoded twice: as a mean word vector
    (`tokens2vec`) and as a padded id sequence (`tokens2ids`).

    Parameters
    ----------
    datafile : str
        Path of the input CSV.
    labelfile : str or None
        Path of the label CSV (header row, label in column 1). Only read
        when `phase` is 'train'.
    savedir : str
        Output directory; created (including parents) if it does not exist.
    phase : str
        'train' additionally loads the labels and drops every row whose
        review text produced an all-zero id sequence (no embedding found).
        Any other value keeps all rows and loads no labels.
    save : bool, optional
        When True, write text_vec.npy, text_ids.npy, titl_vec.npy,
        titl_ids.npy, ratg.npy (and labels.npy for 'train') into `savedir`.

    Raises
    ------
    ValueError
        In the 'train' phase, if the label file does not have exactly one
        label per data row.

    Notes
    -----
    Vectors and ratings are stored as float32, ids and labels as int32, in
    every phase, so train and test caches share the same dtypes.
    """
    # makedirs (not mkdir) so nested paths such as 'cache/train/' work even
    # when the parent directory does not exist yet.
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    rows = _read_csv_rows(datafile)
    texts = [row[1] for row in rows]
    titles = [row[2] for row in rows]
    ratings = [row[3] for row in rows]
    size = len(rows)

    # Processing the review, then the title
    X_text_vec, X_text_ids = _encode_column(texts, 'content')
    X_titl_vec, X_titl_ids = _encode_column(titles, 'title')

    # Processing the ratings
    X_ratg = np.asarray(ratings)[:,np.newaxis]

    is_train = phase == 'train'
    if is_train:
        # Processing the labels
        labels = [row[1] for row in _read_csv_rows(labelfile)]
        if len(labels) != size:
            raise ValueError(
                'label file has {} rows but data file has {}'.format(
                    len(labels), size))
        y = np.asarray(labels)[:,np.newaxis]

        # Removing data without embedding: a review whose id sequence is all
        # zeros had no token with a vector.
        idx_to_ignore = np.flatnonzero(X_text_ids.max(axis=1) == 0)
        X_text_vec = np.delete(X_text_vec, idx_to_ignore, axis=0)
        X_text_ids = np.delete(X_text_ids, idx_to_ignore, axis=0)
        X_titl_vec = np.delete(X_titl_vec, idx_to_ignore, axis=0)
        X_titl_ids = np.delete(X_titl_ids, idx_to_ignore, axis=0)
        X_ratg = np.delete(X_ratg, idx_to_ignore, axis=0)
        y = np.delete(y, idx_to_ignore, axis=0).astype('int32')

    # Cast in every phase; previously only the train arrays were converted,
    # leaving the test cache with float64 ids and string ratings.
    X_text_vec = X_text_vec.astype('float32')
    X_text_ids = X_text_ids.astype('int32')
    X_titl_vec = X_titl_vec.astype('float32')
    X_titl_ids = X_titl_ids.astype('int32')
    X_ratg = X_ratg.astype('float32')

    if save:
        # os.path.join works whether or not `savedir` ends with a separator.
        np.save(os.path.join(savedir, 'text_vec.npy'), X_text_vec)
        np.save(os.path.join(savedir, 'text_ids.npy'), X_text_ids)
        np.save(os.path.join(savedir, 'titl_vec.npy'), X_titl_vec)
        np.save(os.path.join(savedir, 'titl_ids.npy'), X_titl_ids)
        np.save(os.path.join(savedir, 'ratg.npy'), X_ratg)
        if is_train:
            np.save(os.path.join(savedir, 'labels.npy'), y)

    print('Texts: {} {}'.format(X_text_vec.shape, X_text_ids.shape))
    print('Titles: {} {}'.format(X_titl_vec.shape, X_titl_ids.shape))
    print('Ratings: {}'.format(X_ratg.shape))
    if is_train:
        print('Labels: {}'.format(y.shape))

In [None]:
# Training set: inputs and labels come from two separate CSV files. With
# phase 'train', read_data also loads the labels and drops reviews that have
# no embedding; save=True writes the encoded arrays under `savedir`.
datafile = 'data/input_train.csv'
labelfile = 'data/output_train.csv'
savedir = 'cache/train/'
read_data(datafile, labelfile, savedir, 'train', save=True)

In [None]:
# Test set: there is no label file, so None is passed; with a phase other
# than 'train', read_data never opens it and keeps every row.
# Note: this rebinds the notebook-level `datafile` and `savedir` set above.
datafile = 'data/input_test.csv'
savedir = 'cache/test/'
read_data(datafile, None, savedir, 'test', save=True)