In [1]:
from sklearn.linear_model import LogisticRegression

def load_data(filename):
    with open(filename) as opened_file:
        data = [tuple(line.split("\t")) for line in opened_file]
    return [datum[0] for datum in data], [int(datum[1]) for datum in data]

class LogisticRegressionModel(object):
    def __init__(self, embedder):
        # embedder to bedzie klasa ktora przez was bedzie napisana
        self.embedder = embedder
        self.model = LogisticRegression()

    def fit(self, X, Y):
        # tutaj nastepuje uczenie embeddingu
        self.embedder.train_embeddings(X)
        embedded = [self.embedder.embed(x) for x in X]
        # upewnienie sie ze embedding ma staly wymiar.
        # Nie przejscie tej asercji oznacza niezaliczenie zadania
        assert(len(set(len(embedding) for embedding in embedded))==1)
        self.model.fit(embedded, Y)

    def predict(self, X):
        embedded = [self.embedder.embed(x) for x in X]
        # j.w.
        assert(len(set(len(embedding) for embedding in embedded))==1)
        return self.model.predict(embedded)

    def score(self, X_test, Y_test):
        assert(len(X_test)==len(Y_test))
        predictions = self.predict(X_test)
        matching = sum(y1==y2 for y1, y2 in zip(predictions, Y_test))
        return matching/len(Y_test)


In [2]:
X, Y = load_data('train_data')

In [7]:
import tensorflow as tf
import numpy as np
import math
import uuid

#Przemysław Onak
from sklearn.feature_extraction.text import CountVectorizer

class Embedder(object):
    def __init__(self, sentence_length = 25, embed_dimension = 250, l1_fil = 64, l1_ker = 5, epochs = 5):        
            self.sentence_length = sentence_length
            self.embed_dimension = embed_dimension
            self.l1_fil = l1_fil
            self.l1_ker = l1_ker
            self.epochs = 5
    
    # zamienia zdanie na reprezentację za pomocą wektora z indeksami w bag of words
    def process_string(self, sentence):
        analyzer = self.vectorizer.build_analyzer()
        result = [self.word_dict[word] for word in analyzer(sentence) if word in self.word_dict]
        result = result[:self.sentence_length] # unicemy zdania dłuższe niż sentence length
        result.extend([0] * (self.sentence_length - len(result))) #padowanie do self.sentence_length
        return result
    
    def train_embeddings(self, data):
        """
        Trains the embedder on the given data.
        The data is a list of sentences, each sentence is a single string.
        """ 
        self.vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             lowercase = True,    \
                             stop_words = None,   \
                             max_features = None) 
        self.vectorizer.fit(data)
        self.word_dict = self.vectorizer.vocabulary_
        
        NUM_WORDS=len(self.word_dict)
        self.NUM_WORDS = NUM_WORDS
        
        # tablica z wektorowymi reprezentacjami zdań 
        train = [self.process_string(sentence) for sentence in data]
         
        # model tensorflow c&w    
        inputs = tf.placeholder(tf.int32, shape=[None, self.sentence_length])
        false_inputs = tf.placeholder(tf.int32, shape=[None, self.sentence_length])
        embedding_matrix = tf.Variable(tf.random_uniform([NUM_WORDS, self.embed_dimension], -1.0, 1.0), name="embedder")

        def get_output_tensor(scope, input_tensor):
            with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
                one_hot = tf.one_hot(input_tensor, depth=NUM_WORDS)

                # poniewaz tensorflow nie obsluguje broadcastu mnozenia macierzy,
                # musimy pomanipulowac naszym tensorem
                reshaped = tf.reshape(one_hot, (-1, NUM_WORDS))
                embedded = tf.matmul(reshaped, embedding_matrix)
                embedded = tf.reshape(embedded, (-1, self.sentence_length, self.embed_dimension))

                prev = embedded
                
                # dodajemy warstwy konwolucji, po ktorych następuje maxpool
                layer1 = tf.layers.conv1d(prev, self.l1_fil, self.l1_ker, padding='same', activation=tf.nn.relu)
                prev = layer1
                
                #layer2 = tf.layers.conv1d(prev, 64, 3, padding='same', activation=tf.nn.relu)
                #prev = layer2

                pooling = tf.layers.max_pooling1d(prev, 3, 2)

                # zeby polaczyc kanaly wyplaszczamy dane, a nastepnie wrzucamy
                # do zwyklej sieci fully connected
                flattened = tf.contrib.layers.flatten(pooling)
                layer3 = tf.layers.dense(flattened, 50)
                score = tf.layers.dense(layer3, 1)
                return score
        
        scope = str(uuid.uuid4())
        scorer1 = get_output_tensor(scope, inputs)
        scorer2 = get_output_tensor(scope, false_inputs)

        loss = tf.reduce_mean(tf.maximum(tf.zeros_like(scorer1), 1 - (scorer1 - scorer2)))
        optimizer = tf.train.GradientDescentOptimizer(1e-2).minimize(loss)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        
        def chunk_once(doc, width):
            chunks = [doc[i:i+width] for i in range(0, len(doc), width)]
            chunks[-1] += [0] * (width - len(chunks[-1]))
            return chunks

        def chunk_and_padd(documents, width):
            chunks = [w for d in documents for w in chunk_once(d, width)]
            return chunks
        
        # kroi tablice na podtablice długości n
        dataset = chunk_and_padd(train, self.sentence_length)

        def noiser(sentence, p):
            new_sent = [x for x in sentence]
            for i, _ in enumerate(new_sent):
                if np.random.random() < p:
                    new_sent[i] = np.random.choice(NUM_WORDS-1)+1
            return new_sent
        epochs = self.epochs
        batch_size = 256
        num_batches = len(dataset)//batch_size

        for epoch in range(epochs):
            #print('epoch: ' + str(epoch + 1) + "/" + str(epochs) )
            for _ in range(num_batches):
                batch_indexes = np.random.choice(len(dataset),batch_size)
                batch = [dataset[i] for i in batch_indexes]
                false_batch = [noiser(dataset[i],0.2) for i in batch_indexes]
                sess.run(optimizer,feed_dict={inputs:batch, false_inputs:false_batch})

        matrix = sess.run(embedding_matrix)
        self.embedding = matrix
        pass

    def embed(self, sentence):
        vectorized = self.vectorizer.transform([sentence]).toarray()[0]
        result = vectorized.dot(self.embedding)
        return result
        pass


In [None]:

from sklearn.model_selection import GridSearchCV


parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}


class Tester(object):
    
    def __init__(self):
        self.params = {}
        
    
    def fit(self, X, Y):
        self.model = LogisticRegressionModel(Embedder(**self.params))
        self.model.fit(X, Y)
        pass
    
    def score(self, X, Y):
        score = self.model.score(X, Y)
        print(score, self.get_params)
        return score
        pass 
    
    def set_params(self, **params):
        self.params = params
        pass
    
    def get_params(self, deep=True):
        return self.params

model = Tester()
parameters = {
    'sentence_length': [20, 30, 50],
    'embed_dimension': [10, 50, 100],
    'l1_fil': [16, 32, 64, 128],
    'l1_ker': [3, 5, 7, 9]
}
clf = GridSearchCV(model, parameters, verbose=9999)
print('Fit start')
clf.fit(X, Y)
print('Fit done')

print('Najlepsze parametry:')
print(clf.best_params_)

Fit start
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] embed_dimension=10, l1_fil=16, l1_ker=3, sentence_length=20 .....
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Instructions for updating:
Use the retry module or similar alternatives.
0.424821897263 <bound method Tester.get_params of <__main__.Tester object at 0x000001C685672198>>
0.44196512282 <bound method Tester.get_params of <__main__.Tester object at 0x000001C685672198>>
[CV]  embed_dimension=10, l1_fil=16, l1_ker=3, sentence_length=20, score=0.42482189726284214, total= 2.4min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s
[CV] embed_dimension=10, l1_fil=16, l1_ker=3, sentence_length=20 .....
0.494188226472 <bound method Tester.get_params of <__main__.Tester object at 0x000001C69744A320>>
0.485280330021 <bound method Tester.get_params of <__main__.Tester object at 0x000001C69744A320>>
[CV]  embed_dimension=10, l1_fil=16, l1_ker=3, sen

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test =train_test_split(X, Y, test_size=0.3)

model = LogisticRegressionModel(Embedder())
model.fit(X_train, Y_train)
print(model.score(X_train, Y_train))
print(model.score(X_test, Y_test))

0.659821428571
0.616666666667
