In [1]:
from sklearn.linear_model import LogisticRegression

def load_data(filename, encoding=None):
    """Load a tab-separated dataset of texts and integer labels.

    Every non-blank line must contain at least two tab-separated fields:
    the text first, then an integer label. Any further fields are ignored.
    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of crashing the loader.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding forwarded to ``open``. ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        A pair ``(texts, labels)`` of equal-length lists, where ``texts``
        holds the first field of each line as ``str`` and ``labels`` holds
        the second field converted with ``int``.

    Raises:
        IndexError: If a non-blank line has no second tab-separated field.
        ValueError: If the second field cannot be parsed as an integer.
    """
    texts, labels = [], []
    with open(filename, encoding=encoding) as opened_file:
        for line in opened_file:
            # A blank line carries no record; skipping it avoids an
            # IndexError on the missing label field.
            if not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            texts.append(fields[0])
            labels.append(int(fields[1]))
    return texts, labels

class LogisticRegressionModel(object):
    """Logistic-regression classifier on top of a pluggable embedder.

    The embedder must provide ``train_embeddings(X)`` and ``embed(x)``.
    ``embed`` has to return a sized vector whose length is the same for
    every input; this is enforced with an assertion on every call to
    ``fit`` and ``predict``.
    """

    def __init__(self, embedder):
        """Store the embedder and create an untrained classifier.

        Args:
            embedder: Object implementing ``train_embeddings`` and ``embed``
                (the class that is to be written as part of the assignment).
        """
        self.embedder = embedder
        self.model = LogisticRegression()

    def _embed_all(self, X):
        """Embed every item of ``X`` and verify all embeddings share one length.

        Returns:
            A list with one embedding per item of ``X``, in order.

        Raises:
            AssertionError: If the embeddings do not all have exactly one
                common length (this includes an empty ``X``).
        """
        embedded = [self.embedder.embed(x) for x in X]
        # The embedding must have a fixed dimension. Failing this assertion
        # means failing the assignment.
        dimensions = set(len(embedding) for embedding in embedded)
        assert len(dimensions) == 1, (
            "embeddings must all have the same length, got lengths: %s"
            % sorted(dimensions)
        )
        return embedded

    def fit(self, X, Y):
        """Train the embedder on ``X``, then fit the classifier.

        Args:
            X: Sequence of raw inputs accepted by the embedder.
            Y: Sequence of labels aligned with ``X``.

        Raises:
            AssertionError: If the embeddings of ``X`` differ in length.
        """
        # The embedding is learned here, before any item is embedded.
        self.embedder.train_embeddings(X)
        self.model.fit(self._embed_all(X), Y)

    def predict(self, X):
        """Return the classifier's predictions for the raw inputs ``X``.

        Raises:
            AssertionError: If the embeddings of ``X`` differ in length.
        """
        return self.model.predict(self._embed_all(X))

    def score(self, X_test, Y_test):
        """Return the fraction of ``X_test`` items predicted as in ``Y_test``.

        Raises:
            AssertionError: If ``X_test`` and ``Y_test`` differ in length, or
                if the embeddings of ``X_test`` differ in length.
        """
        assert len(X_test) == len(Y_test), (
            "X_test and Y_test must have the same length"
        )
        predictions = self.predict(X_test)
        matching = sum(y1 == y2 for y1, y2 in zip(predictions, Y_test))
        return matching / len(Y_test)


In [2]:
# Read the tab-separated training file into texts (X) and integer labels (Y).
X, Y = load_data(filename='train_data')

In [3]:
import tensorflow as tf
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer


class Embedder(object):
    """Fixed-length text embedder backed by scikit-learn's ``CountVectorizer``.

    ``train_embeddings`` fits the vectorizer on the training texts; ``embed``
    then maps a single text to one row of the fitted vectorizer's output.
    """

    def __init__(self):
        # Set by train_embeddings(); None means "not trained yet".
        self.vectorizer = None

    def train_embeddings(self, data):
        """Fit the vectorizer on ``data``.

        Args:
            data: Iterable of raw text strings.

        Returns:
            The dense array produced by calling ``toarray()`` on the
            vectorizer's ``fit_transform(data)`` result.
        """
        self.vectorizer = CountVectorizer(
            analyzer="word",
            tokenizer=None,
            preprocessor=None,
            lowercase=True,
            stop_words=None,
        )
        features = self.vectorizer.fit_transform(data).toarray()
        return features

    def embed(self, sentence):
        """Return the embedding of a single ``sentence``.

        Args:
            sentence: Raw text string to embed.

        Returns:
            The first (only) row of the dense array obtained by transforming
            ``[sentence]`` with the fitted vectorizer.

        Raises:
            AttributeError: If called before ``train_embeddings``.
        """
        if self.vectorizer is None:
            # AttributeError is kept on purpose: that is what an untrained
            # embedder raised before, only without an explanation.
            raise AttributeError(
                "Embedder.embed() called before train_embeddings()"
            )
        res = self.vectorizer.transform([sentence]).toarray()
        return res[0]

In [4]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 60/40 split -- and therefore the printed accuracies --
# are reproducible across "Restart Kernel -> Run All".
SPLIT_SEED = 42

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, random_state=SPLIT_SEED
)

model = LogisticRegressionModel(Embedder())
model.fit(X_train, Y_train)

# Accuracy on the training portion first, then on the held-out portion.
print(model.score(X_train, Y_train))
print(model.score(X_test, Y_test))

0.94125
0.710625
