# Imports

In [1]:
import csv
import re
import sys

from functools import reduce
from os import listdir, sep
from os.path import isfile
from random import shuffle

import torch
import numpy as np

from gensim.models import Word2Vec, KeyedVectors
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim

# NN Definition

In [11]:
class Entwork(nn.Module):
    def __init__(self):
        super(Entwork, self).__init__()
        self.cv1 = nn.Conv2d(1, 6, (3, 20))
        self.cv2 = nn.Conv2d(6, 16, (3, 20))
        self.fc1 = nn.Linear(4992, 500)
        self.fc2 = nn.Linear(500, 50)
        self.fc3 = nn.Linear(50, 5)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.cv1(x)), (3, 3))
        x = F.max_pool2d(F.relu(self.cv2(x)), (3, 3))
        x = x.view(-1, reduce(lambda z, y: z * y, x.size()[1:], 1))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.softmax(self.fc3(x), dim=1)

# Data import

In [5]:
digit = re.compile(r"\d")
splitter = re.compile(r"\s+")
punctuation = re.compile(r"((?<!\d)[.,](?!\d)|[\n?!:;\()/\\\-_=*])")

In [6]:
def read_sentences(p):
    with open(p) as infile:
        reader = csv.reader(infile)
        for line in reader:
            yield line[0], splitter.split(punctuation.sub(" ", digit.sub("#", line[1].lower())))

In [7]:
ls = []
csv_path = "/home/local/saska/Documents/rev"
for p in listdir(csv_path):
    ap = list(read_sentences("{}{}{}".format(csv_path, sep, p)))
    print("read {} revies from {}".format(len(ap), p))
    ls.extend(ap)

read 3543 revies from Kaapelit.csv
read 1927 revies from Koti-ja-valaistus.csv
read 1976 revies from Kodinkoneet.csv
read 901 revies from Kellot.csv
read 7795 revies from Puhelimet.csv
read 4709 revies from Kamerat.csv
read 1091 revies from Lelut.csv
read 5019 revies from Audio-ja-hifi.csv
read 3251 revies from Laukut-ja-matkailu.csv
read 1308 revies from Grillaus-ja-kokkaus.csv
read 3428 revies from Tarvike-ja-toimisto.csv
read 3706 revies from Komponentit.csv
read 1768 revies from Musiikki.csv
read 2177 revies from Verkko.csv
read 451 revies from Lemmikit.csv
read 2529 revies from TV-ja-video.csv
read 3281 revies from Pelit-ja-viihde.csv
read 873 revies from Ohjelmistot.csv
read 5931 revies from Tietokoneet.csv
read 4186 revies from Pienkoneet.csv
read 5517 revies from Oheislaitteet.csv
read 1150 revies from Vauvat-ja-perhe.csv
read 2306 revies from Urheilu.csv
read 1482 revies from Ruoka-ja-juoma.csv


# Store to file for fasttext

In [8]:
with open("/home/local/saska/Documents/fastt/lines", 'w') as out_file:
    out_file.write("\n".join([" ".join(line[1]) for line in ls]))

s = set()
for line in ls:
    for w in line[1]:
        s.add(w)

with open("/home/local/saska/Documents/fastt/queries", 'w') as out_file:
    out_file.write("\n".join(s))

```
fastText-0.1.0/fasttext skipgram -input lines -output model -dim 200
fastText-0.1.0/fasttext print-word-vectors model.bin < queries > vecs.txt
```
# Tensor length

In [9]:
d = {}
for review in ls:
    if len(review[1]) in d:
        d[len(review[1])] += 1
    else:
        d[len(review[1])] = 1
        
l = 0
count = 0
while count < 0.99 * len(ls):
    if l in d:
        count += d[l]
    l += 1
    
print("{} % of reviews are of lenght {} or shorter".format(100 * count/len(ls), l))

99.00718298840765 % of reviews are of lenght 232 or shorter


# Create sentence tensors

In [10]:
d = {}
fuu = False
with open("/home/local/saska/Documents/fastt/vecs.txt") as in_file:
    rd = csv.reader(in_file, delimiter=" ", quotechar="¤")
    for line in rd:      
        if len(line) > 2:
            d[line[0]] = np.asarray(line[1:-1], dtype=np.float32)

print(len(d))

175362


In [14]:
print(len(d["hyvä"]))

200


In [15]:
def get_sentence_tensor(sentence, embeddings, block_length):
    return torch.stack([torch.stack([get_embedding(i, sentence, embeddings) for i in range(block_length)])])

def get_embedding(idx, sentence, embeddings):
    return (torch.from_numpy(embeddings[sentence[idx]])
            if len(sentence) > idx and sentence[idx] in embeddings
            else torch.zeros(200))

In [23]:
block_len = 230
tensor_refs = []
'''for i, review in enumerate(ls):
    nam = "tensors/{}.pt".format(i)
    tensor_refs.append((int(review[0]) - 1, nam))
    torch.save(get_sentence_tensor(review[1], d, block_len), nam)'''
with open("meta.txt") as in_file:
    csv_reader = csv.reader(in_file)
    for i, l, f in csv_reader:
        tensor_refs.append((int(i), int(l), f))

In [24]:
class SentenceLoader(Dataset):
    def __init__(self, metadata):
        self.metadata = metadata

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        ten = torch.load(self.metadata[idx][2])
        return {'lable': self.metadata[idx][1], 'ten': ten}

In [25]:
shuffle(tensor_refs)
testDataLoader = DataLoader(SentenceLoader(tensor_refs[:300]), shuffle=False, batch_size=100)
trainDataLoader = DataLoader(SentenceLoader(tensor_refs[300:]), shuffle=True, batch_size=100)

# Train
### Clear memory

In [26]:
del(d)
del(ls)

### Do training

In [28]:
ent = Entwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(ent.parameters(), lr=0.001, momentum=0.9)
print("Training...")
for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(trainDataLoader):
        labels, inputs = data["lable"], data["ten"]
        inputs, labels = Variable(inputs), Variable(labels)
        optimizer.zero_grad()
        outs = ent(inputs)
        loss = criterion(outs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.data[0]
    print("epoch {} loss: {}".format(epoch, running_loss))


Training...
epoch 0 loss: 1094.6773493289948
epoch 1 loss: 989.3744008541107
epoch 2 loss: 984.8980747461319
epoch 3 loss: 984.2620143890381
epoch 4 loss: 983.8540791273117
epoch 5 loss: 983.6242444515228
epoch 6 loss: 983.7645877599716
epoch 7 loss: 983.2722520828247
epoch 8 loss: 983.5745230913162
epoch 9 loss: 983.0713748931885
Evaluating model accuracy...


RuntimeError: Variable data has to be a tensor, but got str

In [29]:
print("Evaluating model accuracy...")
c = 0
for i, data in enumerate(testDataLoader):
    labels, inputs = data["lable"], data["ten"]
    outs = ent(Variable(inputs))
    _, preds = torch.max(outs.data, 1)
    for i in range(len(labels)):
        if labels[i] == preds[i]:
            c += 1
        print("Actual: {}, predicted: {}".format(labels[i], preds[i]))
print("Accuracy: {}".format(c / 300))

Evaluating model accuracy...
Actual: 4, predicted: 3
Actual: 4, predicted: 4
Actual: 3, predicted: 4
Actual: 3, predicted: 4
Actual: 3, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 3, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 3, predicted: 4
Actual: 2, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 3, predicted: 4
Actual: 4, predicted: 4
Actual: 0, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 3, predicted: 4
Actual: 4, predicted: 4
Actual: 2, predicted: 4
Actual: 2, predicted: 4
Actual: 3, predicted: 4
Actual: 4, predicted: 4
Actual: 2, predicted: 4
Actual: 4, predicted: 4
Actual: 1, predicted: 4
Actual: 2, predicted: 4
Actual: 4, predicted: 4
Actual: 0, predicted: 4
Actual: 4, predicted: 4
Actual: 4, predicted: 4
Actual: 2, predicted: 4
Actual: 4, predicted: 4
Actual: 2, predicted: 4
Actual: 4, predicted: 4
Actual: 4, 