In [8]:
import torch
import torch.utils.data as data
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import nltk
import os
import bs4
import random

# Seed PyTorch's and Python's random number generators with a fixed value.
torch.manual_seed(12345)
random.seed(12345)

# Add an extra directory to the list NLTK searches for its data files.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
nltk.data.path.append("/home/samuel/Programming/data/nltk_data")

In [25]:
def concat_contractions(tokens):
    """Glue contraction suffixes back onto the token that precedes them.

    Every token that is one of the known contraction suffixes ('ve, 'd, 'm,
    'll, 're, n't) is appended to the previous output token, so
    ["do", "n't"] becomes ["don't"]. Several suffixes in a row are all
    attached to the same word. A suffix with no preceding token is kept as
    its own token instead of being dropped.

    Args:
        tokens: sequence of string tokens.

    Returns:
        A new list of tokens; the input is not modified.
    """
    contractions = {"'ve", "'d", "'m", "'ll", "'re", "n't"}
    merged = []
    for tok in tokens:
        if tok in contractions and merged:
            # Attach the suffix to the word that came before it.
            merged[-1] += tok
        else:
            merged.append(tok)
    return merged

def data_processing(ds_paths, max_len=500, split_ratio=1.0):
    """Turn review files into fixed-length sequences of vocabulary indices.

    Each file is read, stripped of HTML markup, tokenised with NLTK, passed
    through ``concat_contractions`` and mapped to integer ids using the
    module-level ``vocab`` dict, which must be defined before this function
    is called. Tokens missing from ``vocab`` are dropped. Every sequence is
    then truncated or right-padded with 0 to exactly ``max_len`` entries.

    Args:
        ds_paths: iterable of file paths named ``<id>_<rating>.<ext>``.
        max_len: length of every returned sequence.
        split_ratio: fraction of the samples placed in the first returned
            set; must lie in [0, 1].

    Returns:
        ``((data, labels), (valid_data, valid_labels))``. When
        ``split_ratio`` is 1 no split is made: the first pair holds tuples
        with all samples in input order and the second pair is
        ``(None, None)``. Otherwise the samples are shuffled using the global
        ``random`` state and both pairs hold lists. Labels are the integer
        ratings parsed from the file names.

    Raises:
        AssertionError: if ``split_ratio`` is outside [0, 1].
    """
    # Validate up front so a bad ratio fails before the slow file processing.
    assert split_ratio >= 0. and split_ratio <= 1.0

    ds = []
    for tfp in ds_paths:
        # Only the rating half of "<id>_<rating>" is needed.
        _, rating = os.path.basename(tfp).split(".")[0].split("_")
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(tfp, "r", encoding="utf-8") as f:
            # Read the whole file: works for empty and multi-line reviews too.
            raw = f.read()
        txt = bs4.BeautifulSoup(raw, "html5lib").get_text(separator=' ')
        tokens = concat_contractions(nltk.word_tokenize(txt))
        #tokens = [vocab[w] if w in vocab else len(vocab) for w in tokens] # keep out of vocab
        tokens = [vocab[w] for w in tokens if w in vocab]
        # Truncate to max_len, then right-pad with 0 (a no-op when already full).
        tokens = tokens[:max_len]
        tokens = tokens + [0] * (max_len - len(tokens))
        ds.append((tokens, int(rating)))

    # zip(*[]) yields nothing, so the two-way unpack needs an explicit fallback.
    dat, labels = zip(*ds) if ds else ((), ())
    if split_ratio == 1.:
        return (dat, labels), (None, None)

    split_idx = int(len(dat) * split_ratio)
    order = list(range(len(dat)))
    random.shuffle(order)
    tidx, vidx = order[:split_idx], order[split_idx:]
    ts, ts_labels = [dat[tid] for tid in tidx], [labels[tid] for tid in tidx]
    vs, vs_labels = [dat[vid] for vid in vidx], [labels[vid] for vid in vidx]
    return (ts, ts_labels), (vs, vs_labels)


In [5]:
# Base directory of the IMDB data used by the cells below.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
IMDB_BASEDIR = "/home/samuel/Programming/data/aclImdb"
# List the directory contents as a quick sanity check.
%ls $IMDB_BASEDIR
# Collect every entry under <split>/pos and <split>/neg; sorting fixes the order.
train_paths = sorted([f.path for d in ["pos", "neg"] for f in os.scandir(os.path.join(IMDB_BASEDIR, "train", d))])
test_paths = sorted([f.path for d in ["pos", "neg"] for f in os.scandir(os.path.join(IMDB_BASEDIR, "test", d))])

# Show the first few training paths.
train_paths[:5]

imdbEr.txt  imdb.vocab  README  [0m[01;34mtest[0m/  [01;34mtrain[0m/


['/home/david/Programming/data/aclImdb/train/neg/0_3.txt',
 '/home/david/Programming/data/aclImdb/train/neg/10000_4.txt',
 '/home/david/Programming/data/aclImdb/train/neg/10001_4.txt',
 '/home/david/Programming/data/aclImdb/train/neg/10002_1.txt',
 '/home/david/Programming/data/aclImdb/train/neg/10003_1.txt']

In [6]:
# Build the word -> index lookup from the first `vocab_limit` lines of imdb.vocab.
# Indices start at 1, so 0 is never assigned to a word by this mapping.
vocab_limit = 5000
# Explicit encoding so decoding does not depend on the platform locale.
with open(os.path.join(IMDB_BASEDIR, "imdb.vocab"), "r", encoding="utf-8") as f:
    # zip() stops after vocab_limit lines, so the rest of the file is never read.
    vocab = {line.strip(): index for index, line in zip(range(1, vocab_limit + 1), f)}


In [26]:

# Process the training files; split_ratio=0.9 puts 90% of the samples in
# trainset and the remainder in validset.
trainset, validset = data_processing(train_paths, split_ratio = 0.9)
# Sanity check: length of the first sequence in each split.
print(len(trainset[0][0]), len(validset[0][0]))
# Token ids are converted to a LongTensor; the ratings start out as a float tensor.
ts, ts_labels = torch.Tensor(trainset[0]).long(), torch.Tensor(trainset[1])
# Binarise the labels: ratings above 5 become 1.0, everything else 0.0.
ts_labels = (ts_labels > 5).float()
dts = data.TensorDataset(ts, ts_labels)
# No shuffle argument is passed, so the DataLoader's default ordering is used.
dlts = data.DataLoader(dts, batch_size=100)

500 500


In [33]:
# Build the validation tensors and loader from validset.
# Token ids are converted to a LongTensor; the ratings start out as a float tensor.
vs, vs_labels = torch.Tensor(validset[0]).long(), torch.Tensor(validset[1])
# Binarise the labels: ratings above 5 become 1.0, everything else 0.0.
vs_labels = (vs_labels > 5).float()
dvs = data.TensorDataset(vs, vs_labels)
dlvs = data.DataLoader(dvs, batch_size=100)
# Number of validation samples.
print(len(vs))

2500


In [12]:
#split_ratio = 0.9
#split_idx = int(len(trainset) * split_ratio)
#tidx = list(range(len(trainset)))
#random.shuffle(tidx)
#tidx, vidx = tidx[:split_idx], tidx[split_idx:]
#ts, ts_labels = [trainset[tid] for tid in tidx], [train_labels[tid] for tid in tidx]
#vs, vs_labels = [trainset[vid] for vid in vidx], [train_labels[vid] for vid in vidx]

#ts, ts_labels = torch.Tensor(ts).long(), torch.Tensor(ts_labels)
#ts_labels = (ts_labels > 5).float()
#dts = data.TensorDataset(ts, ts_labels)
#dlts = data.DataLoader(dts, batch_size=100)

25000 2


In [60]:
class SingleHiddenNN(nn.Module):
    """Embedding -> flatten -> one hidden layer -> sigmoid classifier.

    Takes a batch of fixed-length token-id sequences and returns one sigmoid
    output per sequence as a 1-D tensor.
    """

    def __init__(self, vocab_size, max_len, embed_elems, batch_size):
        """Create the layers.

        vocab_size:  number of vocabulary entries; the embedding table gets
                     one additional row.
        max_len:     number of token ids in each input sequence.
        embed_elems: size of each embedding vector.
        batch_size:  accepted but not used by the model.
        """
        super().__init__()
        self.vocab_size, self.embed_elems, self.max_len = vocab_size, embed_elems, max_len

        # Width of one sample after its embeddings are flattened.
        flat_width = int(max_len * embed_elems)

        self.emb = nn.Embedding(vocab_size + 1, embed_elems)
        self.fc = nn.Linear(flat_width, 100)
        # The attribute is called "relu" but the activation is SELU.
        self.relu = nn.SELU()
        self.dropout = nn.Dropout(0.7)
        self.out = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        """Return one sigmoid output per row of `input` as a 1-D tensor."""
        n_rows = input.size(0)
        # Look up embeddings, then flatten each sample to a single vector.
        flat = self.emb(input).view(n_rows, -1)
        hidden = self.dropout(self.relu(self.fc(flat)))
        scores = self.sigmoid(self.out(hidden))
        return scores.view(-1)

# Build the model, loss and two optimisers, then train for `epochs` epochs.
# The summed batch loss is printed every epoch and validation accuracy every
# fifth epoch.

# Sanity check: range of the token ids and shape of the training tensor.
print(ts.min(), ts.max())
print(ts.size())
# Arguments are vocab_size, max_len, embed_elems, batch_size.
model = SingleHiddenNN(len(vocab), 500, 32, 100)
print(model)
criterion = nn.BCELoss()
# Two optimisers over the same parameters; the loop alternates between them.
optimizer = []
optimizer += [torch.optim.Adam(model.parameters(), lr=0.0001)]
optimizer += [torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)]
epochs = 250
for epoch in range(epochs):
    model.train()
    # Sum (not mean) of the per-batch losses for this epoch.
    running_loss = 0
    for i, (mb, tgts) in enumerate(dlts):
        model.zero_grad()
        mb, tgts = torch.autograd.Variable(mb), torch.autograd.Variable(tgts.float())
        out = model(mb)
        loss = criterion(out, tgts)
        loss.backward()
        # Even epochs step optimizer[0] (Adam), odd epochs optimizer[1] (SGD).
        opt_idx = epoch % 2
        optimizer[opt_idx].step()
        # NOTE(review): loss.data[0] and autograd.Variable are pre-0.4 PyTorch
        # idioms (newer releases use loss.item()) — confirm the target version
        # before modernising.
        running_loss += loss.data[0]
    print("epoch {} had a loss of {:.5}:".format(epoch+1, running_loss))
    # epoch is 0-based, so validation runs after the 6th, 11th, 16th, ... epoch.
    if epoch > 0 and epoch % 5 == 0:
        # Switch to evaluation mode for the validation pass.
        model.eval()
        correct = 0
        for vmb, vtgts in dlvs:
            vmb, vtgts = torch.autograd.Variable(vmb), torch.autograd.Variable(vtgts.float())
            vout = model(vmb)
            # Round the sigmoid output to 0 or 1 to get the predicted class.
            vpred = vout.round()
            correct += (vpred == vtgts).data.sum()
        print("correct: {}, total: {}".format(correct, len(vs)))
        print("validation accuracy: {:.2f}".format(100.*correct/len(vs)))


0 5000
torch.Size([22500, 500])
SingleHiddenNN (
  (emb): Embedding(5001, 32)
  (fc): Linear (16000 -> 100)
  (relu): SELU
  (dropout): Dropout (p = 0.7)
  (out): Linear (100 -> 1)
  (sigmoid): Sigmoid ()
)
epoch 1 had a loss of 180.21:
epoch 2 had a loss of 149.41:
epoch 3 had a loss of 157.26:
epoch 4 had a loss of 136.89:
epoch 5 had a loss of 146.05:
epoch 6 had a loss of 128.16:
correct: 1506, total: 2500
validation accuracy: 60.24
epoch 7 had a loss of 138.26:
epoch 8 had a loss of 122.53:
epoch 9 had a loss of 131.25:
epoch 10 had a loss of 116.95:
epoch 11 had a loss of 125.69:
correct: 1515, total: 2500
validation accuracy: 60.60
epoch 12 had a loss of 113.37:
epoch 13 had a loss of 120.07:
epoch 14 had a loss of 106.46:
epoch 15 had a loss of 116.02:
epoch 16 had a loss of 102.44:
correct: 1567, total: 2500
validation accuracy: 62.68
epoch 17 had a loss of 111.7:
epoch 18 had a loss of 99.776:
epoch 19 had a loss of 109.25:
epoch 20 had a loss of 96.52:
epoch 21 had a loss of

epoch 192 had a loss of 13.577:
epoch 193 had a loss of 13.533:
epoch 194 had a loss of 13.713:
epoch 195 had a loss of 12.667:
epoch 196 had a loss of 11.576:
correct: 1915, total: 2500
validation accuracy: 76.60
epoch 197 had a loss of 12.638:
epoch 198 had a loss of 10.731:
epoch 199 had a loss of 12.506:
epoch 200 had a loss of 10.383:
epoch 201 had a loss of 12.581:
correct: 1844, total: 2500
validation accuracy: 73.76
epoch 202 had a loss of 12.099:
epoch 203 had a loss of 13.259:
epoch 204 had a loss of 10.19:
epoch 205 had a loss of 14.359:
epoch 206 had a loss of 10.284:
correct: 1917, total: 2500
validation accuracy: 76.68
epoch 207 had a loss of 14.6:
epoch 208 had a loss of 10.931:
epoch 209 had a loss of 12.596:
epoch 210 had a loss of 10.348:
epoch 211 had a loss of 11.99:
correct: 1856, total: 2500
validation accuracy: 74.24
epoch 212 had a loss of 11.241:
epoch 213 had a loss of 11.915:
epoch 214 had a loss of 11.312:
epoch 215 had a loss of 10.298:
epoch 216 had a loss

In [56]:
# Re-compute validation accuracy with the current `model`.
# NOTE(review): model.eval() is not called in this cell, so the result depends
# on the mode an earlier cell left the model in — confirm.
correct = 0
for i, (mb, tgts) in enumerate(dlvs):
    mb, tgts = torch.autograd.Variable(mb), torch.autograd.Variable(tgts.float())
    out = model(mb)
    # Round the sigmoid output to 0 or 1 to get the predicted class.
    pred = out.round()
    correct += (pred == tgts).data.sum()
# Correct count, accuracy as a fraction, and validation set size.
print(correct, correct / len(vs), len(vs))
# `pred` and `tgts` still hold only the final batch here, so this prints
# predictions next to targets for that last batch alone.
print(torch.stack((pred.data, tgts.data), 1))

2019 0.8076 2500

    0     0
    1     1
    0     0
    0     1
    1     1
    0     0
    0     0
    1     0
    0     0
    1     1
    0     0
    1     0
    1     1
    0     0
    1     0
    0     0
    0     0
    1     0
    0     0
    1     0
    0     0
    1     1
    0     1
    1     1
    0     0
    0     0
    1     1
    0     1
    1     1
    1     1
    1     0
    1     1
    0     0
    0     0
    0     0
    0     0
    0     1
    0     1
    1     1
    1     0
    0     0
    1     1
    1     0
    1     1
    0     1
    0     0
    1     1
    0     1
    0     0
    0     0
    0     0
    1     1
    0     0
    1     1
    0     0
    0     0
    0     1
    1     1
    1     1
    1     1
    1     0
    0     0
    1     1
    0     0
    0     0
    0     0
    0     0
    1     0
    1     1
    0     0
    0     0
    0     0
    1     1
    1     1
    1     1
    1     1
    0     1
    0     0
    1     1
    0     1
    0     0
    0     

In [53]:
# Save the model's state_dict to a file (relative path, so it lands in the
# current working directory).
torch.save(model.state_dict(), "model_imdb_20170912.pt")

## torchtext

In [None]:
# Load IMDB through torchtext.
import torchtext
import torchtext.data as ttdata
# One Field with default settings for the text, one with sequential=False for the label.
TEXT = ttdata.Field()
LABEL = ttdata.Field(sequential=False)
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
imdb_ds = torchtext.datasets.IMDB("/home/samuel/imdb_sentiment/data", TEXT, LABEL)
# NOTE(review): device=-1 presumably selects the CPU in this torchtext version — confirm.
train_iter, test_iter = imdb_ds.iters(batch_size=4, device=-1)

In [None]:
# Rebuild the iterators with batch_size=25 and inspect only the first training batch.
train_iter, test_iter = imdb_ds.iters(batch_size=25, device=-1)
for x in train_iter:
    print(x.text, x.label.size())
    # Stop after the first batch.
    break
    

In [52]:
# List the current working directory.
%ls

aae_supervised.py            notes.md
[0m[01;34malgore[0m/                      numpy_reshape_test.ipynb
AlGore_2009.sph              pad_test.py
AlGore_2009.stm              [01;34mpcsnpny-20150204-mkj[0m/
audio_rnn_basic.ipynb        [01;31mpcsnpny-20150204-mkj.tgz[0m
[01;35mclipmin.png[0m                  [00;36mpiano2.mp3[0m
CNN2RNN.ipynb                [00;36mpiano.mp3[0m
collate_variable.py          [00;36mpiano_new.wav[0m
[01;34mdata[0m/                        playground.ipynb
[01;31mdata.zip[0m                     predict_audio.ipynb
deepspeech1d.ipynb           Presentation.ipynb
denoising_autoencoder.ipynb  prime_factors.py
extract_mnist.py             pyaudio-test.py
[00;36mfile2.wav[0m                    [01;34m__pycache__[0m/
[00;36mfile.flac[0m                    pytorch_basics.ipynb
[00;36mfile.mp3[0m                     PyTorch Embeddings Test.ipynb
[00;36mfile.wav[0m                     pytorch_tutorial_classify_names.ipynb