# Save and load the sentence lengths (`sen_len`)

In [None]:
import pickle
# Cache the per-sentence lengths on disk.
with open('len_file', 'wb') as out_file:
    pickle.dump(sen_len, out_file)

# Read the cached lengths back (only unpickle files you wrote yourself).
with open('len_file', 'rb') as in_file:
    sen_len = pickle.load(in_file)

`max(sen_len)`: 62

# Read labels

In [None]:
import pandas as pd
import numpy as np
# Load the cleaned training table, using the first CSV column as the index.
df_train = pd.read_csv(
    '/u/shawnlyu/projects/linguistics/workdir/cleaned_data/se100_newlineremoved_text',
    index_col=0,
)
# One label per row, taken from the `target` column.
labels = [*df_train.target.values]

`len(labels)`: 1,044,839

# Read embeddings

### Load embeddings

In [None]:
import h5py
# Location of the HDF5 file that stores the training-set embeddings.
DATA_ROOT = '/u/shawnlyu/projects/linguistics/workdir/embeddings/elmo_layers.train.hdf5'
# List the name of every top-level entry; the file is closed when the block exits.
with h5py.File(DATA_ROOT, 'r') as embeddings_file:
    dataset_names = [*embeddings_file.keys()]

`len(dataset_names)`: 1,044,840 (the 1,044,839 sentence entries plus the `sentence_to_index` entry)

### How to access embeddings

`h5py_file.get("0")` … `h5py_file.get("1044838")`: the embeddings of each sentence, keyed by the sentence index as a string (0 – 1,044,838)  
`h5py_file.get("sentence_to_index")`: the mapping between sentences and indices (str)

In [None]:
# Left open at module level on purpose: later cells read embeddings through `h5py_file`.
h5py_file = h5py.File(DATA_ROOT, 'r')
sen_len = []
for sentence_idx in range(1044839):
    # Progress report every 10,000 sentences.
    if not sentence_idx % 10000:
        print(sentence_idx)
    # The size of the first axis of each entry is recorded as the sentence length.
    embedding = h5py_file.get(str(sentence_idx))
    sen_len.append(embedding.shape[0])

# Implement CNN-LSTM

## Configs

In [None]:
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Attribute access is routed through the mapping itself, so ``cfg.key``,
    ``cfg['key']`` and ``cfg.set('key', value)`` always observe the same
    value, whichever way the entry was last written.

    Keys that collide with ``dict`` method names (``items``, ``get``,
    ``set``, ...) stay reachable through item access only; the methods are
    never shadowed.
    """

    def __init__(self, **kwargs):
        """Create the configuration from keyword arguments."""
        super().__init__(**kwargs)

    def __getattr__(self, name):
        """Return the entry stored under *name*.

        Only invoked when regular attribute lookup fails, so real methods
        and dunder attributes keep their normal behaviour.

        Raises:
            AttributeError: If no entry named *name* exists.
        """
        try:
            return self[name]
        except KeyError:
            # Raise AttributeError so hasattr()/getattr(default) and copy work.
            raise AttributeError(name) from None

    def __setattr__(self, name, value):
        """Store *value* under *name* in the mapping."""
        self[name] = value

    def __delattr__(self, name):
        """Remove the entry *name*.

        Raises:
            AttributeError: If no entry named *name* exists.
        """
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def set(self, key, val):
        """Set entry *key* to *val* (visible as both item and attribute)."""
        self[key] = val

config = Config(
    # --- dataset ---
    training_size=1044839,
    word_embeddings_dim=512,
    sen_len=65,  # this should be larger than max-sen-len declared at the top

    # --- pytorch settings ---
    seed=1,

    # --- neural nets params ---
    dataLoader={
        'batch_size': 128,
        'shuffle': True,
        'num_workers': 20,
    },
    lr=3e-4,
    epochs=10,
    hidden_sz=64,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed PyTorch's random number generator with the configured value.
torch.manual_seed(config.seed)

## Implement dataloader for sequential loading

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing stored sentence embeddings with labels.

    Embeddings are read on demand from the module-level ``h5py_file``
    handle, which must be open before items are requested.
    """

    def __init__(self, list_IDs, labels):
        """Store the sample identifiers and their labels.

        Args:
            list_IDs: Sequence of HDF5 entry names (strings such as ``"0"``).
            labels: Either a dict keyed by the same IDs, or a sequence in
                which the label of sample ``ID`` sits at position ``int(ID)``.
        """
        self.labels = labels
        self.list_IDs = list_IDs

    def __len__(self):
        """Return the total number of samples."""
        return len(self.list_IDs)

    def __getitem__(self, index):
        """Return the ``(embedding, label)`` pair for position *index*.

        Raises:
            KeyError: If the ID has no entry in ``h5py_file``.
        """
        # Select sample
        ID = self.list_IDs[index]

        # `[()]` reads the whole entry into memory as an array; a lazy HDF5
        # dataset object cannot be batched by the default collate function.
        # NOTE(review): entries presumably differ in length along axis 0, so
        # batching them still needs padding or a custom collate_fn - confirm.
        X = h5py_file[ID][()]

        # IDs are strings; a plain sequence of labels must be indexed by the
        # integer the ID encodes (indexing a list with a str raises TypeError).
        if isinstance(self.labels, dict):
            y = self.labels[ID]
        else:
            y = self.labels[int(ID)]

        return X, y

In [None]:
from torch.utils import data
# Generators
# Each sample is addressed by the string form of its index in the training set.
IDs = [str(i) for i in range(config.training_size)]
training_set = Dataset(IDs, labels)
# The DataLoader keyword arguments are stored under the `dataLoader` key of
# the config; there is no `params` entry, so `config.params` raised
# AttributeError.
training_generator = data.DataLoader(training_set, **config.dataLoader)

for epoch in range(config.epochs):
    # Training
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)

In [None]:
import torch
import torch.nn.functional as F     # functions

Set random seed manually to replicate results

In [None]:
class CNN_LSTM(torch.nn.Module):
    """LSTM followed by a linear layer that emits one value per time step.

    Two convolution layers are also created, but ``forward`` does not use
    them; only the LSTM and the linear read-out take part in the
    computation.
    """

    def __init__(self, input_size, hidden_dim):
        """Build the layers.

        Args:
            input_size: Number of features per time step fed to the LSTM.
            hidden_dim: Size of the LSTM hidden state.

        Reads ``config.sen_len`` and ``config.word_embeddings_dim`` from the
        module-level ``config`` for the convolution kernel size.
        """
        super().__init__()
        self.input_dim = input_size
        self.hidden_dim = hidden_dim
        # Layers are referenced through `torch.nn`, the only name the file
        # imports for them (a bare `nn` is never imported).
        kernel_size = (config.sen_len, config.word_embeddings_dim)
        self.conv1 = torch.nn.Conv2d(1, 10, kernel_size=kernel_size)
        self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=kernel_size)
        self.lstm = torch.nn.LSTM(input_size, hidden_dim)
        self.hidden2out = torch.nn.Linear(hidden_dim, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a zero ``(h_0, c_0)`` pair shaped ``(1, 1, hidden_dim)``."""
        # Plain tensors replace the `Variable` wrapper, which the file never
        # imports.
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, seq):
        """Run one sequence through the LSTM and the linear read-out.

        Args:
            seq: Tensor whose first dimension is the number of time steps; it
                is reshaped to ``(len(seq), 1, -1)``, i.e. a batch of one.

        Returns:
            Tensor of shape ``(len(seq), 1)``.
        """
        steps = len(seq)
        # The hidden state is stored on the module and carried over to the
        # next call; reassign `self.hidden = self.init_hidden()` to reset it.
        lstm_out, self.hidden = self.lstm(seq.view(steps, 1, -1), self.hidden)
        # The original computed this value but never returned it.
        return self.hidden2out(lstm_out.view(steps, -1))


# NOTE(review): `Net` is not defined anywhere in the visible source, so this
# line raises NameError unless it is defined elsewhere. It looks like a
# leftover from a different example - confirm whether `CNN_LSTM` was intended.
net = Net(n_feature=1, n_hidden=10, n_output=1)

# Show the module structure.
print(net)
# The bare string below is not executed as code; it records a printed
# structure for a model with `hidden` and `predict` linear layers.
"""
Net (
  (hidden): Linear (1 -> 10)
  (predict): Linear (10 -> 1)
)
"""

 
