<a href="https://colab.research.google.com/github/TIMEdilation584/JP_Loksatta_moving_hearts/blob/master/ELMO%20for%20text%20classification%20April%2019%202022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:

import time
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from data_pro import load_data_and_labels, Data
from model import Model
from config import opt


def now():
    return str(time.strftime('%Y-%m-%d %H:%M:%S'))


def collate_fn(batch):
    data, label = zip(*batch)
    return data, label


def train(**kwargs):

    opt.parse(kwargs)
    device = torch.device("cuda:{}".format(opt.gpu_id) if torch.cuda.is_available() else "cpu")
    opt.device = device

    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    if opt.use_gpu:
        torch.cuda.manual_seed_all(opt.seed)

    x_text, y = load_data_and_labels("./data/rt-polarity.pos", "./data/rt-polarity.neg")
    x_train, x_test, y_train, y_test = train_test_split(x_text, y, test_size=opt.test_size)

    train_data = Data(x_train, y_train)
    test_data = Data(x_test, y_test)
    train_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_data, batch_size=opt.batch_size, shuffle=False, collate_fn=collate_fn)

    print(f"{now()} train data: {len(train_data)}, test data: {len(test_data)}")

    model = Model(opt)
    print(f"{now()} {opt.emb_method} init model finished")

    if opt.use_gpu:
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay)
    lr_sheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)
    best_acc = -0.1
    best_epoch = -1
    start_time = time.time()
    for epoch in range(1, opt.epochs):
        total_loss = 0.0
        model.train()
        for step, batch_data in enumerate(train_loader):
            x, labels = batch_data
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                labels = labels.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        acc = test(model, test_loader)
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch
        print(f"{now()} Epoch{epoch}: loss: {total_loss}, test_acc: {acc}")
        lr_sheduler.step()

    end_time = time.time()
    print("*"*20)
    print(f"{now()} finished; epoch {best_epoch} best_acc: {best_acc}, time/epoch: {(end_time-start_time)/opt.epochs}")


def test(model, test_loader):
    correct = 0
    num = 0
    model.eval()
    with torch.no_grad():
        for data in test_loader:
            x, labels = data
            num += len(labels)
            output = model(x)
            labels = torch.LongTensor(labels)
            if opt.use_gpu:
                output = output.cpu()
            predict = torch.max(output.data, 1)[1]
            correct += (predict == labels).sum().item()
    model.train()
    return correct * 1.0 / num


if __name__ == "__main__":
    import fire
    fire.Fire()


ModuleNotFoundError: ignored

In [5]:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class Encoder(nn.Module):

    def __init__(self, enc_method, input_size, hidden_size, out_size):
        '''
        input_size
        hidden_size: the output size of CNN/RNN/TR
        outpu_size: the final size of the encoder (after pooling)
        w
        CNN:
        - filters_num: feature_dim
        - filter_size: 3
        - pooling: max_pooling
        RNN:
        - hidden_size: feature_dim // 2
        - pooling: last hidden status
        Transformer
        - nhead: 2
        - nlayer: 1
        - pooling: average
        -------
        '''
        super(Encoder, self).__init__()
        self.enc_method = enc_method.lower()
        if self.enc_method == 'cnn':
            self.conv = nn.Conv2d(1, hidden_size, (3, input_size))
            nn.init.xavier_uniform_(self.conv.weight)
            nn.init.constant_(self.conv.bias, 0.0)
            f_dim = hidden_size
        elif self.enc_method == 'rnn':
            self.rnn = nn.GRU(input_size, hidden_size//2, batch_first=True, bidirectional=True)
            f_dim = hidden_size
        elif self.enc_method == 'transformer':
            self.pe = PositionEmbedding(input_size, 512)
            self.layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=1)
            self.tr = nn.TransformerEncoder(self.layer, num_layers=1)
            f_dim = input_size
        else:
            f_dim = input_size

        self.fc = nn.Linear(f_dim, out_size)
        nn.init.uniform_(self.fc.weight, -0.5, 0.5)
        nn.init.uniform_(self.fc.bias, -0.1, 0.1)

    def forward(self, inputs):
        if self.enc_method == 'cnn':
            x = inputs.unsqueeze(1)
            x = F.relu(self.conv(x).squeeze(3))
            out = x.permute(0, 2, 1)
        elif self.enc_method == 'rnn':
            out, _ = self.rnn(inputs)
        elif self.enc_method == 'transformer':
            inputs = self.pe(inputs)
            out = self.tr(inputs.permute(1, 0, 2)).permute(1, 0, 2)
        else:
            out = inputs
        return self.fc(out.mean(1))


class PositionEmbedding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionEmbedding, self).__init__()
        self.pe = nn.Embedding(max_len, d_model)
        nn.init.uniform_(self.pe.weight, -0.1, 0.1)

    def forward(self, x):
        b, l, d = x.size()
        seq_len = torch.arange(l).to(x.device)
        return x + self.pe(seq_len).unsqueeze(0)


# performance poor
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x


In [12]:

from allennlp.modules.elmo import Elmo, batch_to_ids
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
import numpy as np

from encoder import Encoder


class Model(nn.Module):
    def __init__(self, opt):

        super(Model, self).__init__()
        self.opt = opt
        self.use_gpu = self.opt.use_gpu

        if opt.emb_method == 'elmo':
            self.init_elmo()
        elif self.opt.emb_method == 'glove':
            self.init_glove()
        elif self.opt.emb_method == 'bert':
            self.init_bert()

        self.encoder = Encoder(opt.enc_method, self.word_dim, opt.hidden_size, opt.out_size)
        self.cls = nn.Linear(opt.out_size, opt.num_labels)
        nn.init.uniform_(self.cls.weight, -0.1, 0.1)
        nn.init.uniform_(self.cls.bias, -0.1, 0.1)
        self.dropout = nn.Dropout(self.opt.dropout)

    def forward(self, x):
        if self.opt.emb_method == 'elmo':
            word_embs = self.get_elmo(x)
        elif self.opt.emb_method == 'glove':
            word_embs = self.get_glove(x)
        elif self.opt.emb_method == 'bert':
            word_embs = self.get_bert(x)

        x = self.encoder(word_embs)
        x = self.dropout(x)
        x = self.cls(x)    # batch_size * num_label
        return x

    def init_bert(self):
        '''
        initilize the Bert model
        '''
        self.bert_tokenizer = AutoTokenizer.from_pretrained(self.opt.bert_path)
        self.bert = AutoModel.from_pretrained(self.opt.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = False
        self.word_dim = self.opt.bert_dim

    def init_elmo(self):
        '''
        initilize the ELMo model
        '''
        self.elmo = Elmo(self.opt.elmo_options_file, self.opt.elmo_weight_file, 1)
        for param in self.elmo.parameters():
            param.requires_grad = False
        self.word_dim = self.opt.elmo_dim

    def init_glove(self):
        '''
        load the GloVe model
        '''
        self.word2id = np.load(self.opt.word2id_file, allow_pickle=True).tolist()
        self.glove = nn.Embedding(self.opt.vocab_size, self.opt.glove_dim)
        emb = torch.from_numpy(np.load(self.opt.glove_file, allow_pickle=True))
        if self.use_gpu:
            emb = emb.to(self.opt.device)
        self.glove.weight.data.copy_(emb)
        self.word_dim = self.opt.glove_dim

    def get_bert(self, sentence_lists):
        '''
        get the ELMo word embedding vectors for a sentences
        '''
        sentence_lists = [' '.join(x) for x in sentence_lists]
        ids = self.bert_tokenizer(sentence_lists, padding=True, return_tensors="pt")
        inputs = ids['input_ids']
        if self.opt.use_gpu:
            inputs = inputs.to(self.opt.device)

        embeddings = self.bert(inputs)
        return embeddings[0]

    def get_elmo(self, sentence_lists):
        '''
        get the ELMo word embedding vectors for a sentences
        '''
        character_ids = batch_to_ids(sentence_lists)
        if self.opt.use_gpu:
            character_ids = character_ids.to(self.opt.device)
        embeddings = self.elmo(character_ids)
        return embeddings['elmo_representations'][0]

    def get_glove(self, sentence_lists):
        '''
        get the glove word embedding vectors for a sentences
        '''
        max_len = max(map(lambda x: len(x), sentence_lists))
        sentence_lists = list(map(lambda x: list(map(lambda w: self.word2id.get(w, 0), x)), sentence_lists))
        sentence_lists = list(map(lambda x: x + [self.opt.vocab_size-1] * (max_len - len(x)), sentence_lists))
        sentence_lists = torch.LongTensor(sentence_lists)
        if self.use_gpu:
            sentence_lists = sentence_lists.to(self.opt.device)
        embeddings = self.glove(sentence_lists)

        return embeddings


ModuleNotFoundError: ignored

In [13]:
python main.py train --emb_method='elmo' --enc_method='cnn'


SyntaxError: ignored

In [10]:

import re
import os
import sys
import numpy as np
import pickle

from torch.utils.data import Dataset


class Data(Dataset):
    def __init__(self, x, y):
        self.data = list(zip(x, y))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        assert idx < len(self)
        return self.data[idx]


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def extract_vocab(positive_data_file, negative_data_file):
    '''
    extract vocab from txt
    '''
    positive_examples = list(open(positive_data_file, "r", encoding='utf-8').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negative_data_file, "r", encoding='utf-8').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = list(map(lambda x: x.split(), x_text))

    vocab = []
    for line in x_text:
        vocab.extend(line)

    vocab = list(set(vocab))
    print("vocab size: {}.".format(len(vocab)))
    open("./data/glove/vocab.txt", "w").write("\n".join(vocab))


def get_glove(w2v_path, vocab_path):

    vocab = {j.strip(): i for i, j in enumerate(open(vocab_path), 0)}
    id2word = {vocab[i]: i for i in vocab}

    dim = 0
    w2v = {}
    for line in open(w2v_path):
        line = line.strip().split()
        word = line[0]
        vec = list(map(float, line[1:]))
        dim = len(vec)
        w2v[word] = vec

    vecs = []
    vecs.append(np.random.uniform(low=-1.0, high=1.0, size=dim))

    hit = 0
    for i in range(1, len(vocab) - 1):
        if id2word[i] in w2v:
            hit += 1
            vecs.append(w2v[id2word[i]])
        else:
            vecs.append(vecs[0])
    vecs.append(np.zeros(dim))
    assert(len(vecs) == len(vocab))

    print("vocab size: {}, dim: {}; hit in glove:{}".format(len(vocab), dim, hit))
    np.save("./data/glove/glove_{}d.npy".format(dim), np.array(vecs, dtype=np.float32))
    np.save("./data/glove/word2id.npy", vocab)
    np.save("./data/glove/id2word.npy", id2word)


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open(positive_data_file, "r", encoding='utf-8').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negative_data_file, "r", encoding='utf-8').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = list(map(lambda x: x.split(), x_text))
    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    y = np.array(positive_labels + negative_labels)
    return [x_text, y]


if __name__ == "__main__":
    import fire
    fire.Fire()


ModuleNotFoundError: ignored

In [9]:


class Config():

    # ELMo
    elmo_options_file = "./data/elmo/elmo_2x2048_256_2048cnn_1xhighway_options.json"
    elmo_weight_file = "./data/elmo/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
    elmo_dim = 512

    # Bert
    bert_path = './data/bert/'
    bert_dim = 768

    # glove
    vocab_size = 18766
    glove_dim = 300
    glove_file = "./data/glove/glove_300d.npy"
    word2id_file = "./data/glove/word2id.npy"

    emb_method = 'glove'  # bert/elmo/glove/
    enc_method = 'CNN'  # CNN/RNN/Transformer/mean
    hidden_size = 200
    out_size = 64
    num_labels = 2

    use_gpu = True
    seed = 2020
    gpu_id = 0

    dropout = 0.5
    epochs = 20

    test_size = 0.1
    lr = 1e-3
    weight_decay = 1e-4
    batch_size = 64
    device = "cuda:0"


def parse(self, kwargs):
    '''
    user can update the default hyperparamter
    '''
    for k, v in kwargs.items():
        if not hasattr(self, k):
            raise Exception('opt has No key: {}'.format(k))
        setattr(self, k, v)

    print('*************************************************')
    print('user config:')
    for k, v in self.__class__.__dict__.items():
        if not k.startswith('__'):
            print("{} => {}".format(k, getattr(self, k)))

    print('*************************************************')


Config.parse = parse
opt = Config()

In [11]:

from allennlp.commands.elmo import ElmoEmbedder
import numpy as np

#define max token length
max_tokens=60

#input sentences
sentences=["how are you doing","what is your name","can you subscribe to my channel"]

#create a pretrained elmo model (requires internet connection)
elmo = ElmoEmbedder(cuda_device=0)
embeddings=[]

#loop through the input sentences
for index,elmo_embedding in enumerate(elmo.embed_sentences(sentences)):  
    print("elmo:",index)
    # Average the 3 layers returned from Elmo
    avg_elmo_embedding = np.average(elmo_embedding, axis=0)
    padding_length = max_tokens - avg_elmo_embedding.shape[0]
    if(padding_length>0):
        avg_elmo_embedding =np.append(avg_elmo_embedding, np.zeros((padding_length, avg_elmo_embedding.shape[1])), axis=0)
    else:
        avg_elmo_embedding=avg_elmo_embedding[:max_tokens]
    embeddings.append(avg_elmo_embedding) 

ModuleNotFoundError: ignored