In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True asks Colab to remount
# even if the drive is already mounted. The CSV and model paths used later in
# this notebook live under this mount point.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from sklearn.model_selection import train_test_split
import time

In [3]:
# Pick the compute device once: CUDA when torch reports it is available, CPU otherwise.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [4]:
# Load the pre-processed issue table from the mounted Drive folder.
# Later cells read its "issue" and "label" columns.
ISSUES_CSV_PATH = "/content/drive/MyDrive/github-issue-bot/saif_processed.csv"
df = pd.read_csv(ISSUES_CSV_PATH)

In [5]:
# Build the vocabulary from every issue text in the data frame.
tokenizer = get_tokenizer('basic_english')
counter = Counter()
# Iterate the column directly (no per-row Series from iterrows) and cast each
# cell to str, the same way the encoding cell casts with np.str_, so a missing
# or non-string cell cannot crash the tokenizer and the vocabulary sees exactly
# the text that is later encoded.
for issue_text in df["issue"].astype(str):
  counter.update(tokenizer(issue_text))
# Register the padding / unknown marker strings as ordinary counted tokens.
counter.update(["_PAD", "_UNK"])
# NOTE(review): pad_input fills with index 0 -- confirm which token this Vocab
# maps to index 0 (it is not necessarily "_PAD").
vocab = Vocab(counter, min_freq=1)

In [21]:
def text_pipeline(x):
  """Tokenize the string `x` and return the list of vocabulary ids of its tokens."""
  token_ids = []
  for token in tokenizer(x):
    token_ids.append(vocab[token])
  return token_ids

In [74]:
# Work on an independent copy holding only the text and label columns.
dataset = df[["issue", "label"]].copy(deep=True)
# Coerce every cell to a string first, then encode it as a list of token ids.
issue_as_text = dataset["issue"].map(lambda raw: np.str_(raw))
dataset["issue"] = issue_as_text.map(lambda text: text_pipeline(text))

In [97]:
def pad_input(sentences, seq_len=300):
  """Left-pad with zeros, or truncate, each id sequence to exactly `seq_len`.

  Args:
    sentences: sized iterable of id sequences (one per row).
    seq_len: width of the output matrix.

  Returns:
    Integer ndarray of shape (len(sentences), seq_len). Short sequences are
    right-aligned with leading zeros; long sequences keep only their first
    `seq_len` ids; empty sequences stay all zeros.
  """
  padded = np.zeros((len(sentences), seq_len), dtype=int)
  for row, token_ids in enumerate(sentences):
    clipped = np.array(token_ids)[:seq_len]
    if clipped.size == 0:
      continue
    # Right-align: the clipped ids occupy the last `clipped.size` columns.
    padded[row, seq_len - clipped.size:] = clipped
  return padded

In [143]:
# Fixed seed so the train / validation / test partition (and therefore every
# accuracy reported below) is reproducible after Restart & Run All.
SPLIT_SEED = 42
# Hold out 15% of the rows for testing, then 5% of the remainder for validation.
train_data, test_data = train_test_split(dataset, test_size=0.15, random_state=SPLIT_SEED)
train_data, valid_data = train_test_split(train_data, test_size=0.05, random_state=SPLIT_SEED)
# Renumber each split from 0; reassign instead of mutating in place.
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [230]:
class LSTMClassifier(nn.Module):
  """Sequence classifier: embedding -> dropout -> LSTM -> linear layer.

  Args:
    vocab_size: number of rows in the embedding table.
    embed_dim: size of each embedding vector (LSTM input size).
    hidden_dim: size of the LSTM hidden state.
    num_classes: number of output logits per sample.
    n_layers: number of stacked LSTM layers.
  """

  def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, n_layers=1):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True)
    # Size the output layer from the constructor argument; reading a
    # module-level `num_class` here made the class depend on notebook state.
    self.fc = nn.Linear(hidden_dim, num_classes)
    self.dropout = nn.Dropout(0.5)
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.init_weights()

  def init_weights(self):
    """Re-initialise embedding and output weights uniformly in [-0.5, 0.5]; zero the output bias."""
    initrange = 0.5
    self.embedding.weight.data.uniform_(-initrange, initrange)
    self.fc.weight.data.uniform_(-initrange, initrange)
    self.fc.bias.data.zero_()

  def forward(self, x, hidden=None):
    """Return logits computed from the last layer's final hidden state.

    Args:
      x: integer tensor of token ids; the LSTM is batch-first.
      hidden: optional (h_0, c_0) initial state. It is passed to the LSTM only
        when `x` is batched and the state's batch dimension equals the batch
        size of `x`; otherwise the LSTM's default zero state is used, so a
        state built for a full batch does not break a smaller final batch.

    Returns:
      Tensor of logits with `num_classes` entries per sample.
    """
    x = self.embedding(x)
    x = self.dropout(x)
    if hidden is not None and (x.dim() != 3 or hidden[0].size(1) != x.size(0)):
      hidden = None
    _, (ht, _) = self.lstm(x, hidden)
    # ht[-1] is the final hidden state of the top LSTM layer.
    out = self.fc(ht[-1])
    return out

  def init_hidden(self, batch_size):
    """Return a zero (h_0, c_0) pair shaped (n_layers, batch_size, hidden_dim).

    The tensors share dtype and device with the model's parameters, so no
    module-level `device` variable is needed.
    """
    weight = next(self.parameters())
    shape = (self.n_layers, batch_size, self.hidden_dim)
    return (weight.new_zeros(shape), weight.new_zeros(shape))

In [231]:
def train(dataloader, criterion, h):
  """Run one training pass over `dataloader`, printing windowed accuracy.

  Uses the module-level `model`, `optimizer` and `epoch` names.

  Args:
    dataloader: iterable of (text, label) batches.
    criterion: loss callable applied to (model output, label).
    h: tuple of hidden-state tensors handed to the model on every batch.
  """
  model.train()
  log_interval = 50
  correct, seen = 0, 0
  window_start = time.time()

  for batch_idx, (text, label) in enumerate(dataloader):
    # Rebuild the state tuple from the raw tensors before each forward pass.
    h = tuple(state.data for state in h)

    # Forward pass and loss.
    logits = model(text, h)
    loss = criterion(logits, label)

    # Backward pass with gradient-norm clipping at 0.1, then the parameter update.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()

    correct += (logits.argmax(1) == label).sum().item()
    seen += label.size(0)

    # Report every `log_interval` batches (never on batch 0), then reset the window.
    if batch_idx > 0 and batch_idx % log_interval == 0:
      elapsed = time.time() - window_start
      print('| epoch {:3d} | {:5d}/{:5d} batches '
            '| accuracy {:8.3f} | time {:5.3f}'.format(epoch, batch_idx, len(dataloader),
                                        correct/seen, elapsed))
      correct, seen = 0, 0
      window_start = time.time()

def evaluate(dataloader, criterion, h):
  """Return the fraction of correctly classified samples in `dataloader`.

  Uses the module-level `model`, switched to eval mode, with gradient
  tracking disabled.

  Args:
    dataloader: iterable of (text, label) batches.
    criterion: loss callable; it is evaluated per batch but its value is not returned.
    h: hidden-state tuple handed to the model on every batch.
  """
  model.eval()
  correct = 0
  seen = 0

  with torch.no_grad():
    for text, label in dataloader:
      logits = model(text, h)
      loss = criterion(logits, label)
      hits = logits.argmax(1) == label
      correct += hits.sum().item()
      seen += label.size(0)
  return correct/seen

In [145]:
# Take independent copies of each split's label column before the split
# variables are overwritten with padded arrays in the next cell.
train_label, valid_label, test_label = (
  split["label"].copy(deep=True) for split in (train_data, valid_data, test_data)
)

In [146]:
# Replace each split (a DataFrame at this point) with its fixed-width matrix of
# token ids. The source tuple is built before any name is rebound, so each call
# still sees the original DataFrame.
train_data, valid_data, test_data = (
  pad_input(split["issue"].values) for split in (train_data, valid_data, test_data)
)

In [147]:
def _as_tensor_dataset(features, labels):
  """Wrap a padded id matrix and its label Series as a TensorDataset on `device`."""
  feature_tensor = torch.from_numpy(features).to(device)
  label_tensor = torch.from_numpy(labels.values).to(device)
  return TensorDataset(feature_tensor, label_tensor)

train_data = _as_tensor_dataset(train_data, train_label)
valid_data = _as_tensor_dataset(valid_data, valid_label)
test_data = _as_tensor_dataset(test_data, test_label)

In [232]:
# Model dimensions.
emsize = 64
hidden_dim = 512
vocab_size = len(vocab)
# One output logit per distinct value in the label column.
num_class = len(dataset["label"].unique())

model = LSTMClassifier(
  vocab_size=vocab_size,
  embed_dim=emsize,
  hidden_dim=hidden_dim,
  num_classes=num_class,
  n_layers=1,
)
model = model.to(device)

In [237]:
# Hyperparameters
EPOCHS = 10
LR = 0.01  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# The loop below calls scheduler.step() whenever validation accuracy falls below
# the best value so far, so the scheduler must exist; with it commented out that
# branch raised NameError. Each step multiplies the learning rate by 0.1.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# Best validation accuracy seen so far (None until the first epoch finishes).
total_accu = None

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# Accuracy does not depend on batch order, so evaluation loaders are not shuffled.
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    h = model.init_hidden(BATCH_SIZE)
    train(train_dataloader, criterion, h)
    h = model.init_hidden(BATCH_SIZE)
    accu_val = evaluate(valid_dataloader, criterion, h)
    if total_accu is not None and total_accu > accu_val:
        # Validation accuracy dropped below the best so far: decay the learning rate.
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 65)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 65)

| epoch   1 |    50/  247 batches | accuracy    0.761 | time 2.993
| epoch   1 |   100/  247 batches | accuracy    0.823 | time 2.855
| epoch   1 |   150/  247 batches | accuracy    0.798 | time 2.862
| epoch   1 |   200/  247 batches | accuracy    0.782 | time 2.915
-----------------------------------------------------------------
| end of epoch   1 | time: 14.60s | valid accuracy    0.641 
-----------------------------------------------------------------
| epoch   2 |    50/  247 batches | accuracy    0.871 | time 3.096
| epoch   2 |   100/  247 batches | accuracy    0.866 | time 3.053
| epoch   2 |   150/  247 batches | accuracy    0.868 | time 3.050
| epoch   2 |   200/  247 batches | accuracy    0.865 | time 3.048
-----------------------------------------------------------------
| end of epoch   2 | time: 15.28s | valid accuracy    0.669 
-----------------------------------------------------------------
| epoch   3 |    50/  247 batches | accuracy    0.929 | time 3.092
| epoch   3

In [238]:
# Final check on the held-out test split, reported as a percentage.
print('Checking the results of test dataset.')
test_hidden = model.init_hidden(BATCH_SIZE)
accu_test = evaluate(test_dataloader, criterion, test_hidden)
print('test accuracy {:8.2f}'.format(accu_test*100))

Checking the results of test dataset.
test accuracy    63.64


In [207]:
# Persist the whole trained model object to the mounted Drive folder.
MODEL_SAVE_PATH = '/content/drive/MyDrive/github-issue-bot/models/lstm.pth'
torch.save(model, MODEL_SAVE_PATH)