In [4]:
# Mount Google Drive at /content/drive; the dataset and model paths used later
# in this notebook live under that mount point. force_remount=True asks Colab
# to mount again even when a mount already exists. Colab-only.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [82]:
import numpy as np
import pandas as pd
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from sklearn.model_selection import train_test_split
import time

In [33]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [11]:
# Load the issue dataset from the mounted Drive folder (requires the Drive
# mount at /content/drive to have succeeded).
df = pd.read_csv("/content/drive/MyDrive/github-issue-bot/saif_processed.csv")

In [12]:
# Peek at the first rows to check which columns were loaded.
df.head()

Unnamed: 0,issue,label,label_name
0,append new column neccesary bot append new col...,0,bug
1,student currentsemester would show sections st...,0,bug
2,duplicate articles toc causes weird behavior t...,0,bug
3,fix typo collection finder py summary describe...,0,bug
4,zimagi dbshell find psql moreover looks even p...,0,bug


In [141]:
# Build the vocabulary by counting every token of every issue text.
tokenizer = get_tokenizer('basic_english')
counter = Counter()
# Iterate the column directly: iterrows() would build a full Series per row
# only to read one field. str() mirrors the np.str_ coercion applied when the
# `dataset` frame is built, so a non-string cell (e.g. NaN) is tokenized as
# text instead of crashing the tokenizer.
for issue_text in df["issue"]:
  counter.update(tokenizer(str(issue_text)))
# min_freq=1 keeps every token that occurs at least once.
vocab = Vocab(counter, min_freq=1)

In [30]:
def text_pipeline(x):
  """Tokenize the string ``x`` and return the vocabulary index of each token.

  Relies on the notebook-level ``tokenizer`` and ``vocab`` objects.
  """
  token_ids = []
  for token in tokenizer(x):
    token_ids.append(vocab[token])
  return token_ids

In [31]:
# Sanity check: show one raw issue text followed by its token-id encoding.
sample_issue = df["issue"][3]
print(sample_issue)
print(text_pipeline(sample_issue))

fix typo collection finder py summary describe change including rationale design decisions fixed minor typo alway always hint include fixes nnn fixing existing issue issue type pick one delete rest bugfix pull request component name write short name module plugin task feature additional information include additional information help people understand change step step reproduction problem helpful related issue paste verbatim command output e g change paste
[69, 1461, 633, 7992, 15, 262, 48, 37, 699, 2189, 113, 2137, 343, 675, 1461, 32637, 416, 1684, 156, 188, 2847, 1141, 196, 8, 8, 24, 1291, 52, 257, 569, 1609, 131, 27, 124, 17, 324, 793, 17, 78, 143, 200, 30, 112, 77, 156, 112, 77, 83, 645, 536, 37, 286, 286, 1619, 73, 741, 109, 8, 522, 2260, 115, 145, 11, 125, 37, 522]


In [142]:
# Working copy holding only the model input ("issue") and target ("label").
# Every issue is converted to a numpy string so downstream code always
# receives text.
dataset = (
    df[["issue", "label"]]
    .copy(deep=True)
    .assign(issue=lambda frame: frame["issue"].apply(np.str_))
)

In [149]:
# Split into train / validation / test sets.
# 15% of the rows are held out for testing; 5% of the remaining rows (about
# 4.25% of the total) become the validation set, leaving about 80.75% for
# training.
# A fixed seed makes the split, and therefore every accuracy reported below,
# reproducible across kernel restarts.
SPLIT_SEED = 42
train_data, test_data = train_test_split(dataset, test_size=0.15, random_state=SPLIT_SEED)
train_data, valid_data = train_test_split(train_data, test_size=0.05, random_state=SPLIT_SEED)
# Renumber the rows of each split from 0. Plain assignment (rather than
# inplace=True) keeps this cell idempotent when it is re-run.
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [84]:
def collate_batch(batch):
  """Collate ``(text, label)`` samples into the tensors the model consumes.

  Each text is encoded with the notebook-level ``text_pipeline``; the encoded
  texts are concatenated into one flat tensor, and ``offsets`` records the
  position in that tensor where each sample starts.

  Returns:
    ``(labels, flat_token_ids, offsets)``, all int64 and moved to ``device``.
  """
  labels, encoded_texts, lengths = [], [], []
  for sample_text, sample_label in batch:
    labels.append(sample_label)
    token_ids = torch.tensor(text_pipeline(sample_text), dtype=torch.int64)
    encoded_texts.append(token_ids)
    lengths.append(token_ids.size(0))

  label_tensor = torch.tensor(labels, dtype=torch.int64)
  # Start of sample i = total length of samples 0..i-1 (the last length is
  # never needed, hence the [:-1]).
  offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
  flat_token_ids = torch.cat(encoded_texts)
  return label_tensor.to(device), flat_token_ids.to(device), offsets.to(device)

In [144]:
class TextClassificationModel(nn.Module):
  """Text classifier: an ``nn.EmbeddingBag`` followed by one linear layer.

  Args:
    vocab_size: number of rows in the embedding table.
    embed_dim: width of each embedding vector.
    num_class: number of output scores produced per text.
  """

  def __init__(self, vocab_size, embed_dim, num_class):
    super().__init__()
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
    self.fc = nn.Linear(embed_dim, num_class)
    self.init_weights()

  def init_weights(self):
    """Draw both weight matrices uniformly from [-0.5, 0.5]; zero the bias."""
    bound = 0.5
    nn.init.uniform_(self.embedding.weight, -bound, bound)
    nn.init.uniform_(self.fc.weight, -bound, bound)
    nn.init.zeros_(self.fc.bias)

  def forward(self, text, offsets):
    """Return one row of class scores per bag.

    ``text`` is the flat tensor of token ids for the whole batch and
    ``offsets`` marks where each bag starts inside it.
    """
    pooled = self.embedding(text, offsets)
    scores = self.fc(pooled)
    return scores

In [145]:
# Size the model from the data: one output per distinct label value seen in
# the training split and one embedding row per vocabulary entry.
num_class = train_data["label"].nunique(dropna=False)
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(
    vocab_size=vocab_size,
    embed_dim=emsize,
    num_class=num_class,
).to(device)

In [146]:
def train(dataloader, criterion):
  """Run one training pass over ``dataloader``.

  Uses the notebook-level ``model`` and ``optimizer``, and reads the global
  ``epoch`` only to label the progress lines. Every 50 batches (skipping
  batch 0) it prints the accuracy and wall time accumulated since the previous
  report, then resets both. Returns nothing.
  """
  model.train()
  log_interval = 50
  correct, seen = 0, 0
  window_start = time.time()

  for batch_idx, (label, text, offsets) in enumerate(dataloader):
    # Forward pass and loss.
    predicted_label = model(text, offsets)
    loss = criterion(predicted_label, label)

    # Backward pass; gradients are clipped to a total norm of 0.1 before the
    # optimizer update.
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()

    correct += (predicted_label.argmax(1) == label).sum().item()
    seen += label.size(0)

    # Only report on every log_interval-th batch, never on the first one.
    if batch_idx == 0 or batch_idx % log_interval != 0:
      continue

    elapsed = time.time() - window_start
    print('| epoch {:3d} | {:5d}/{:5d} batches '
          '| accuracy {:8.3f} | time {:5.3f}'.format(epoch, batch_idx, len(dataloader),
                                      correct/seen, elapsed))
    correct, seen = 0, 0
    window_start = time.time()

def evaluate(dataloader, criterion):
  """Return the accuracy of the notebook-level ``model`` over ``dataloader``.

  Args:
    dataloader: iterable of ``(label, text, offsets)`` batches.
    criterion: accepted so existing call sites keep working; accuracy does not
      depend on the loss, so it is not evaluated here.

  Returns:
    Fraction of samples whose arg-max prediction equals the label, or ``0.0``
    when the dataloader yields no samples (instead of dividing by zero).
  """
  model.eval()
  total_acc, total_count = 0, 0

  # Inference only: skip gradient bookkeeping.
  with torch.no_grad():
    for label, text, offsets in dataloader:
      predicted_label = model(text, offsets)
      total_acc += (predicted_label.argmax(1) == label).sum().item()
      total_count += label.size(0)

  if total_count == 0:
    return 0.0
  return total_acc/total_count

In [150]:
# Hyperparameters
EPOCHS = 10
LR = 1  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# With a step size of 1, each scheduler.step() call scales the learning rate
# by gamma.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)

# Highest validation accuracy seen so far (None until the first epoch ends).
total_accu = None

# All three loaders share the same batching, shuffling and collate settings.
loader_options = dict(batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
train_dataloader = DataLoader(train_data.values, **loader_options)
valid_dataloader = DataLoader(valid_data.values, **loader_options)
test_dataloader = DataLoader(test_data.values, **loader_options)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, criterion)
    accu_val = evaluate(valid_dataloader, criterion)
    # Decay the learning rate only when validation accuracy got worse;
    # otherwise record the new reference accuracy.
    if total_accu is None or total_accu <= accu_val:
        total_accu = accu_val
    else:
        scheduler.step()
    separator = '-' * 65
    print(separator)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print(separator)

| epoch   1 |    50/  247 batches | accuracy    0.717 | time 0.328
| epoch   1 |   100/  247 batches | accuracy    0.719 | time 0.333
| epoch   1 |   150/  247 batches | accuracy    0.717 | time 0.338
| epoch   1 |   200/  247 batches | accuracy    0.716 | time 0.332
-----------------------------------------------------------------
| end of epoch   1 | time:  1.70s | valid accuracy    0.708 
-----------------------------------------------------------------
| epoch   2 |    50/  247 batches | accuracy    0.722 | time 0.345
| epoch   2 |   100/  247 batches | accuracy    0.732 | time 0.336
| epoch   2 |   150/  247 batches | accuracy    0.721 | time 0.291
| epoch   2 |   200/  247 batches | accuracy    0.737 | time 0.332
-----------------------------------------------------------------
| end of epoch   2 | time:  1.70s | valid accuracy    0.711 
-----------------------------------------------------------------
| epoch   3 |    50/  247 batches | accuracy    0.728 | time 0.336
| epoch   3

In [151]:
# Final check on the held-out test split, reported as a percentage.
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader, criterion)
accu_test_percent = accu_test*100
print('test accuracy {:8.2f}'.format(accu_test_percent))

Checking the results of test dataset.
test accuracy    71.32


In [159]:
# Map every integer label id to its human-readable name.
# The mapping is derived from the data instead of assuming exactly six ids
# (0..5): a missing id no longer raises IndexError and ids above 5 are no
# longer silently dropped. For each id the first label_name encountered is
# used, matching the previous `.unique()[0]` choice.
label_lookup = (
    df[["label", "label_name"]]
    .dropna(subset=["label"])
    .drop_duplicates(subset="label")
    .sort_values("label")
)
classes = {
    int(label_id): label_name
    for label_id, label_name in zip(label_lookup["label"], label_lookup["label_name"])
}

In [160]:
# Display the label-id -> label-name mapping.
classes

{0: 'bug',
 1: 'design',
 2: 'documentation',
 3: 'feature',
 4: 'help',
 5: 'question'}

In [167]:
def predict(text, text_pipeline):
    """Return the predicted integer label id for a single piece of text.

    Args:
        text: raw text to classify.
        text_pipeline: callable turning ``text`` into a list of token ids.

    Returns:
        The index of the highest-scoring class, as a Python int.

    Uses the notebook-level ``model``.
    """
    # Build the inputs on whatever device the model currently lives on, so the
    # function works whether or not the model has been moved to the CPU.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        # An explicit int64 dtype matters for empty input: torch.tensor([])
        # would otherwise default to float32, which EmbeddingBag rejects.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64, device=model_device)
        # A single bag that starts at position 0 of token_ids.
        offsets = torch.tensor([0], dtype=torch.int64, device=model_device)
        output = model(token_ids, offsets)
        return output.argmax(1).item()

In [162]:
# Move the trained model to the CPU before the inference and save cells below.
model = model.to("cpu")

In [172]:
# Smoke-test the trained model on a hand-written example and show the
# predicted label name.
classes[predict("This is a bug", text_pipeline)]

'bug'

In [174]:
# Persist the whole model object (not just its state_dict) to Google Drive.
# NOTE(review): a file saved this way is a pickle of the module, so loading it
# needs the TextClassificationModel class to be importable and should only be
# done with trusted files — confirm the consumer of model.pth accounts for this.
torch.save(model, '/content/drive/MyDrive/github-issue-bot/models/model.pth')