In [61]:
import operator
import os
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
import multiset

from utils import load_embeddings, load_projects, load_stopwords

In [62]:
# Read the project table (and the id mapping returned with it), then index each
# project's label id by project name.
projects, imapping = load_projects('.', 'java-projects - java-projects (1).csv')
labels = dict(zip(projects['names'], projects['labels_id']))

In [63]:
# Start from the Java stop-word list ...
path = "resources/java/stopwords.txt"
stopwords = load_stopwords(path)
# ... and fold the English stop-word list into the same collection.
path = "resources/en/stopwords.txt"
english_stopwords = load_stopwords(path)
stopwords.update(english_stopwords)

In [64]:

# Directory that is searched for one "<project>.vec" term-count file per project.
terms_path = '../data/embeddings/terms-count/'
# Inverse of imapping: each value of imapping becomes a key pointing back at its key.
mapping = dict(zip(imapping.values(), imapping.keys()))

In [65]:
from textblob import  Word

# Per-category counters; this cell only creates them, it does not fill them.
category_terms_count = defaultdict(Counter)
category_terms_occ = defaultdict(Counter)

# text[i] is the space-joined bag of lemmas for one project and lab[i] its
# category; both are appended together so the two lists stay aligned.
text, lab = [], []
for project, label_id in labels.items():
    category = mapping[label_id]
    try:
        terms_count = load_embeddings(os.path.join(terms_path, f"{project}.vec"))
        terms = []
        for term, values in terms_count.items():
            lemma = Word(term).lemmatize()
            # Skip one-character terms and anything that is a stop word either
            # in its raw or in its lemmatised form.
            if len(term) <= 1 or term in stopwords or lemma in stopwords:
                continue
            # Repeat the lemma as many times as the first stored value says.
            terms.extend([lemma] * int(values[0]))
        text.append(" ".join(terms))
        lab.append(category)
    except Exception as e:
        # Best effort: a project that cannot be loaded is reported and skipped.
        print(e)

[Errno 2] No such file or directory: '../data/embeddings/terms-count/asm.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/infinispan.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/infer.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/source.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/org.aspectj.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/JGroups.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/checker-framework.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/libgdx.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/eclipse.platform.swt.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/reflectasm.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/flink.vec'
[Errno 2] No such file or directory: '../data/embeddings/terms-count/Smack.

In [66]:
import pandas

# One row per project whose term file could be loaded: the space-joined
# lemmatised terms ('content') and the project's category ('label').
df = pandas.DataFrame({'content': text, 'label': lab})
# Drop the catch-all categories. .copy() makes the result an independent frame
# so the column assignments below do not write into a filtered view.
df = df[~df['label'].isin(['NA', 'Miscellaneous'])].copy()

# The labels are consumed as torch.long class indices and fed to
# CrossEntropyLoss, which requires integers in [0, n_classes). Encode the
# remaining categories as contiguous ids; sorting makes the id assignment
# deterministic across runs. The readable name is kept in 'label_name'.
class_names = sorted(set(df['label'].tolist()))
class_to_id = {name: idx for idx, name in enumerate(class_names)}
df['label_name'] = df['label']
df['label'] = df['label_name'].map(class_to_id).astype('int64')

In [67]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [68]:
# Fix the NumPy and PyTorch random generators so runs are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [69]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it.
PRE_TRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [70]:
# Token budget handed to the tokenizer as max_length; longer inputs are truncated.
MAX_LEN = 512

In [71]:
# 90% of the rows go to training; the remaining 10% hold-out is then cut in
# half to give the validation and test sets. Both splits use the same seed.
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [72]:
class SourceCodeDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for classification.

  Args:
    code: indexable collection of texts (each item is passed through str()).
    targets: indexable collection of integer class ids, aligned with `code`.
    tokenizer: object exposing `encode_plus` with the keyword arguments used
      in `__getitem__`.
    max_len: length every encoded item is truncated / padded to.
  """

  def __init__(self, code, targets, tokenizer, max_len):
    self.code = code
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of items."""
    return len(self.code)

  def __getitem__(self, item):
    """Encode item `item`.

    Returns:
      dict with the raw text ('code_text'), the flattened 'input_ids' and
      'attention_mask' tensors, and the class id as a torch.long tensor
      ('targets').
    """
    code = str(self.code[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      code,
      add_special_tokens=True,
      max_length=self.max_len,
      truncation=True,
      return_token_type_ids=False,
      # Items are encoded one at a time, so padding=True ("pad to the longest
      # sequence in this call") would add no padding at all and items would
      # come out with different lengths, which the DataLoader's default
      # collation cannot stack. Pad every item to max_len instead.
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'code_text': code,
      # The tokenizer returns a leading batch dimension of 1; drop it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [73]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=1):
  """Wrap a frame's 'content' and 'label' columns in a DataLoader.

  Args:
    df: DataFrame with a 'content' column (texts) and a 'label' column
      (class ids).
    tokenizer: tokenizer forwarded to SourceCodeDataset.
    max_len: maximum encoded length forwarded to SourceCodeDataset.
    batch_size: number of items per batch.
    shuffle: reshuffle the items every epoch. Defaults to False, which keeps
      the previous fixed order; pass True for a training loader.
    num_workers: number of loader worker processes (default 1, as before).

  Returns:
    torch.utils.data.DataLoader over a SourceCodeDataset.
  """
  # to_numpy() discards the frame's index, so the dataset indexes by position.
  ds = SourceCodeDataset(
    code=df.content.to_numpy(),
    targets=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [74]:
BATCH_SIZE = 2
# One loader per split, all built with the same tokenizer, length and batch size.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_frame in (df_train, df_val, df_test)
)

In [75]:
class CategoryClassification(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear layer.

  Args:
    n_classes: number of output logits (one per category).
  """

  def __init__(self, n_classes):
    super(CategoryClassification, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the class logits for a batch.

    Args:
      input_ids: token ids for the batch.
      attention_mask: attention mask matching `input_ids`.

    Returns:
      Output of the linear layer applied to the dropped-out pooled encoding.
    """
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Take the pooled output by position rather than tuple-unpacking: when the
    # encoder returns a dict-like output object, unpacking iterates its keys
    # and yields strings instead of tensors. Index 1 works for both a plain
    # tuple and such an output object.
    pooled_output = bert_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)

In [76]:
# One output logit per distinct label; the model is moved to the selected device.
model = CategoryClassification(len(class_names)).to(device)

In [77]:
EPOCHS = 10

# NOTE(review): this is the AdamW imported from transformers; newer releases
# may no longer ship it - confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch; no warm-up steps are used.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

In [78]:
from tqdm import tqdm
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over `data_loader`.

  Each batch is a dict with "input_ids", "attention_mask" and "targets"
  tensors. For every batch the loss is back-propagated, gradients are clipped
  to a norm of 1.0, and the optimizer and scheduler are stepped.

  Returns:
    (accuracy, mean_loss): the count of correct predictions divided by
    `n_examples` (a float64 tensor) and the mean of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  correct_predictions = 0
  for batch in tqdm(data_loader):
    targets = batch["targets"].to(device)
    logits = model(
      input_ids=batch["input_ids"].to(device),
      attention_mask=batch["attention_mask"].to(device)
    )
    # Predicted class = position of the largest logit in each row.
    preds = torch.max(logits, dim=1).indices
    loss = loss_fn(logits, targets)
    correct_predictions += (preds == targets).sum()
    batch_losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = correct_predictions.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [79]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score `model` on `data_loader` without computing gradients.

  Each batch is a dict with "input_ids", "attention_mask" and "targets"
  tensors. The model is switched to eval mode for the pass.

  Returns:
    (accuracy, mean_loss): the count of correct predictions divided by
    `n_examples` (a float64 tensor) and the mean of the per-batch losses.
  """
  model = model.eval()
  batch_losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      targets = batch["targets"].to(device)
      logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device)
      )
      # Predicted class = position of the largest logit in each row.
      preds = torch.max(logits, dim=1).indices
      batch_losses.append(loss_fn(logits, targets).item())
      correct_predictions += (preds == targets).sum()

  accuracy = correct_predictions.double() / n_examples
  return accuracy, np.mean(batch_losses)




In [None]:
%%time
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model, val_data_loader, loss_fn, device, len(df_val)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  epoch_metrics = (
    ('train_acc', train_acc),
    ('train_loss', train_loss),
    ('val_acc', val_acc),
    ('val_loss', val_loss),
  )
  for metric_name, metric_value in epoch_metrics:
    history[metric_name].append(metric_value)

  # Checkpoint the weights whenever validation accuracy improves.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

  0%|          | 0/200 [00:21<?, ?it/s]


Epoch 1/10
----------
