In [1]:
import copy
import json
import sys, os
import time
from bisect import bisect
from pathlib import Path

import matplotlib.pyplot as pl
import matplotlib.pyplot as plt  # the plotting cells refer to the conventional `plt` alias
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy import sparse
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupShuffleSplit
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import RobertaTokenizer


In [None]:
# Widen the pandas text display so long cell sources stay readable in outputs.
pd.set_option("display.width", 180)
pd.set_option("display.max_colwidth", 120)

# Root folder of the competition data, and the pretrained encoder that every
# pruned model variant below is derived from.
data_dir = Path('../input/AI4Code')
bert = AutoModel.from_pretrained("microsoft/codebert-base")

In [None]:
#Read dataset
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame indexed by cell id.

    Args:
        path: ``pathlib.Path`` of the notebook JSON file. Its stem (file name
            without extension) is stored in the ``id`` column of every row.

    Returns:
        A DataFrame whose index is named ``cell_id``, with the columns read
        from the file (``cell_type`` as category, ``source`` as str) plus the
        constant ``id`` column.
    """
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

In [None]:
#Get the Kendall tau correlation values
def count_inversions(a):
    """Count the pairs of elements of ``a`` that appear in descending order.

    Each new value is placed into a sorted list of the values seen so far;
    the number of earlier values that sort strictly after it is the number of
    inversions it contributes.
    """
    seen_sorted = []
    total = 0
    for value in a:
        slot = bisect(seen_sorted, value)
        # Everything to the right of `slot` is an earlier, larger value.
        total += len(seen_sorted) - slot
        seen_sorted.insert(slot, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Pooled Kendall tau between true and predicted orderings.

    Args:
        ground_truth: iterable of lists, each the true order of one instance.
        predictions: iterable of lists, each the predicted order of the same
            instance, made of the same items.

    Returns:
        ``1 - 4 * inversions / sum(n * (n - 1))`` accumulated over all
        instances.
    """
    inversion_sum = 0
    pair_budget = 0  # sum of n * (n - 1), i.e. twice the maximum inversions
    for truth_order, predicted_order in zip(ground_truth, predictions):
        # Express the prediction as positions in the true order.
        positions = list(map(truth_order.index, predicted_order))
        inversion_sum += count_inversions(positions)
        size = len(truth_order)
        pair_budget += size * (size - 1)
    return 1 - 4 * inversion_sum / pair_budget

In [2]:
#Load the first NUM_TRAIN training notebooks into one frame indexed by (id, cell_id)

NUM_TRAIN = 1000

paths_train = list((data_dir / 'train').glob('*.json'))
paths_train = paths_train[:NUM_TRAIN]
notebooks_train = list(map(read_notebook, tqdm(paths_train, desc='Train NBs')))

# Put the notebook id in front of the cell id, then sort by notebook only so
# the cells keep the order in which they were read.
df = pd.concat(notebooks_train).set_index('id', append=True)
df = df.swaplevel().sort_index(level='id', sort_remaining=False)


In [3]:
#Combine the order and ancestor CSV tables with the notebook cells into one table
# Read the correct cell order of every notebook as a Series indexed by notebook id.
# `read_csv(squeeze=True)` no longer exists in pandas 2.x, so the single-column
# frame is squeezed explicitly.
df_orders = (
    pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
    .squeeze('columns')
    .str.split()  # Split the string representation of cell_ids into a list
)

def get_ranks(base, derived):
    """Return, for every item of ``derived``, its position in ``base``."""
    return list(map(base.index, derived))

#nb

# Pair each notebook's correct order with the order in which its cells were read.
cells_as_read = df.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
df_orders_ = df_orders.to_frame().join(cells_as_read, how='right')

# For every notebook, the rank of each cell (as read) within the correct order.
ranks = {
    id_: {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}
    for id_, cell_order, cell_id in df_orders_.itertuples()
}

# One row per (id, cell_id) carrying that cell's rank.
df_ranks = pd.DataFrame.from_dict(ranks, orient='index').rename_axis('id')
df_ranks = df_ranks.apply(pd.Series.explode).set_index('cell_id', append=True)

df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')

# Flatten the cell table and attach ranks and ancestor information.
df = df.reset_index()
df = df.merge(df_ranks, on=["id", "cell_id"])
df = df.merge(df_ancestors, on=["id"])

In [None]:
#Normalize each cell's rank by the number of cells in its notebook
notebook_sizes = df.groupby("id")["cell_id"].transform("count")
df["pct_rank"] = df["rank"] / notebook_sizes

In [4]:
#Split the data into 2 sets, keeping rows that share an ancestor_id in the same set
NVALID = 1/3  # fraction of groups held out for validation

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

# split() yields positional row numbers, so select by position (iloc) instead
# of by index label; this stays correct even if df does not carry a 0..n-1 index.
train_df = df.iloc[train_ind].reset_index(drop=True)
val_df = df.iloc[val_ind].reset_index(drop=True)

In [7]:
#Keep only the markdown cells of each split; these are the rows the model is trained and scored on
train_df_mark = train_df.loc[lambda frame: frame["cell_type"] == "markdown"].reset_index(drop=True)
val_df_mark = val_df.loc[lambda frame: frame["cell_type"] == "markdown"].reset_index(drop=True)

In [67]:
#Get the size of modified networks
# presumably 7087872 is the parameter count of one full encoder layer and the
# summed terms are the tensors of its intermediate (feed-forward) part — TODO confirm
layer_params = 7087872
intermediate_params = sum([2359296,3072,2359296,768,768,768])
# 1 attention block
print(layer_params - intermediate_params)
# 1 intermediate block
print(intermediate_params)

In [59]:
#Get the parameters (in millions) of modified networks with 0..12 encoder layers
size_l = []
for i in range(13):
    # 38999808 is the part of the count that does not depend on the number of layers.
    millions = round((7087872 * i + 38999808) * 0.000001, 2)
    print(i, 'encoder layer: ', millions)
    size_l.append(millions)
size_l

In [63]:
#Size of the full 12-layer network relative to each network with fewer encoder layers
size_l_d = [round(size_l[12] / size_l[i], 2) for i in range(13)]
size_l_d

In [39]:
#Get the parameters of modified networks with 0..12 intermediate blocks (all 12 attention blocks kept)
size_i = []
intermediate_params = sum([2359296,3072,2359296,768,768,768])
attention_params = 7087872 - intermediate_params
for i in range(13):
    total = attention_params * 12 + 38999808 + intermediate_params * i
    print(i, 'intermediate block: ', total)
    # The list stores the count in millions, rounded to two decimals.
    size_i.append(round(total * 0.000001, 2))
size_i

In [41]:
#Get the parameters of modified networks with 0..12 self-attention blocks (all 12 intermediate blocks kept)
size_a = []
intermediate_params = sum([2359296,3072,2359296,768,768,768])
attention_params = 7087872 - intermediate_params
for i in range(13):
    total = intermediate_params * 12 + 38999808 + attention_params * i
    print(i, 'self-attention block: ', total)
    # The list stores the count in millions, rounded to two decimals.
    size_a.append(round(total * 0.000001, 2))
size_a

In [48]:
#Size of the full network relative to each network with fewer self-attention blocks
size_a_d = [round(size_a[12] / size_a[i], 2) for i in range(13)]
size_a_d

In [10]:
#Get the different intermediate blocks
def deletelayers(model, num_layers_to_keep,n):
    """Return a copy of ``model`` in which only the first layers stay complete.

    The first ``num_layers_to_keep`` entries of ``model.encoder.layer`` are
    kept whole; every later entry is replaced by its ``.attention`` sub-module
    alone, so its intermediate part is dropped.

    The model is deep-copied *before* the new layer list is assembled, so the
    returned network owns its own parameters. Building the list from the
    input model's layers would make every pruned variant share (and train)
    the same weights as ``model``.

    Args:
        model: module exposing ``encoder.layer`` (an indexable list of layers
            that each have an ``attention`` attribute).
        num_layers_to_keep: number of leading layers kept complete.
        n: unused; accepted so all ``deletelayers*`` helpers share a signature.

    Returns:
        The pruned, independent copy. ``model`` itself is left untouched.
    """
    pruned = copy.deepcopy(model)
    source_layers = pruned.encoder.layer
    kept = nn.ModuleList()

    for idx in range(num_layers_to_keep):
        kept.append(source_layers[idx])

    # Run up to the real layer count so the last layer is not silently dropped.
    for idx in range(num_layers_to_keep, len(source_layers)):
        kept.append(source_layers[idx].attention)

    pruned.encoder.layer = kept
    return pruned

In [11]:
#Get the different self-attention blocks
def deletelayers_attention(model, num_layers_to_keep,n):
    """Return a copy of ``model`` whose later layers lose their attention part.

    The first ``num_layers_to_keep`` entries of ``model.encoder.layer`` are
    kept whole; for every later entry only its ``.intermediate`` and
    ``.output`` sub-modules are kept, appended as two consecutive entries.

    The model is deep-copied *before* the new layer list is assembled, so the
    returned network owns its own parameters instead of sharing them with
    ``model``.

    Args:
        model: module exposing ``encoder.layer`` (an indexable list of layers
            that each have ``intermediate`` and ``output`` attributes).
        num_layers_to_keep: number of leading layers kept complete.
        n: unused; accepted so all ``deletelayers*`` helpers share a signature.

    Returns:
        The pruned, independent copy. ``model`` itself is left untouched.
    """
    pruned = copy.deepcopy(model)
    source_layers = pruned.encoder.layer
    kept = nn.ModuleList()

    for idx in range(num_layers_to_keep):
        kept.append(source_layers[idx])

    # Run up to the real layer count so the last layer is not silently dropped.
    # NOTE(review): the two sub-modules become separate list entries; whether
    # the encoder can call each of them with its per-layer arguments is not
    # visible here — confirm before relying on this variant.
    for idx in range(num_layers_to_keep, len(source_layers)):
        kept.append(source_layers[idx].intermediate)
        kept.append(source_layers[idx].output)

    pruned.encoder.layer = kept
    return pruned

In [12]:
#Get the different encoder layers
def deletelayers_layer(model, num_layers_to_keep,n):
    """Return a copy of ``model`` truncated to its first encoder layers.

    Only the first ``num_layers_to_keep`` entries of ``model.encoder.layer``
    are kept. The model is deep-copied *before* the new layer list is
    assembled, so the returned network owns its own parameters instead of
    sharing them with ``model``.

    Args:
        model: module exposing ``encoder.layer`` (an indexable list of layers).
        num_layers_to_keep: number of leading layers to keep.
        n: unused; accepted so all ``deletelayers*`` helpers share a signature.

    Returns:
        The truncated, independent copy. ``model`` itself is left untouched.
    """
    pruned = copy.deepcopy(model)
    source_layers = pruned.encoder.layer
    kept = nn.ModuleList()

    for idx in range(num_layers_to_keep):
        kept.append(source_layers[idx])

    pruned.encoder.layer = kept
    return pruned

In [13]:
##Get the different combination of intermediate blocks and encoder layers
def deletelayers_layer_attention(model, num_layers_to_keep,n):
    """Return a copy of ``model`` cut to ``n`` layers, only the first ones complete.

    The first ``num_layers_to_keep`` entries of ``model.encoder.layer`` are
    kept whole; entries ``num_layers_to_keep .. n-1`` are replaced by their
    ``.attention`` sub-module alone; entries from ``n`` on are dropped.

    The model is deep-copied *before* the new layer list is assembled, so the
    returned network owns its own parameters instead of sharing them with
    ``model``.

    Args:
        model: module exposing ``encoder.layer`` (an indexable list of layers
            that each have an ``attention`` attribute).
        num_layers_to_keep: number of leading layers kept complete.
        n: total number of entries taken from the original layer list.

    Returns:
        The pruned, independent copy. ``model`` itself is left untouched.
    """
    pruned = copy.deepcopy(model)
    source_layers = pruned.encoder.layer
    kept = nn.ModuleList()

    for idx in range(num_layers_to_keep):
        kept.append(source_layers[idx])

    for idx in range(num_layers_to_keep, n):
        kept.append(source_layers[idx].attention)

    pruned.encoder.layer = kept
    return pruned

In [14]:
#Build a model
class MarkdownModel(nn.Module):
    """Pruned encoder followed by a two-layer head that outputs a value in (0, 1).

    Args:
        deletemodel: callable ``(model, num_intermediate_to_keep, num_layer)``
            that returns the pruned encoder; it is applied to the module-level
            ``bert`` model.
        num_intermediate_to_keep: forwarded to ``deletemodel`` as its second argument.
        num_layer: forwarded to ``deletemodel`` as its third argument.
    """

    def __init__(self,deletemodel,num_intermediate_to_keep, num_layer):
        super().__init__()
        self.num_intermediate_to_keep = num_intermediate_to_keep
        self.num_layer = num_layer
        self.deletemodel = deletemodel
        # Attribute name kept as-is; the wrapped network is whatever `deletemodel` returns.
        self.distill_bert = deletemodel(bert, num_intermediate_to_keep, num_layer)
        self.top1 = nn.Linear(768, 64)
        self.top2 = nn.Linear(64, 1)

        self.dropout1 = nn.Dropout(p=0.2)
        self.dropout2 = nn.Dropout(p=0.2)

    def forward(self, ids, mask):
        """Score a batch: encoder -> state at position 0 -> dropout/linear x2 -> sigmoid."""
        # First encoder output, sliced at sequence position 0.
        first_token_state = self.distill_bert(ids, mask)[0][:, 0, :]
        hidden = self.top1(self.dropout1(first_token_state))
        score = self.top2(self.dropout2(hidden))
        return torch.sigmoid(score)

In [15]:
#Customize a dataset
class MarkdownDataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask, target)`` for each row.

    Args:
        df: frame with a ``source`` column (text to tokenize) and a
            ``pct_rank`` column (regression target).
        max_len: length every sequence is padded / truncated to.
    """

    def __init__(self, df, max_len):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base", do_lower_case=False)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        record = self.df.iloc[index]

        # Single-sequence encoding (no text pair), fixed to max_len tokens.
        encoded = self.tokenizer.encode_plus(
            record.source,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True,
        )
        token_ids = torch.tensor(encoded['input_ids'], dtype=torch.long)
        attention = torch.tensor(encoded['attention_mask'], dtype=torch.long)
        target = torch.tensor([record.pct_rank], dtype=torch.float32)

        return token_ids, attention, target
    

In [None]:
#Get dataloader
# Hyper-parameters are defined first so the cell runs on a fresh kernel:
# the datasets below need MAX_LEN at construction time.
BS = 32        # batch size
NW = 2         # DataLoader worker processes
MAX_LEN = 128  # tokens per sequence after padding / truncation

train_ds = MarkdownDataset(train_df_mark, max_len=MAX_LEN)
val_ds = MarkdownDataset(val_df_mark, max_len=MAX_LEN)

# Training batches are shuffled and the last incomplete batch is dropped;
# validation keeps every row in its original order.
train_loader = DataLoader(train_ds, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

In [16]:
#Learning-rate schedule (by epoch) and optimizer
def adjust_lr(optimizer, epoch):
    """Set the learning rate of every parameter group for ``epoch`` and return it.

    Schedule: 5e-5 while ``epoch < 1``, 1e-3 while ``epoch < 2``,
    1e-4 while ``epoch < 5``, and 1e-5 afterwards.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((1, 5e-5), (2, 1e-3), (5, 1e-4))
    lr = next((rate for bound, rate in schedule if epoch < bound), 1e-5)

    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr
    
def get_optimizer(net):
    """Build an Adam optimizer over the parameters of ``net`` that require gradients."""
    trainable = [param for param in net.parameters() if param.requires_grad]
    return torch.optim.Adam(trainable, lr=3e-4, betas=(0.9, 0.999), eps=1e-08)


In [19]:
#Train and test data
def read_data(data):
    """Move a batch to the GPU and split it into ``(inputs, target)``.

    Every element of ``data`` except the last goes into the ``inputs`` tuple;
    the last element is the target. ``.cuda()`` is called on each of them.
    """
    features = tuple(item.cuda() for item in data[:-1])
    target = data[-1].cuda()
    return features, target


def validate(model, val_loader):
    """Run ``model`` over ``val_loader`` in eval mode without gradients.

    Returns:
        ``(labels, preds)``: two flat numpy arrays concatenated over all
        batches, in loader order.
    """
    model.eval()

    progress = tqdm(val_loader, file=sys.stdout)

    collected_labels = []
    collected_preds = []

    with torch.no_grad():
        for batch in progress:
            features, target = read_data(batch)
            output = model(features[0], features[1])

            collected_preds.append(output.detach().cpu().numpy().ravel())
            collected_labels.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(collected_labels), np.concatenate(collected_preds)

def train(model, train_loader, val_loader, epochs):
    """Train ``model`` with MSE loss and evaluate it after every epoch.

    Args:
        model: network called as ``model(ids, mask)``.
        train_loader: loader of training batches; also re-used for the
            per-epoch training-set evaluation.
        val_loader: loader of validation batches.
        epochs: number of epochs to run.

    Returns:
        ``(model, y_pred, train_mse, val_mse, seconds)`` where ``y_pred`` are
        the validation predictions of the last epoch, the two MSE values are
        rounded to 4 decimals, and ``seconds`` is the wall-clock time of the
        last epoch (training plus both evaluations). With ``epochs == 0`` the
        predictions and errors are ``None`` and ``seconds`` is ``0.0``.
    """
    # Seed torch as well as numpy: batch shuffling and dropout draw from
    # torch's generator, so numpy's seed alone does not make runs repeatable.
    np.random.seed(0)
    torch.manual_seed(0)

    optimizer = get_optimizer(model)
    criterion = torch.nn.MSELoss()

    # Defined up front so the return statement is valid even with zero epochs.
    y_pred = None
    train_mse = None
    val_mse = None
    elapsed = 0.0

    for e in range(epochs):
        start = time.time()
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, e)

        loss_list = []

        for data in tbar:
            inputs, target = read_data(data)

            optimizer.zero_grad()
            pred = model(inputs[0], inputs[1])

            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()

            loss_list.append(loss.item())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")

        y_train, y_pred_train = validate(model, train_loader)
        y_val, y_pred = validate(model, val_loader)

        elapsed = time.time() - start

        # Compute each error once and reuse it for printing and returning.
        train_mse = np.round(mean_squared_error(y_train, y_pred_train), 4)
        val_mse = np.round(mean_squared_error(y_val, y_pred), 4)
        print("Training MSE:", train_mse)
        print("Validation MSE:", val_mse)
        print('Running time:', elapsed)
        print()
    return model, y_pred, train_mse, val_mse, elapsed

In [22]:
#Sweep 2..7 intermediate blocks (complete layers) inside an 8-layer network
train_error_n = []
vali_error_n = []
kt_error_n = []
time_list_n = []
for num_layers_to_keep in range(2, 8):
    # The swept value has to reach the model; with a constant here every
    # iteration would train the same architecture.
    model = MarkdownModel(deletelayers_layer_attention, num_layers_to_keep, 8)
    model = model.cuda()
    model, y_pred, train_MSE, vali_MSE, time_n = train(model, train_loader, val_loader, epochs=1)
    train_error_n.append(train_MSE)
    vali_error_n.append(vali_MSE)
    time_list_n.append(time_n)
    # Note: the same file is overwritten on every iteration.
    torch.save(model, 'codebert-trained2.pkl')
    # Code cells keep their true relative order; markdown cells take the model's prediction.
    val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
    val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
    # Predicted cell order of every validation notebook.
    y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
    kt_error_n.append(kendall_tau(df_orders.loc[y_dummy.index], y_dummy))
print(kt_error_n)

In [32]:
#Plot Kendall tau correlation against the number of intermediate blocks (8 encoder layers)
fig, ax = plt.subplots()
x = np.arange(2,8,1)
ax.plot(x, kt_error_n, label='Kendall Tau Correlation')
ax.set_xlim(2, 7)
ax.set_xlabel('the Numbers of Intermediate Blocks with 8 Encoder Layers')
ax.set_ylabel('Kendall Tau Correlation Values')

In [137]:
#Get errors of the different numbers of intermediate blocks with 12 encoder layers
train_error = []
vali_error = []
kt_error = []
time_list = []
for num_layers_to_keep in range(13):
    model = MarkdownModel(deletelayers, num_layers_to_keep,0)
    model = model.cuda()
    model, y_pred, train_MSE, vali_MSE, time_n = train(model, train_loader, val_loader, epochs=1)
    train_error.append(train_MSE)
    vali_error.append(vali_MSE)
    time_list.append(time_n)
    # Note: the same file is overwritten on every iteration.
    torch.save(model, 'codebert-trained2.pkl')
    # Code cells keep their true relative order; markdown cells take the model's prediction.
    val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
    val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
    # Predicted cell order of every validation notebook.
    y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
    kt_error.append(kendall_tau(df_orders.loc[y_dummy.index], y_dummy))
    # The torch.cuda memory counters are in bytes; convert so the KB label is true.
    print("torch.cuda.memory_allocated: %fKB"%(torch.cuda.memory_allocated(0) / 1024))
    print("torch.cuda.memory_reserved: %fKB"%(torch.cuda.memory_reserved(0) / 1024))
    print("torch.cuda.max_memory_reserved: %fKB"%(torch.cuda.max_memory_reserved(0) / 1024))
    
#print(np.mean(train_error)) 
#print(np.mean(vali_error))

In [169]:
#Get the relative time decrease, versus the full 12-block run, for each number of intermediate blocks
t_ans = [(time_list[12] - time_list[i]) / time_list[12] for i in range(13)]
t_ans

In [34]:
#Get the relative size decrease, versus the full network, for the different numbers of intermediate blocks
size_i = [67.37,72.09,76.81,81.54,86.26,90.99,95.71,100.43,105.16,109.88,114.61,119.33,124.05]
# NOTE(review): the values are produced from 12 blocks down to 0, so
# si_ans[k] belongs to 12 - k blocks — confirm this ordering is intended
# wherever the list is plotted against a block count.
si_ans = [(size_i[12] - size_i[i]) / size_i[12] for i in reversed(range(13))]
si_ans

In [45]:
# Overlay correlation, timing and size series for the intermediate-block sweep.
fig, ax = plt.subplots()
x = np.arange(0,13,1)
for series, name in (
    (kt_error, 'Kendall Tau Correlation'),
    (t_ans, 'Running Time Increase'),
    (si_ans, 'Size Decrease'),
):
    ax.plot(series, label=name)
ax.legend(loc='upper left')
ax.set_xlabel('the Numbers of Intermediate Blocks')
ax.set_ylabel('Values')

In [190]:
#Get errors of the different numbers of encoder layers (1..12)
train_error_l, vali_error_l, kt_error_l, time_list_l = [], [], [], []
for num_layers_to_keep in range(1, 13):
    model = MarkdownModel(deletelayers_layer, num_layers_to_keep, 12).cuda()
    model, y_pred, train_MSE, vali_MSE, time_n = train(model, train_loader, val_loader, epochs=1)
    train_error_l.append(train_MSE)
    vali_error_l.append(vali_MSE)
    time_list_l.append(time_n)
    # Note: the same file is overwritten on every iteration.
    torch.save(model, 'codebert-trained2.pkl')
    # Code cells keep their true relative order; markdown cells take the model's prediction.
    val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
    val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
    # Predicted cell order of every validation notebook.
    y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
    kt_error_l.append(kendall_tau(df_orders.loc[y_dummy.index], y_dummy))

In [29]:
# Kendall tau correlation for the self-attention-block sweep.
# NOTE(review): kt_error_a is not assigned in any cell visible here — confirm
# the cell that produces it has been run before this one.
fig, ax = plt.subplots()
x = np.linspace(0,13)
# The line carries a label so the legend below has an entry to show.
ax.plot(kt_error_a, label='Kendall Tau Correlation')
ax.legend(loc='upper left')
plt.xlabel('the Numbers of Self-attention Blocks')
plt.ylabel('Kendall Tau Correlation')

In [89]:
# Kendall tau correlation for the encoder-layer sweep, plotted against list position.
y1 = kt_error_l
fig, ax = plt.subplots()
ax.plot(y1, label='linear')

In [65]:
#Get the running time of the largest network relative to each number of encoder layers
# time_list_l holds one entry per trained configuration, so the largest
# network is its last element; a fixed index 12 is out of range when the
# sweep covers 12 configurations.
time_l_d = [round(time_list_l[-1] / t, 2) for t in time_list_l]
time_l_d