In [3]:
from catboost import Pool, CatBoostClassifier
from utils import *
from sklearn.metrics import accuracy_score
import numpy as np 
from sklearn.utils.class_weight import compute_class_weight

In [20]:
# Load the snippet dataset and split it into train/test features and labels.
# NOTE(review): neither `pd` nor `download_train_test` is imported explicitly in
# the import cell; presumably both arrive through `from utils import *` - confirm.
df = pd.read_parquet('data/dataWithoutComments.parquet')
# Alternative input file, currently disabled:
# df = pd.read_parquet('data/df.parquet.gzip')

x_train, x_test, y_train, y_test = download_train_test(df)

In [21]:
# Ordered list of every language label. A label's position in this list is the
# integer class id it is given when the list is turned into a lookup dict, so the
# order must not change once a model has been trained against it.
enc_dict = [
    'TGLANG_LANGUAGE_OTHER',
    'TGLANG_LANGUAGE_1S_ENTERPRISE',
    'TGLANG_LANGUAGE_ABAP',
    'TGLANG_LANGUAGE_ACTIONSCRIPT',
    'TGLANG_LANGUAGE_ADA',
    'TGLANG_LANGUAGE_APACHE_GROOVY',
    'TGLANG_LANGUAGE_APEX',
    'TGLANG_LANGUAGE_APPLESCRIPT',
    'TGLANG_LANGUAGE_ASP',
    'TGLANG_LANGUAGE_ASSEMBLY',
    'TGLANG_LANGUAGE_AUTOHOTKEY',
    'TGLANG_LANGUAGE_AWK',
    'TGLANG_LANGUAGE_BASIC',
    'TGLANG_LANGUAGE_BATCH',
    'TGLANG_LANGUAGE_BISON',
    'TGLANG_LANGUAGE_C',
    'TGLANG_LANGUAGE_CLOJURE',
    'TGLANG_LANGUAGE_CMAKE',
    'TGLANG_LANGUAGE_COBOL',
    'TGLANG_LANGUAGE_COFFESCRIPT',
    'TGLANG_LANGUAGE_COMMON_LISP',
    'TGLANG_LANGUAGE_CPLUSPLUS',
    'TGLANG_LANGUAGE_CRYSTAL',
    'TGLANG_LANGUAGE_CSHARP',
    'TGLANG_LANGUAGE_CSS',
    'TGLANG_LANGUAGE_CSV',
    'TGLANG_LANGUAGE_D',
    'TGLANG_LANGUAGE_DART',
    'TGLANG_LANGUAGE_DELPHI',
    'TGLANG_LANGUAGE_DOCKER',
    'TGLANG_LANGUAGE_ELIXIR',
    'TGLANG_LANGUAGE_ELM',
    'TGLANG_LANGUAGE_ERLANG',
    'TGLANG_LANGUAGE_FIFT',
    'TGLANG_LANGUAGE_FORTH',
    'TGLANG_LANGUAGE_FORTRAN',
    'TGLANG_LANGUAGE_FSHARP',
    'TGLANG_LANGUAGE_FUNC',
    'TGLANG_LANGUAGE_GAMS',
    'TGLANG_LANGUAGE_GO',
    'TGLANG_LANGUAGE_GRADLE',
    'TGLANG_LANGUAGE_GRAPHQL',
    'TGLANG_LANGUAGE_HACK',
    'TGLANG_LANGUAGE_HASKELL',
    'TGLANG_LANGUAGE_HTML',
    'TGLANG_LANGUAGE_ICON',
    'TGLANG_LANGUAGE_IDL',
    'TGLANG_LANGUAGE_INI',
    'TGLANG_LANGUAGE_JAVA',
    'TGLANG_LANGUAGE_JAVASCRIPT',
    'TGLANG_LANGUAGE_JSON',
    'TGLANG_LANGUAGE_JULIA',
    'TGLANG_LANGUAGE_KEYMAN',
    'TGLANG_LANGUAGE_KOTLIN',
    'TGLANG_LANGUAGE_LATEX',
    'TGLANG_LANGUAGE_LISP',
    'TGLANG_LANGUAGE_LOGO',
    'TGLANG_LANGUAGE_LUA',
    'TGLANG_LANGUAGE_MAKEFILE',
    'TGLANG_LANGUAGE_MARKDOWN',
    'TGLANG_LANGUAGE_MATLAB',
    'TGLANG_LANGUAGE_NGINX',
    'TGLANG_LANGUAGE_NIM',
    'TGLANG_LANGUAGE_OBJECTIVE_C',
    'TGLANG_LANGUAGE_OCAML',
    'TGLANG_LANGUAGE_OPENEDGE_ABL',
    'TGLANG_LANGUAGE_PASCAL',
    'TGLANG_LANGUAGE_PERL',
    'TGLANG_LANGUAGE_PHP',
    'TGLANG_LANGUAGE_PL_SQL',
    'TGLANG_LANGUAGE_POWERSHELL',
    'TGLANG_LANGUAGE_PROLOG',
    'TGLANG_LANGUAGE_PROTOBUF',
    'TGLANG_LANGUAGE_PYTHON',
    'TGLANG_LANGUAGE_QML',
    'TGLANG_LANGUAGE_R',
    'TGLANG_LANGUAGE_RAKU',
    'TGLANG_LANGUAGE_REGEX',
    'TGLANG_LANGUAGE_RUBY',
    'TGLANG_LANGUAGE_RUST',
    'TGLANG_LANGUAGE_SAS',
    'TGLANG_LANGUAGE_SCALA',
    'TGLANG_LANGUAGE_SCHEME',
    'TGLANG_LANGUAGE_SHELL',
    'TGLANG_LANGUAGE_SMALLTALK',
    'TGLANG_LANGUAGE_SOLIDITY',
    'TGLANG_LANGUAGE_SQL',
    'TGLANG_LANGUAGE_SWIFT',
    'TGLANG_LANGUAGE_TCL',
    'TGLANG_LANGUAGE_TEXTILE',
    'TGLANG_LANGUAGE_TL',
    'TGLANG_LANGUAGE_TYPESCRIPT',
    'TGLANG_LANGUAGE_UNREALSCRIPT',
    'TGLANG_LANGUAGE_VALA',
    'TGLANG_LANGUAGE_VBSCRIPT',
    'TGLANG_LANGUAGE_VERILOG',
    'TGLANG_LANGUAGE_VISUAL_BASIC',
    'TGLANG_LANGUAGE_WOLFRAM',
    'TGLANG_LANGUAGE_XML',
    'TGLANG_LANGUAGE_YAML',
]


In [22]:
# Turn the ordered label list into a {language name: integer class id} lookup;
# the id is simply the label's position in the list.
enc_dict = {language: class_id for class_id, language in enumerate(enc_dict)}

In [23]:
def _encode_labels(labels, mapping):
    """Map language names to integer class ids, failing loudly on unknown names.

    Args:
        labels: pandas Series of language-name strings.
        mapping: dict from language name to integer class id.

    Returns:
        An int64 Series with the same index as ``labels``.

    Raises:
        ValueError: if any label has no entry in ``mapping``. Without this check
            such labels would silently become missing values and be passed on
            to training.
    """
    encoded = labels.map(mapping.get)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = labels[unmapped].unique().tolist()
        raise ValueError(f"labels missing from enc_dict: {unknown}")
    return encoded.astype('int64')


# NOTE: this rebinds y_train / y_test from label frames (with a 'language' column)
# to Series of integer class ids, so the cell can only be run once after the
# load/split cell; later cells must treat y_train / y_test as class ids.
y_train = _encode_labels(y_train['language'], enc_dict)
y_test = _encode_labels(y_test['language'], enc_dict)

In [24]:
# Column roles shared by both pools: no categorical columns, one raw-text column.
cat_features = []
text_features = ['code']


def _build_pool(features, labels):
    """Wrap features and labels in a CatBoost Pool using the shared column setup.

    Feature names are always taken from x_train's columns, for both pools.
    """
    return Pool(
        features,
        labels,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(x_train),
    )


learn_pool = _build_pool(x_train, y_train)
test_pool = _build_pool(x_test, y_test)

In [25]:
# "balanced" weights for the class ids present in the training labels, collected
# into a {class id: weight} dict for the CatBoost parameters.
classes = np.unique(y_train)
weights = compute_class_weight(
    y=y_train,
    classes=classes,
    class_weight='balanced',
)
class_weights = {class_id: weight for class_id, weight in zip(classes, weights)}

In [26]:
# Per-run overrides; empty here, so the defaults below are used unchanged.
catboost_params = {}

# Baseline training configuration. Training is pinned to the GPU task type, so
# this cell's settings require a GPU-enabled CatBoost setup.
catboost_default_params = dict(
    iterations=1000,
    learning_rate=0.1,
    eval_metric='Accuracy',
    loss_function='MultiClassOneVsAll',
    class_weights=class_weights,
    random_seed=42,
    task_type="GPU",
    devices='0:1',
)

# Overrides win over defaults; note that this mutates catboost_default_params in place.
catboost_default_params.update(catboost_params)

In [27]:
# Build the classifier from the merged parameter dict; training happens in the next cell.
model = CatBoostClassifier(**catboost_default_params)

In [28]:
# Train on the learn pool, reporting metrics every 100 iterations.
# NOTE(review): the test pool doubles as the evaluation set here, so anything
# selected from the eval metric (e.g. the "best" iteration) has seen the test
# data - consider a separate validation split.
model.fit(
    X=learn_pool,
    eval_set=test_pool,
    verbose=100,
)

0:	learn: 0.0654539	test: 0.0639119	best: 0.0639119 (0)	total: 389ms	remaining: 6m 28s
100:	learn: 0.7275013	test: 0.7442790	best: 0.7442790 (100)	total: 49.7s	remaining: 7m 22s
200:	learn: 0.7974308	test: 0.7914091	best: 0.7914091 (200)	total: 1m 37s	remaining: 6m 28s
300:	learn: 0.8345639	test: 0.8020203	best: 0.8021030 (299)	total: 2m 30s	remaining: 5m 48s
400:	learn: 0.8582124	test: 0.8098099	best: 0.8099408 (391)	total: 3m 22s	remaining: 5m 3s
500:	learn: 0.8737274	test: 0.8185703	best: 0.8185703 (500)	total: 4m 20s	remaining: 4m 19s
600:	learn: 0.8839968	test: 0.8201689	best: 0.8217683 (593)	total: 5m 15s	remaining: 3m 29s
700:	learn: 0.8930238	test: 0.8232683	best: 0.8234909 (676)	total: 6m 13s	remaining: 2m 39s
800:	learn: 0.9003503	test: 0.8236067	best: 0.8243402 (733)	total: 7m 9s	remaining: 1m 46s
900:	learn: 0.9061369	test: 0.8249886	best: 0.8252163 (843)	total: 8m 6s	remaining: 53.5s
999:	learn: 0.9105854	test: 0.8250127	best: 0.8253060 (924)	total: 9m 2s	remaining: 0us
be

<catboost.core.CatBoostClassifier at 0x2b5ff290290>

In [29]:
# Sample input for a manual sanity check of the classifier: the text of a Python
# training script, kept verbatim as one string and fed to model.predict below.
string = '''from dataset import CodeDataset
from config import *
from tqdm import trange, tqdm
from transformers import RobertaForSequenceClassification, RobertaConfig, RobertaModel
from utils import *
from torch.utils.data import WeightedRandomSampler
import wandb
from collections import Counter
import os

run = wandb.init(
    # set the wandb project where this run will be logged
    project="Tgml",

    # track hyperparameters and run metadata
    config={
        "learning_rate": LR,
        "architecture": "codeBerta",
        "epochs": EPOCH,
        'optimizer': 'Adam'
    }
)

trainData = np.load('data/TOKENIZEDtrainData.npy')
validationData = np.load('data/TOKENIZEDvalidationData.npy')
trainLabels = pd.read_parquet('data/y_train.parquet').to_numpy()
validationLabels = pd.read_parquet('data/y_test.parquet').to_numpy()

train_dataset = CodeDataset(trainData, trainLabels, LANGUAGES_TO_INT)
eval_dataset = CodeDataset(validationData, validationLabels, LANGUAGES_TO_INT)

model = RobertaForSequenceClassification.from_pretrained(CODEBERTA_PRETRAINED, num_labels=len(LANGUAGES_TO_INT),
                                                         ignore_mismatched_sizes=True)

count = Counter(trainLabels.T[0])
count = {k: 100/v for k, v in count.items()}
squarer = lambda x: count[x]
vfunc = np.vectorize(squarer)
weights = vfunc(trainLabels.T[0])

sampler = WeightedRandomSampler(weights, 150000, replacement=True)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)

model.to("cuda")
model.train()
for param in model.roberta.parameters():
    param.requires_grad = False

print(f"num params:", model.num_parameters())
print(f"num trainable params:", model.num_parameters(only_trainable=True))

trainLA = []
validationLA = []
best_loss = np.inf
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
for e in trange(0, EPOCH, desc="Epoch"):
    train_loss = 0.0
    nb_train_steps = 0
    preds = np.empty(0, dtype=np.int64)
    out_label_ids = np.empty(0, dtype=np.int64)
    for step, (input_ids, labels) in enumerate(tqdm(train_dataloader, desc="Iteration")):
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids.to("cuda"), labels=labels.to("cuda"))
        loss = outputs[0]
        loss.backward()
        logits = outputs[1]
        train_loss += loss.mean().item()
        nb_train_steps += 1
        preds = np.append(preds, logits.argmax(dim=1).detach().cpu().numpy(), axis=0)
        out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)
        optimizer.step()
        # del input_ids
        # del labels
        # del outputs
        # torch.cuda.empty_cache()
    train_loss = train_loss / nb_train_steps
    acc = simple_accuracy(preds, out_label_ids)
    f1 = f1_score(y_true=out_label_ids, y_pred=preds, average="macro")
    print("=== Train: loss ===", train_loss)
    print("=== Train: acc. ===", acc)
    print("=== Train: f1 ===", f1)

    wandb.log({"train_loss": train_loss, "train_acc": acc})

    val = evaluate(model, eval_dataset)
    if val[0] < best_loss:
        best_loss = val[0]
        model.save_pretrained(f"model/hf/")

    trainLA += [[train_loss, acc]]
    validationLA += [val]
    wandb.log({"val_loss": val[0], "val_acc": val[1]})
    model.train()
wandb.finish()'''


In [30]:
# Length of the sample snippet, in characters.
len(string)

3291

In [31]:
# Classify the single sample snippet. The result is an integer class id, i.e. the
# position of the predicted language in the label list that enc_dict was built from.
model.predict([string])

array([73], dtype=int64)

In [170]:
# Inspect the training snippets labelled INI. Once the label-encoding cell has run,
# y_train holds integer class ids, so the class is selected through enc_dict rather
# than by indexing a 'language' column that no longer exists at this point.
x_train[y_train == enc_dict['TGLANG_LANGUAGE_INI']]

Unnamed: 0,code
449334,[OEMFiles]\nOEMDriverFile1=hpf3rw73.dll\nOEMCo...
449314,[Configuration]\nInstallDir=c:\totalcmd\nUseNe...
449296,[file_info]\ntype=lms7002m_minimal_config\nver...
449347,[DriverConfig]\nDataFile=hpcP6wn8_MA_HWCP.GPD\...
449375,[DriverConfig]\nDataFile=MSxpsXPS.gpd\nRequire...
...,...
449262,# This file is automatically generated by Andr...
449326,[DirectInput]\nDirectXVersion=0x800\nDevices=U...
449255,; This is the main server configuration\n; Mos...
449382,[DriverConfig] \nDataFile=hprasterZJS.gpd \nRe...


In [154]:
# Inspect the training snippets labelled IDL. Once the label-encoding cell has run,
# y_train holds integer class ids, so the class is selected through enc_dict rather
# than by indexing a 'language' column that no longer exists at this point.
x_train[y_train == enc_dict['TGLANG_LANGUAGE_IDL']]

Unnamed: 0,code
10973,"non_repeated_values = array[uniq(array, sort( ..."
23434,"print,'Hello world!'"
7860,result = array[sort(array)]
7030,"function qs, arr\n if (count = n_elements(arr..."
3262,deg = 35 ; arbitrary number of degrees...
5574,"str = ""alphaBETA""\nprint, str\nprint, strupcas..."
24587,"print,(x = randomu(seed,8)*100)\n 15.1473 ..."
26695,"function fact,n\n return, product(lindgen(n)..."
17503,result = arr1 # arr2
24659,"test:\n..some code here\ngoto, test"


In [32]:
# Predicted class ids for every row of the test features.
predict = model.predict(x_test)

In [33]:
# Per-class score on the test split: for every class id that actually occurs in
# y_test, the share of its rows that the model predicted correctly.
# Classes are taken from y_test rather than y_train, so a class with no test rows
# cannot produce a 0/0 = NaN entry (NaN values would also make the later sort by
# score unreliable).
predicted_ids = np.asarray(predict).ravel()  # works for both (n,) and (n, 1) prediction arrays
res = {}
for lang in y_test.dropna().unique():
    mask = (y_test == lang).to_numpy()
    res[lang] = (predicted_ids[mask] == lang).sum() / mask.sum()

In [34]:
# Plain (unweighted) accuracy over the whole test split.
# NOTE(review): this is higher than the "test" Accuracy printed in the training
# log; presumably that figure is weighted by class_weights - confirm.
accuracy_score(y_test, predict)

0.849851615089133

In [35]:
# (class id, score) pairs ordered from the worst-scoring class to the best.
sorted(res.items(), key=lambda pair: pair[1])

[(47, 0.3037974683544304),
 (33, 0.4),
 (63, 0.5),
 (89, 0.5),
 (2, 0.5714285714285714),
 (22, 0.5833333333333334),
 (97, 0.6102564102564103),
 (60, 0.6133333333333333),
 (66, 0.6193548387096774),
 (65, 0.6363636363636364),
 (86, 0.6491683704481406),
 (38, 0.6659751037344398),
 (70, 0.6686390532544378),
 (93, 0.6818181818181818),
 (3, 0.6818181818181818),
 (94, 0.6875),
 (61, 0.7058823529411765),
 (50, 0.7059702887041016),
 (80, 0.7142857142857143),
 (28, 0.7358490566037735),
 (83, 0.7580645161290323),
 (32, 0.76),
 (92, 0.7602572632512752),
 (44, 0.7609841827768014),
 (84, 0.7627118644067796),
 (14, 0.7699115044247787),
 (51, 0.7699912126537786),
 (71, 0.776595744680851),
 (41, 0.7777777777777778),
 (64, 0.782258064516129),
 (17, 0.7879848352289297),
 (76, 0.788135593220339),
 (9, 0.7962529274004684),
 (20, 0.7988505747126436),
 (56, 0.8),
 (46, 0.8024691358024691),
 (36, 0.8125),
 (13, 0.817841726618705),
 (68, 0.8180039138943248),
 (10, 0.8181818181818182),
 (75, 0.8181818181818182)

In [36]:
# Persist the trained model to disk.
# NOTE(review): the file name says "f1_0.84", but the cells shown here only compute
# accuracy - confirm the name reflects the intended metric.
model.save_model("model/catboost_f1_0.84.cbm")