In [9]:
import warnings

warnings.filterwarnings("ignore")  # ignore warnings

import os
import re

from time import time
from tqdm import tqdm  # to track the processing of loop
import pandas as pd  # data preprocessing
import numpy as np  # numerical python processing(linear algebra)
import matplotlib.pyplot as plt  # visualization
import seaborn as sns  # Visualization on top of Matplotlib

# metrics to assess the model performance
from sklearn.metrics import f1_score, precision_score, recall_score


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score, roc_curve

# For Deep Learning models
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

import pickle  # to save model
from pathlib import Path

In [3]:
# Project root. It ends with a slash because the tokenizer cell appends a
# relative path to it by plain string concatenation.
root_dir = '/home/shaurya/BTP/1/'
# Directory the preprocessed CSV is read from, expressed relative to the root.
dataset_dir = root_dir + 'Outputs/CSV'

In [4]:
# Build the path with os.path.join so a separator is always placed between the
# directory and the file name. dataset_dir has no trailing slash, so plain
# concatenation would resolve to ".../Outputs/CSVpreprocessed_df_100k.csv".
df = pd.read_csv(os.path.join(dataset_dir, 'preprocessed_df_100k.csv'))

In [5]:
import code_tokenize
def my_tokenizer(text):
    """Split a source-code string into a list of token strings.

    Literals of the form ``digits.digits`` are first rewritten through
    ``str(float(...))`` (for example ``1.50`` becomes ``1.5``). The result is
    then handed to ``code_tokenize.tokenize`` with ``lang="c"`` and
    ``syntax_error="ignore"``.

    Args:
        text: Source code to tokenize, as a string.

    Returns:
        A list holding ``str(token)`` for every token produced, in order.
    """
    def _canonical_float(match):
        # Round-trip through float to get Python's canonical spelling.
        return str(float(match.group()))

    normalized = re.sub(r'\b\d+\.\d+\b', _canonical_float, text)
    return [
        str(token)
        for token in code_tokenize.tokenize(normalized, lang="c", syntax_error="ignore")
    ]

In [6]:
# Load a previously fitted Keras tokenizer from disk, or fit a new one on the
# dataset and cache it, so later cells can map code to vocabulary indices.
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer_path = root_dir + "diverse_model/tokenizer.pkl"
retrain = False  # set to True to ignore the cached pickle and fit again

if os.path.exists(tokenizer_path) and not retrain:
    print("[INFO] Loading saved tokenizer")
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # tokenizer file that this notebook (or a trusted source) produced.
    with open(tokenizer_path, "rb") as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
else:
    t0 = time()
    print("[INFO] Fitting tokenizer on TRAIN data...")
    tokenizer = Tokenizer(lower=False, analyzer=my_tokenizer)  # used in the research paper
    # Build the corpus straight from the DataFrame. X_train is only created in
    # a later cell (and as a plain list, which has no .apply), so referring to
    # it here fails on a fresh kernel whenever the cached pickle is missing.
    fit_texts = [str(code) for code in df.func]
    tokenizer.fit_on_texts(fit_texts)
    # Save the tokenizer for future use; create the target directory first so
    # the write does not fail when it does not exist yet.
    os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
    with open(tokenizer_path, "wb") as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
    print(f"Time taken to fit and save the tokenizer: {time() - t0} s")

[INFO] Loading saved tokenizer


In [16]:
# 1. Stringify every entry of df.func and encode it as vocabulary indices.
X_train = list(map(str, df.func))
X_train_tokens = tokenizer.texts_to_sequences(X_train)

# 2. Translate each index sequence back into its token strings through the
#    tokenizer's index -> word table.
sentences = [
    [tokenizer.index_word[token_id] for token_id in encoded_text]
    for encoded_text in X_train_tokens
]

# 'sentences' is now a list of token lists (each token a string), one list per
# encoded text.

In [28]:
# Preview the first five token lists, each under a banner and a rule line.
for preview_tokens in sentences[:5]:
    print("NICE\n")
    print('_______________________________________________________')
    # Prints the list itself (not unpacked), then a space and an extra newline.
    print(preview_tokens, "\n")

NICE

_______________________________________________________
['int', '_gnutls_ciphertext2compressed', '(', 'gnutls_session_t', 'session', ',', 'opaque', '*', 'compress_data', ',', 'int', 'compress_size', ',', 'gnutls_datum_t', 'ciphertext', ',', 'uint8', 'type', ')', '{', 'uint8', 'MAC', '[', 'MAX_HASH_SIZE', ']', ';', 'uint16', 'c_length', ';', 'uint8', 'pad', ';', 'int', 'length', ';', 'mac_hd_t', 'td', ';', 'uint16', 'blocksize', ';', 'int', 'ret', ',', 'i', ',', 'pad_failed', '=', '0', ';', 'uint8', 'major', ',', 'minor', ';', 'gnutls_protocol_t', 'ver', ';', 'int', 'hash_size', '=', '_gnutls_hash_get_algo_len', '(', 'session', '->', 'security_parameters', '.', 'read_mac_algorithm', ')', ';', 'ver', '=', 'gnutls_protocol_get_version', '(', 'session', ')', ';', 'minor', '=', '_gnutls_version_get_minor', '(', 'ver', ')', ';', 'major', '=', '_gnutls_version_get_major', '(', 'ver', ')', ';', 'blocksize', '=', '_gnutls_cipher_get_block_size', '(', 'session', '->', 'security_parameters'

In [31]:
import os
import pickle
from gensim.models import Word2Vec


output_dir = '/home/shaurya/BTP/1/Outputs/Embedding Models 3/Word2vec'
# Create the destination up front: open(..., 'wb') below raises
# FileNotFoundError if the directory is missing, which would only surface
# after the first (expensive) model has already been trained.
os.makedirs(output_dir, exist_ok=True)

embedding_dims = [50, 100, 150, 200]  # Different embedding sizes
window_size = 10
min_count = 1  # passed to Word2Vec as its min_count argument

# Train one Word2Vec model per embedding size on the tokenized corpus and
# pickle each under a size-specific file name.
for embedding_dim in embedding_dims:

    word2vec_model = Word2Vec(sentences, vector_size=embedding_dim, window=window_size, min_count=min_count)

    # Save the Word2Vec model using pickle with a specific name
    model_name = f'word2vec_{embedding_dim}_model.pkl'
    model_path = os.path.join(output_dir, model_name)
    with open(model_path, 'wb') as f:
        pickle.dump(word2vec_model, f)

    print(f"Embedding model (size {embedding_dim}) saved at:", model_path)


Embedding model (size 50) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/Word2vec/word2vec_50_model.pkl
Embedding model (size 100) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/Word2vec/word2vec_100_model.pkl
Embedding model (size 150) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/Word2vec/word2vec_150_model.pkl
Embedding model (size 200) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/Word2vec/word2vec_200_model.pkl


In [34]:
from gensim.models import FastText

output_dir = '/home/shaurya/BTP/1/Outputs/Embedding Models 3/FastText'
# Create the destination up front: open(..., 'wb') below raises
# FileNotFoundError if the directory is missing, which would only surface
# after the first (expensive) model has already been trained.
os.makedirs(output_dir, exist_ok=True)

# Train one FastText model per embedding size, reusing embedding_dims,
# window_size and min_count from the Word2Vec cell, and pickle each under a
# size-specific file name.
for embedding_dim in embedding_dims:

    fasttext_model = FastText(sentences, vector_size=embedding_dim, window=window_size, min_count=min_count)

    # Save the FastText model using pickle with a specific name
    model_name = f'fasttext_{embedding_dim}_model.pkl'
    model_path = os.path.join(output_dir, model_name)
    with open(model_path, 'wb') as f:
        pickle.dump(fasttext_model, f)

    print(f"Embedding model (size {embedding_dim}) saved at:", model_path)


Embedding model (size 50) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/FastText/fasttext_50_model.pkl
Embedding model (size 100) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/FastText/fasttext_100_model.pkl
Embedding model (size 150) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/FastText/fasttext_150_model.pkl
Embedding model (size 200) saved at: /home/shaurya/BTP/1/Outputs/Embedding Models 3/FastText/fasttext_200_model.pkl


In [15]:
import pickle

# Sanity-check the pickled 150-dimensional Word2Vec model: vocabulary size,
# the first vocabulary entries, and the vectors of a few probe words.
vocab_path = '/home/shaurya/BTP/1/Outputs/Embedding Models 3/Word2vec/word2vec_150_model.pkl'
with open(vocab_path, 'rb') as f:
    word2vec_model = pickle.load(f)

# Number of entries in the model's key -> index table.
vocab_size = len(word2vec_model.wv.key_to_index)

# First 100 keys of that table (adjust the slice as needed).
few_vocab_words = list(word2vec_model.wv.key_to_index)[:100]

# Collect vectors only for probe words present in the vocabulary; words that
# are absent are skipped silently.
words_to_check = ['kmknkn', 'if', 'then']  # Add words you want to check
word_embeddings = {
    probe: word2vec_model.wv.get_vector(probe)
    for probe in words_to_check
    if probe in word2vec_model.wv.key_to_index
}

# Report the findings.
print("Vocabulary size:", vocab_size)
print("Few vocabulary words:", few_vocab_words)
print("Embeddings for some words:", word_embeddings)


Vocabulary size: 559321
Few vocabulary words: ['(', ')', ';', ',', '->', '=', '"', '{', '}', '*', 'if', '.', '0', '&', ':', ']', '[', 'return', '==', '1', 'NULL', 'int', 'i', 'struct', '!', "'", 'case', '!=', '+', '-', 'else', 'break', '&&', 'char', '<', 'p', '++', 's', 'goto', 'image', 'void', 'const', '\\n', 'ret', '', '||', 'unsigned', '>', 'sizeof', '2', 'data', 'len', 'static', 'c', 'for', 'err', 'r', '+=', 'buf', ' ', '-1', 'n', 'type', 'flags', '<<', 'ctxt', 'name', 'x', '4', 'e', 'sk', 'error', 'skb', 'state', 'size', '|', 't', 'ctx', 'offset', 'dev', '?', 'y', '3', 'value', 'length', 'a', 'long', 'status', 'result', '>=', 'out', 'q', 'j', 'rc', 'size_t', 'while', '8', '/', 'file', 'buffer']
Embeddings for some words: {'if': array([-0.48028994,  2.466092  , -1.6403556 ,  0.9521284 ,  0.63954234,
        0.64652157, -2.207914  , -0.31819314, -2.1386666 ,  1.3634223 ,
       -4.528823  , -1.5397035 , -3.191879  ,  5.520243  , -1.5376232 ,
        3.443389  ,  0.29147637,  1.56102

In [13]:
import pickle

# Sanity-check the pickled 150-dimensional FastText model: vocabulary size,
# the first vocabulary entries, and the vectors of a few probe words.
# NOTE(review): the loaded FastText model is bound to the name
# `word2vec_model`, replacing whatever that name held before; the name is
# kept here so any cell that reads it behaves as before.
vocab_path = '/home/shaurya/BTP/1/Outputs/Embedding Models 3/FastText/fasttext_150_model.pkl'
with open(vocab_path, 'rb') as f:
    word2vec_model = pickle.load(f)

# Number of entries in the model's key -> index table.
vocab_size = len(word2vec_model.wv.key_to_index)

# First 100 keys of that table (adjust the slice as needed).
few_vocab_words = list(word2vec_model.wv.key_to_index)[:100]

# Collect vectors only for probe words present in the vocabulary; words that
# are absent are skipped silently.
words_to_check = ['word1', 'if', 'then']  # Add words you want to check
word_embeddings = {
    probe: word2vec_model.wv.get_vector(probe)
    for probe in words_to_check
    if probe in word2vec_model.wv.key_to_index
}

# Report the findings.
print("Vocabulary size:", vocab_size)
print("Few vocabulary words:", few_vocab_words)
print("Embeddings for some words:", word_embeddings)


Vocabulary size: 559321
Few vocabulary words: ['(', ')', ';', ',', '->', '=', '"', '{', '}', '*', 'if', '.', '0', '&', ':', ']', '[', 'return', '==', '1', 'NULL', 'int', 'i', 'struct', '!', "'", 'case', '!=', '+', '-', 'else', 'break', '&&', 'char', '<', 'p', '++', 's', 'goto', 'image', 'void', 'const', '\\n', 'ret', '', '||', 'unsigned', '>', 'sizeof', '2', 'data', 'len', 'static', 'c', 'for', 'err', 'r', '+=', 'buf', ' ', '-1', 'n', 'type', 'flags', '<<', 'ctxt', 'name', 'x', '4', 'e', 'sk', 'error', 'skb', 'state', 'size', '|', 't', 'ctx', 'offset', 'dev', '?', 'y', '3', 'value', 'length', 'a', 'long', 'status', 'result', '>=', 'out', 'q', 'j', 'rc', 'size_t', 'while', '8', '/', 'file', 'buffer']
Embeddings for some words: {'word1': array([ 0.92124754, -5.12175   , -0.41359103,  0.13340321, -1.1364312 ,
       -0.38334224, -1.5587597 , -0.83388484,  3.5488493 ,  2.6596568 ,
       -0.919841  ,  0.25876546,  0.7285957 ,  0.18624324, -0.41903734,
       -0.9764415 , -0.5622702 ,  1.84