In [1]:
import csv
import pickle
import re

import pandas as pd
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

In [2]:
# Substrings whose presence marks a macro as (possibly) obfuscated.
# NOTE(review): matching below is a case-sensitive substring test, so the
# all-lowercase entries 'vbuicode' / 'vbfromunicode' only hit that exact
# spelling ('vbuicode' looks like a typo for 'vbunicode' - confirm), and
# 'If False Then ... End If' only hits that literal text, dots included.
# The list is left untouched because it defines a model input feature.
obfuscation_indicators = [
    'Chr(',
    'Hex(',
    'Execute(',
    'Base64Decode(',
    'Environ(',
    'Shell(',
    'WScript.Shell',
    'GetFile(',
    'MSXML2.ServerXMLHTTP',
    'CreateObject("MSXML2.XMLHTTP")',
    'CreateObject("ADODB.Stream")',
    'Randomize',
    'Replace(',
    'StrReverse(',
    'CallByName(',
    'GetObject(',
    'Xor',
    'vbuicode',
    'vbfromunicode',
    'If False Then ... End If',
]


def is_obfuscated(code):
    """Return 1 if `code` contains any entry of `obfuscation_indicators`, else 0.

    The check is a plain, case-sensitive substring search.
    """
    return int(any(marker in code for marker in obfuscation_indicators))

In [3]:
def has_url_or_ip(code):
    """Return 1 when `code` contains a URL prefix or a dotted-quad IPv4 address, else 0.

    URL prefixes recognised: 'http://', 'https://', 'ftp://' and 'www.'.
    The IPv4 branch is not anchored on word boundaries, so it can match
    inside a longer run of digits and dots.
    """
    url_or_ip = (
        r'(https?://|ftp://|www\.)|((25[0-5]|2[0-4][0-9]|[0-1]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[0-1]?[0-9][0-9]?)'
    )

    if re.search(url_or_ip, code) is None:
        return 0
    return 1

In [4]:
# Keywords treated as hints of information disclosure.
# NOTE(review): has_information_disclosure lowercases the code but not these
# keywords, so every entry containing a capital letter ('Win32_Process',
# 'CreateObject', 'Eval', ...) can never match. Normalising the keywords would
# change a model input feature, so confirm against the training pipeline first.
information_disclosure_keywords = [
    'winmgmts', 'Win32_Process', 'shell',
    'ssn', 'secret', 'pwd', 'shadow', 'bypass',
    'EvilClippy', 'Base64Decode', 'CreateObject',
    'CommandLine', 'Auto_open', 'Replace', 'Based',
    'Worksheet_Change', 'bin.base64', 'xor', 'GetFile',
    'Shell', 'DownloadFile', 'Eval',
]


def has_information_disclosure(code):
    """Return 1 if the lowercased `code` contains any listed keyword, else 0.

    Keywords are compared exactly as written in
    `information_disclosure_keywords`; only `code` is lowercased.
    """
    lowered = code.lower()
    return int(any(keyword in lowered for keyword in information_disclosure_keywords))

In [5]:
def set_data(df):
    """Derive hand-crafted numeric features from the 'vba_code' column.

    The feature columns are written into `df` itself, so the caller's frame
    keeps them (together with 'vba_code'). The returned value is a new frame
    holding the same columns with 'vba_code' dropped.

    Despite their 'num_' names, 'num_string_literals', 'num_numeric_literals',
    'num_exclamation_marks' and 'num_functions' are counts divided by
    'code_length', not raw counts.
    """
    source = df['vba_code']

    def per_char(count_in):
        # Row-wise count divided by the code length; rows with empty code get 0.
        return df.apply(
            lambda row: count_in(row['vba_code']) / row['code_length'] if row['code_length'] > 0 else 0,
            axis=1,
        )

    # Size and layout of the macro text.
    df['code_length'] = source.apply(len)
    df['num_lines'] = source.apply(lambda text: text.count('\n') + 1)
    df['avg_chars_per_line'] = df.apply(
        lambda row: row['code_length'] / row['num_lines'] if row['num_lines'] > 0 else 0,
        axis=1,
    )

    # Plain substring counts / flags (case-sensitive, no word boundaries).
    df['num_loops'] = source.apply(lambda text: sum(text.count(kw) for kw in ('For', 'While')))
    df['has_error_handling'] = source.apply(lambda text: int('On Error' in text))
    df['has_url_or_ip'] = source.apply(has_url_or_ip)
    df['is_obfuscated'] = source.apply(is_obfuscated)

    # Column order matters to downstream consumers, so it mirrors the
    # original assignment order exactly.
    df['num_string_literals'] = per_char(lambda text: len(re.findall(r'"([^"]*)"', text)))
    df['has_information_disclosure'] = source.apply(has_information_disclosure)
    df['num_numeric_literals'] = per_char(lambda text: len(re.findall(r'\b\d+\b', text)))
    df['num_exclamation_marks'] = per_char(lambda text: text.count('!'))
    df['num_functions'] = per_char(lambda text: text.count('Sub ') + text.count('Function '))

    return df.drop('vba_code', axis=1)

In [6]:
def aggregate_word_vectors(tokens, model):
    """Average the embeddings of the tokens that `model.wv` knows about.

    Tokens missing from `model.wv` are skipped. When none of the tokens is
    known (or `tokens` is empty) a plain list of `model.vector_size` integer
    zeros is returned; otherwise the result is the element-wise mean of the
    looked-up vectors, in whatever type their sum/division produces.
    """
    found = []
    for token in tokens:
        if token in model.wv:
            found.append(model.wv[token])

    if found:
        return sum(found) / len(found)
    return [0] * model.vector_size

In [7]:
def load_and_set_data(file, word2vec_model=None, pca=None):
    """Load a CSV of VBA macros and build the combined feature frame.

    The result concatenates, column-wise, the PCA-reduced averaged Word2Vec
    document vectors and the hand-crafted features from `set_data`. All
    column labels are converted to strings.

    Parameters
    ----------
    file : str or path-like
        CSV file (read as UTF-16-LE) with a 'vba_code' column.
    word2vec_model : optional
        An already trained Word2Vec model. When omitted, a new model is
        trained on the documents in `file` (the original behaviour).
    pca : optional
        An already fitted PCA. When given, it is only applied (`transform`);
        when omitted, a new 121-component PCA is fitted on the data in
        `file` (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        PCA columns followed by the `set_data` feature columns.

    NOTE(review): with the defaults, both the embedding and the projection are
    refitted on whatever file is passed in. If the downstream classifier was
    trained on features from separately fitted Word2Vec/PCA objects, the PCA
    columns produced here are presumably not in the same space - pass those
    fitted objects via `word2vec_model` / `pca` in that case. Training with
    workers=4 and no seed is also not expected to be reproducible run to run.
    """
    # load
    raw_df = pd.read_csv(file, encoding='utf-16-le')

    # Hand-crafted features; set_data also adds its columns to raw_df in
    # place, but 'vba_code' stays available there for tokenisation.
    set_df = set_data(raw_df)

    # Word2Vec document vectors: whitespace tokens, averaged per document.
    tokenized_data = [doc.split() for doc in raw_df['vba_code']]
    if word2vec_model is None:
        word2vec_model = Word2Vec(sentences=tokenized_data, vector_size=160, window=5, min_count=1, workers=4)
    X_word2vec = [aggregate_word_vectors(tokens, word2vec_model) for tokens in tokenized_data]
    X_word2vec_df = pd.DataFrame(
        X_word2vec,
        columns=[f'w2v_{i}' for i in range(word2vec_model.vector_size)],
    )

    # PCA: fit only when no fitted projection was supplied.
    if pca is None:
        pca = PCA(n_components=121)
        X_reduced = pca.fit_transform(X_word2vec_df)
    else:
        X_reduced = pca.transform(X_word2vec_df)

    X_PCA = pd.DataFrame(X_reduced)
    X_combined_pca = pd.concat([X_PCA, set_df], axis=1)
    # PCA columns are integers, feature columns are strings; unify to str.
    X_combined_pca.columns = X_combined_pca.columns.astype(str)
    return X_combined_pca

In [8]:
# Build the feature frame for the unlabeled test set; the bare name on the
# last line displays it as the cell output.
df = load_and_set_data("test_dataset_without_labels.csv")
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,avg_chars_per_line,num_loops,has_error_handling,has_url_or_ip,is_obfuscated,num_string_literals,has_information_disclosure,num_numeric_literals,num_exclamation_marks,num_functions
0,1.174257,0.179959,0.112019,-3.123223,-2.660456,0.855669,0.042135,1.015783,0.503182,-0.059466,...,26.118143,0,0,0,0,0.016963,0,0.009047,0.000000,0.003231
1,1.229868,-0.458853,-1.584756,-0.493038,0.020424,0.596116,-0.312482,-1.746804,-0.244379,-0.642178,...,33.300000,5,0,0,0,0.007007,0,0.001001,0.000000,0.004004
2,0.185233,-0.427722,-3.732536,2.663929,-1.407725,0.975898,0.035266,0.121326,0.563626,1.272761,...,35.900000,0,1,0,0,0.002786,0,0.015320,0.000000,0.001393
3,1.294109,-1.467201,-2.349654,1.225589,-0.929708,0.666835,0.884377,-0.819976,-0.823506,1.248593,...,26.995633,5,0,0,0,0.014882,0,0.004368,0.001779,0.002103
4,0.868445,0.848054,2.641923,-2.871583,-0.104548,0.975064,1.976400,-1.397536,-2.271486,1.219427,...,27.731959,0,0,0,0,0.001859,0,0.000000,0.000000,0.005204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10625,1.801852,2.717743,-1.101528,-0.758904,1.981783,-0.291990,0.395966,-0.603166,-0.993923,-0.994916,...,43.875000,0,0,0,0,0.002849,0,0.000000,0.000000,0.002849
10626,0.449360,1.886376,0.566060,-2.488095,0.403791,0.897340,0.057267,-1.433963,-0.810635,-1.123994,...,19.176471,0,0,1,0,0.006135,1,0.003067,0.000000,0.004601
10627,2.200671,-1.572656,-1.767017,-1.643230,-2.001048,-0.723247,-2.086287,-1.743599,0.552948,-0.155166,...,27.027027,0,0,0,0,0.008000,0,0.004000,0.000000,0.003000
10628,2.132501,4.344479,0.654824,0.987394,2.021957,0.596442,-3.246736,1.261032,1.102048,-3.662917,...,71.428571,0,0,0,0,0.013000,0,0.011000,0.000000,0.000000


### Load the model from the pickle file

In [12]:
# Path of the pickled classifier, relative to the notebook's working directory.
model_pkl_file = "model.pkl" 

# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only load a model.pkl that comes from a trusted source.
with open(model_pkl_file, 'rb') as file:  
    model = pickle.load(file)

In [13]:
# Predict one class per row of the feature frame `df`; the bare name on the
# last line displays the resulting array as the cell output.
prediction = model.predict(df)
prediction

array([1, 1, 1, ..., 1, 1, 1])

In [14]:
# Translate the numeric classes into labels: 1 -> 'white', anything else -> 'mal'.
prediction_labels = ['white' if pred == 1 else 'mal' for pred in prediction]

# Output file, relative to the notebook's working directory.
csv_file_path = 'test_prediction.csv'

# newline='' leaves line endings to the csv module; the encoding is stated
# explicitly so the file does not depend on the platform default.
# `csv` is imported in the notebook's import cell.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header row, then one single-column row per prediction.
    csv_writer.writerow(['prediction'])
    csv_writer.writerows([label] for label in prediction_labels)

print(f'Predictions have been written to {csv_file_path}.')

Predictions have been written to test_prediction.csv.
