In [22]:
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

import math
import re
from collections import Counter
import pandas as pd

In [23]:
# Raw script text to classify; 'Hey' is a placeholder input.
new_instance_script = 'Hey'

In [24]:
# new_instance_script

In [25]:
def clean_script(script):
    """Normalise raw script text.

    Steps, in order:
      1. replace everything from a ``#`` to the end of its line with a space;
      2. squeeze every run of whitespace into a single space;
      3. replace each character that is neither a word character nor
         whitespace with a space;
      4. lower-case the result.

    Spaces introduced by step 3 are not collapsed again, so the returned
    string can contain consecutive spaces.
    """
    without_comments = re.sub(r'#.*$', ' ', script, flags=re.MULTILINE)

    # Tabs and newlines are whitespace, so none survive this pass.
    single_spaced = re.sub(r'\s+', ' ', without_comments)

    depunctuated = re.sub(r'[^\w\s]', ' ', single_spaced)
    return depunctuated.lower()

In [26]:
# Cleaned form of the input script, as produced by clean_script.
clean = clean_script(new_instance_script)

In [27]:
# clean

In [28]:
import pickle
import pandas as pd


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Saved artifacts: the TF-IDF vectorizer and the SelectKBest instance.
vectorizer = _load_pickle('vectorizer.pkl')
selector = _load_pickle('selector.pkl')

# Frame whose columns hold the names of the selected features.
X_selected_df = pd.read_pickle('selected_features.pkl')

# Vectorise the cleaned script as a one-item batch, then apply the selector.
X_tfidf = vectorizer.transform([clean])
X_new_selected = selector.transform(X_tfidf)

# Densify the result and label its columns with the saved feature names.
X_new_selected_df = pd.DataFrame(
    X_new_selected.toarray(),
    columns=X_selected_df.columns,
)

In [29]:
# Display the selected TF-IDF features computed for the input script.
X_new_selected_df

Unnamed: 0,0x00,0x00 0x00,0x00 0x00 0x00,0x00 0x00 0x60,0x00 0x00 0x68,0x00 0x10,0x00 0x10 0x00,0x00 0x56,0x00 0x60,0x00 0x60 0x89,...,wow64 trident,wow64 trident rv,write,write error,write host,write verbose,write verbose message,yahoo,yahoo csrsv,yahoo csrsv exe
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Feature-extraction function definitions (copied from the training notebook)
def text_length(script):
    """Return the number of characters in ``script``."""
    n_chars = len(script)
    return n_chars

def entropy(script):
    """Return the Shannon entropy of ``script`` in bits per character.

    Each distinct character with relative frequency ``p`` contributes
    ``-p * log2(p)``. An empty string has nothing to count and yields 0.
    """
    n_chars = len(script)
    # Lazily evaluated, so no division happens when the string is empty.
    frequencies = (occurrences / n_chars for occurrences in Counter(script).values())
    return -sum(p * math.log2(p) for p in frequencies)

def punctuation_count(script):
    """Count characters in ``script`` that are neither word characters nor whitespace.

    Underscores count as word characters and are therefore not included.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return sum(1 for _ in non_word_non_space.finditer(script))

def function_count(script):
    """Total the occurrences of 'function' and 'procedure' in ``script``.

    Matching is a case-sensitive, non-overlapping substring count, so the
    keywords are also counted when they appear inside longer words.
    """
    total = 0
    for keyword in ('function', 'procedure'):
        total += script.count(keyword)
    return total

def numeric_literal_count(script):
    """Count digit runs in ``script`` that are delimited by word boundaries.

    Digits glued to letters or underscores (``abc123``, ``0x10``) are not
    counted; ``1.5`` counts as two runs.
    """
    digit_runs = re.finditer(r'\b\d+\b', script)
    return sum(1 for _ in digit_runs)

def string_literal_count(script):
    """Count double-quoted spans in ``script``.

    Only ``"`` delimiters are recognised; an opening quote with no closing
    quote is not counted.
    """
    double_quoted = re.compile(r'"([^"]*)"')
    return sum(1 for _ in double_quoted.finditer(script))

def has_error_handling(script):
    """Return True when ``script`` contains 'try', 'except' or 'catch'.

    The check is a case-sensitive substring test, so words that merely
    contain a keyword (e.g. 'country') also count.
    """
    for keyword in ('try', 'except', 'catch'):
        if keyword in script:
            return True
    return False

def has_urls_or_ips(script):
    """Return True when ``script`` contains an http(s) URL or a dotted quad.

    The dotted-quad branch matches any four dot-separated digit groups and
    does not range-check them.
    """
    url_or_ip = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|\d+\.\d+\.\d+\.\d+'
    )
    return url_or_ip.search(script) is not None

def has_obfuscation_indicators(script):
    """Return True when any obfuscation-style pattern occurs in ``script``.

    Patterns are tried in order and matching is case-sensitive (no regex
    flags are passed), so 'Add-Type' only matches with that exact casing.
    """
    indicator_patterns = (
        # Two word tokens joined by '+'.
        r'\b(?:\w+\s*\+\s*\w+)',
        # One or more "x = ... chr(name + digits);" assignments.
        r'\b(?:[a-zA-Z]\s*=\s*[^;]*\bchr\s*\(\s*\w+\s*\+\s*\d+\s*\)\s*;\s*)+',
        # Hexadecimal literal.
        r'0x[\da-fA-F]+',
        # Backslash escapes: \xHH, \uHHHH or \UHHHHHHHH.
        r'(?:\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})',
        # Whole-word native-interop / memory keywords.
        r'\b(?:Add-Type|dllimport|virtualalloc|createthread|memset)\b',
        # Whole-word dynamic-execution / encoding keywords.
        r'\b(?:eval|exec|decode|encode|obfuscate)\b',
    )
    for pattern in indicator_patterns:
        if re.search(pattern, script):
            return True
    return False

def has_suspicious_words(script):
    """Return True when ``script`` contains any term from the watch-list.

    The check is a case-sensitive substring test, so a term embedded in a
    longer word (e.g. 'key' in 'keyboard') also counts.
    """
    watchlist = (
        'downloadfile', 'password', 'secret', 'key', 'token', 'downloadstring',
        'dllimport', 'programdata', 'new object', 'appdata',
    )
    for term in watchlist:
        if term in script:
            return True
    return False

def longest_string_length(script):
    """Return the length of the longest double-quoted span in ``script``.

    The length excludes the quotes themselves; 0 is returned when no
    double-quoted span is present.
    """
    quoted_contents = re.findall(r'"([^"]*)"', script)
    return max(map(len, quoted_contents), default=0)


# Features measured on the cleaned text.
length = text_length(clean)
func_count = function_count(clean)
num_lit_count = numeric_literal_count(clean)
err_handling = has_error_handling(clean)
obf_indicators = has_obfuscation_indicators(clean)
suspicious_words = has_suspicious_words(clean)

# Features measured on the raw, uncleaned text.
ent = entropy(new_instance_script)
punc_count = punctuation_count(new_instance_script)
longest_str_length = longest_string_length(new_instance_script)
str_lit_count = string_literal_count(new_instance_script)
urls_ips = has_urls_or_ips(new_instance_script)

# One value per column; boolean flags are stored as 0/1.
# NOTE(review): presumably the column order must match what the model was trained on — confirm.
feature_values = {
    'text_length': length,
    'function_count': func_count,
    'numeric_literal_count': num_lit_count,
    'has_error_handling': int(err_handling),
    'has_obfuscation_indicators': int(obf_indicators),
    'has_suspicious_words': int(suspicious_words),
    'Entropy': ent,
    'punctuation_count': punc_count,
    'longest_string_length': longest_str_length,
    'string_literal_count': str_lit_count,
    'has_urls_or_ips': int(urls_ips),
}

# Single-row frame holding the hand-crafted features.
new_features = pd.DataFrame(
    {column: [value] for column, value in feature_values.items()}
)

# Append the hand-crafted columns to the right of the selected TF-IDF columns.
X_new_selected_df_with_features = pd.concat(
    [X_new_selected_df, new_features],
    axis=1,
)

In [31]:
# Display the combined frame: selected TF-IDF columns plus the hand-crafted features.
X_new_selected_df_with_features

Unnamed: 0,0x00,0x00 0x00,0x00 0x00 0x00,0x00 0x00 0x60,0x00 0x00 0x68,0x00 0x10,0x00 0x10 0x00,0x00 0x56,0x00 0x60,0x00 0x60 0x89,...,function_count,numeric_literal_count,has_error_handling,has_obfuscation_indicators,has_suspicious_words,Entropy,punctuation_count,longest_string_length,string_literal_count,has_urls_or_ips
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1.584963,0,0,0,0


In [32]:
# Unpickle the trained model.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Run the model on the combined feature frame.
predicted_label = model.predict(X_new_selected_df_with_features)

In [33]:
# Translate the first predicted label into a human-readable verdict.
# NOTE(review): assumes label 0 is the malicious class — confirm against the training labels.
verdict = "Malicious" if predicted_label[0] == 0 else "Benign"
print(verdict)

Benign
