In [1]:
import ipaddress
import os

import pandas as pd

# Path to the input CSV file (relative to the notebook's working directory)
file_path = "client01_dnsmasq_for_spark.csv"

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame; as the last expression of the cell it is rendered
# with the notebook's rich repr, so no print() call is needed
df

Unnamed: 0,key,epoch_timestamp,encoded_key,label,value1,value2
0,query[A] <*> from <*>,1642179603,2,1,3x6-.546-.2PoxC1PkS*qtk0p2kKZGSYsWe2X*u678tHnP...,10.35.33.111
1,forwarded <*> to <*>,1642179603,1,1,3x6-.546-.2PoxC1PkS*qtk0p2kKZGSYsWe2X*u678tHnP...,192.168.255.254
2,reply <*> is <*>,1642179603,0,1,3x6-.546-.2PoxC1PkS*qtk0p2kKZGSYsWe2X*u678tHnP...,195.128.194.168
3,query[A] <*> from <*>,1642179620,2,1,3x6-.547-.WharXpRiFOnbAvznOFBIiR4EDr2FH97sAZEw...,10.35.33.111
4,forwarded <*> to <*>,1642179620,1,1,3x6-.547-.WharXpRiFOnbAvznOFBIiR4EDr2FH97sAZEw...,192.168.255.254
...,...,...,...,...,...,...
240577,reply <*> is <*>,1642611336,0,0,database.clamav.net,<CNAME>
240578,reply <*> is <*>,1642611336,0,0,database.clamav.net.cdn.cloudflare.net,2606:4700::6810:db54
240579,reply <*> is <*>,1642611336,0,0,database.clamav.net.cdn.cloudflare.net,2606:4700::6810:da54
240580,query[AAAA] <*> from <*>,1642611417,3,0,mail,172.17.131.81


In [2]:
import math

In [3]:
# RFC 1918 private IPv4 ranges (10/8, 172.16/12, 192.168/16).
_PRIVATE_IPV4_NETWORKS = tuple(
    ipaddress.ip_network(cidr)
    for cidr in ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
)
# RFC 4193 unique local IPv6 addresses (fc00::/7); their first group looks like 'fdxx',
# so comparing the first group with the literal 'fd' can never match a real address.
_PRIVATE_IPV6_NETWORK = ipaddress.ip_network("fc00::/7")


def categorize_ip(value2):
    """
    Classify a value as a private/public IPv4/IPv6 address or a non-IP token.

    The value is parsed with the standard-library ``ipaddress`` module, so only
    syntactically valid addresses are treated as IPs (e.g. '10.5', '999.1.1.1'
    or 'a:b:c' are reported as non-IP values).

    Parameters:
    - value2: The raw value to classify, normally a string such as
      '10.35.33.111', '2606:4700::6810:db54' or '<CNAME>'.

    Returns:
    - 0 for a private IPv4 address (RFC 1918 ranges)
    - 1 for any other IPv4 address
    - 2 for a private IPv6 address (unique local, fc00::/7)
    - 3 for any other IPv6 address
    - 4 for everything that is not a valid IP address (including non-strings)
    """
    # Missing values read from CSV arrive as float NaN; they are not IP addresses
    if not isinstance(value2, str):
        return 4

    try:
        address = ipaddress.ip_address(value2.strip())
    except ValueError:
        return 4  # Non-IP values

    if address.version == 4:
        is_private = any(address in network for network in _PRIVATE_IPV4_NETWORKS)
        return 0 if is_private else 1

    return 2 if address in _PRIVATE_IPV6_NETWORK else 3

    
def count_dots(s):
    """Return the number of '.' characters contained in ``s``."""
    dot_total = s.count('.')
    return dot_total

def count_hyphens(s):
    """Return the number of '-' characters contained in ``s``."""
    hyphen_total = s.count('-')
    return hyphen_total

def count_slash(s):
    """Return the number of '/' characters contained in ``s``."""
    slash_total = s.count('/')
    return slash_total

def count_asterisk(s):
    """Return the number of '*' characters contained in ``s``."""
    asterisk_total = s.count('*')
    return asterisk_total

def count_capitals(value):
    """Return how many characters of ``value`` are upper-case according to ``str.isupper``."""
    uppercase_chars = [char for char in value if char.isupper()]
    return len(uppercase_chars)

# Tokens treated as "file extensions" (lower-case). Built once at definition time as a
# frozenset so every call does constant-time membership tests instead of re-creating
# and linearly scanning a list.
# Note: 'tar.gz' can never match because candidate tokens are produced by splitting on
# '.', and 'com' also matches the last label of any '.com' name.
_KNOWN_FILE_EXTENSIONS = frozenset([
    'doc', 'docx', 'odt', 'pages', 'rtf', 'txt', 'wpd', 'wps',
    'csv', 'numbers', 'ods', 'xls', 'xlsx',
    'asp', 'aspx', 'css', 'htm', 'html', 'jsp', 'php', 'xml',
    'afdesign', 'ai', 'cad', 'cdr', 'drw', 'dwg', 'eps', 'odg', 'svg', 'vsdx',
    'afpub', 'indd', 'pdf', 'pdfxml', 'pmd', 'pub', 'qxp',
    'c', 'cpp', 'cs', 'java', 'js', 'json', 'py', 'sql', 'swift', 'vb',
    '7z', 'rar', 'tar', 'tar.gz', 'zip',
    'bak', 'cfg', 'conf', 'ini', 'msi', 'sys', 'tmp',
    'app', 'bat', 'bin', 'cmd', 'com', 'exe', 'vbs', 'x86',
])


def has_microsoft_extension(value):
    """
    Flag whether any dot-separated token of ``value`` is a known file extension.

    Every token produced by ``value.split('.')`` is compared case-insensitively
    against the extension set, not only the last one.

    Parameters:
    - value: The string to inspect.

    Returns:
    - 1 if at least one token is a known extension, otherwise 0.
    """
    for token in value.split('.'):
        if token.lower() in _KNOWN_FILE_EXTENSIONS:
            return 1
    return 0


def calculate_entropy(value):
    """
    Compute the Shannon entropy (in bits) of the characters in ``value``.

    Parameters:
    - value: The iterable of characters (normally a string) to measure.

    Returns:
    - The entropy as a float; an empty input yields 0.
    """
    frequency = {}
    for symbol in value:
        if symbol in frequency:
            frequency[symbol] += 1
        else:
            frequency[symbol] = 1

    # The per-symbol counts add up to the number of symbols seen
    length = sum(frequency.values())

    shares = [occurrences / length for occurrences in frequency.values()]
    return sum(-share * math.log2(share) for share in shares)

def is_human_readable(entropy_value, threshold=5.0):
    """
    Turn an entropy value into a binary "human readable" flag.

    Parameters:
    - entropy_value: The entropy to test.
    - threshold: Values strictly below this limit count as readable (default 5.0).

    Returns:
    - 1 if ``entropy_value`` is below ``threshold``, otherwise 0.
    """
    if entropy_value < threshold:
        return 1
    return 0


In [4]:
# Derive numeric features from the raw columns. New columns are added to `df` in place,
# in the order of the statements below.
# NOTE(review): the .apply(...) calls assume every 'key'/'value1'/'value2' entry is a
# string; a missing (NaN) entry would make len()/str methods raise — confirm the CSV has none.


# Make sure the key encoding is stored as a plain integer
df['encoded_key'] = df['encoded_key'].astype(int)
# Character lengths of the three text columns
df['key_length'] = df['key'].apply(len)
df['value1_length'] = df['value1'].apply(len)
df['value2_length'] = df['value2'].apply(len)

# value1: character counts, extension flag, and entropy-based readability flag
df['value1_dot_count'] = df['value1'].apply(count_dots)
df['value1_hyphen_count'] = df['value1'].apply(count_hyphens)
df['value1_slash_count'] = df['value1'].apply(count_slash)
df['value1_asterisk_count'] = df['value1'].apply(count_asterisk)
df['value1_capital_count'] = df['value1'].apply(count_capitals)
df['value1_has_file_extensions'] = df['value1'].apply(has_microsoft_extension)
# 'entropy' is the character entropy of value1; the readability flag uses
# is_human_readable's default threshold
df['entropy'] = df['value1'].apply(calculate_entropy)
df['value1_human_readable'] = df['entropy'].apply(is_human_readable)

# value2: the same character counts and extension flag, plus the IP category code
df['value2_dot_count'] = df['value2'].apply(count_dots)
df['value2_hyphen_count'] = df['value2'].apply(count_hyphens)
df['value2_slash_count'] = df['value2'].apply(count_slash)
df['value2_asterisk_count'] = df['value2'].apply(count_asterisk)
df['value2_capital_count'] = df['value2'].apply(count_capitals)
df['value2_has_file_extensions'] = df['value2'].apply(has_microsoft_extension)
df['value2_ip_class'] = df['value2'].apply(categorize_ip)


# Display the DataFrame
df

Unnamed: 0,key,epoch_timestamp,encoded_key,label,value1,value2,key_length,value1_length,value2_length,value1_dot_count,...,value1_has_file_extensions,entropy,value1_human_readable,value2_dot_count,value2_hyphen_count,value2_slash_count,value2_asterisk_count,value2_capital_count,value2_has_file_extensions,value2_ip_class
0,query[A] <*> from <*>,1642179603,2,1,3x6-.546-.2PoxC1PkS*qtk0p2kKZGSYsWe2X*u678tHnP...,10.35.33.111,21,233,12,8,...,1,5.845176,0,3,0,0,0,0,0,0
1,forwarded <*> to <*>,1642179603,1,1,3x6-.546-.2PoxC1PkS*qtk0p2kKZGSYsWe2X*u678tHnP...,192.168.255.254,20,233,15,8,...,1,5.845176,0,3,0,0,0,0,0,0
2,reply <*> is <*>,1642179603,0,1,3x6-.546-.2PoxC1PkS*qtk0p2kKZGSYsWe2X*u678tHnP...,195.128.194.168,16,233,15,8,...,1,5.845176,0,3,0,0,0,0,0,1
3,query[A] <*> from <*>,1642179620,2,1,3x6-.547-.WharXpRiFOnbAvznOFBIiR4EDr2FH97sAZEw...,10.35.33.111,21,233,12,8,...,1,5.861942,0,3,0,0,0,0,0,0
4,forwarded <*> to <*>,1642179620,1,1,3x6-.547-.WharXpRiFOnbAvznOFBIiR4EDr2FH97sAZEw...,192.168.255.254,20,233,15,8,...,1,5.861942,0,3,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240577,reply <*> is <*>,1642611336,0,0,database.clamav.net,<CNAME>,16,19,7,2,...,0,3.321104,1,0,0,0,0,5,0,4
240578,reply <*> is <*>,1642611336,0,0,database.clamav.net.cdn.cloudflare.net,2606:4700::6810:db54,16,38,20,5,...,0,3.698089,1,0,0,0,0,0,0,3
240579,reply <*> is <*>,1642611336,0,0,database.clamav.net.cdn.cloudflare.net,2606:4700::6810:da54,16,38,20,5,...,0,3.698089,1,0,0,0,0,0,0,3
240580,query[AAAA] <*> from <*>,1642611417,3,0,mail,172.17.131.81,24,4,13,0,...,0,2.000000,1,3,0,0,0,0,0,0


In [6]:
# Tally how many rows carry each distinct value of the 'label' column of `df`
label_counts = df['label'].value_counts()

# Print the heading and, on the following lines, the per-label counts
print("Distinct labels and their counts:", label_counts, sep="\n")


Distinct labels and their counts:
label
0    195982
1     44600
Name: count, dtype: int64


In [7]:
# Columns kept for modelling: the engineered numeric features followed by the target
# ('label'). The raw text columns, the timestamp and the raw 'entropy' value are not listed.
list_of_columns = [
    'encoded_key', 'key_length',
    'value1_length', 'value2_length',
    'value1_dot_count', 'value1_hyphen_count', 'value1_slash_count',
    'value1_asterisk_count', 'value1_capital_count',
    'value1_has_file_extensions', 'value1_human_readable',
    'value2_dot_count', 'value2_hyphen_count', 'value2_slash_count',
    'value2_asterisk_count', 'value2_capital_count',
    'value2_has_file_extensions', 'value2_ip_class',
    'label',
]

# Keep an independent copy of the target before it is removed from the feature frame
df_label = df['label'].copy()

# Restrict `df` to the listed columns and drop the target, leaving only model inputs.
# Note: `df` is overwritten, so this cell cannot be re-run without rebuilding `df` first.
df = df[list_of_columns].drop(columns=['label'])

df

Unnamed: 0,encoded_key,key_length,value1_length,value2_length,value1_dot_count,value1_hyphen_count,value1_slash_count,value1_asterisk_count,value1_capital_count,value1_has_file_extensions,value1_human_readable,value2_dot_count,value2_hyphen_count,value2_slash_count,value2_asterisk_count,value2_capital_count,value2_has_file_extensions,value2_ip_class
0,2,21,233,12,8,5,3,3,67,1,0,3,0,0,0,0,0,0
1,1,20,233,15,8,5,3,3,67,1,0,3,0,0,0,0,0,0
2,0,16,233,15,8,5,3,3,67,1,0,3,0,0,0,0,0,1
3,2,21,233,12,8,5,3,3,84,1,0,3,0,0,0,0,0,0
4,1,20,233,15,8,5,3,3,84,1,0,3,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240577,0,16,19,7,2,0,0,0,0,0,1,0,0,0,0,5,0,4
240578,0,16,38,20,5,0,0,0,0,0,1,0,0,0,0,0,0,3
240579,0,16,38,20,5,0,0,0,0,0,1,0,0,0,0,0,0,3
240580,3,24,4,13,0,0,0,0,0,0,1,3,0,0,0,0,0,0


# Model training

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# from sklearn.ensemble import IsolationForest
# from sklearn.svm import OneClassSVM
# from sklearn.cluster import DBSCAN
# from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier
# from pyod.models.hbos import HBOS

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
import numpy as np

from sklearn.metrics import classification_report
import time
import pickle

In [9]:
import joblib

def save_model(model, filename):
    """
    Save a scikit-learn model to a file using joblib.

    Parameters:
    - model: The trained scikit-learn model to be saved.
    - filename: The filename to save the model to. When a path (str or
      os.PathLike) is given, missing parent directories are created first.
    """
    # joblib.dump opens the target file directly and fails if its directory is
    # missing, so create it up front (only for real paths, not file objects)
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, filename)
    print(f"Model saved to {filename}")

def load_model(filename):
    """
    Read a previously saved scikit-learn model back from disk with joblib.

    NOTE(review): joblib files are pickle-based as far as I know; only load
    files from a trusted source — confirm against the joblib documentation.

    Parameters:
    - filename: The file to read the model from.

    Returns:
    - The model object stored in that file.
    """
    model = joblib.load(filename)
    print(f"Model loaded from {filename}")
    return model


# # Save the model
# save_model(model, 'your_model_filename.pkl')

# # Load the model
# model = load_model('your_model_filename.pkl')

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_and_format_metrics(y_true, y_pred):
    """
    Compute accuracy, precision, recall and F1 and format each as a percentage.

    Parameters:
    - y_true: The ground-truth labels.
    - y_pred: The predicted labels.

    Returns:
    - A 4-tuple of strings (accuracy, precision, recall, f1), each formatted as
      a percentage with two decimal digits, e.g. '99.87%'.
    """
    # Order matters: callers unpack the result positionally
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(f"{scorer(y_true, y_pred) * 100:.2f}%" for scorer in scorers)


In [11]:
# Split the data into training and testing sets: 20% of the rows are held out for
# testing, and the fixed random_state keeps the partition reproducible across runs
# NOTE(review): the split is not stratified although the label counts printed earlier
# are imbalanced; consider passing stratify=df_label — confirm whether that is intended
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_label,
    test_size=0.2,
    random_state=42
)

In [12]:
# Initialize the Random Forest model (fixed seed for reproducible trees)
model = RandomForestClassifier(random_state=42)

# Training — timed with perf_counter, a monotonic high-resolution clock meant for
# measuring intervals (time.time() can jump if the system clock is adjusted)
start_time = time.perf_counter()
model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

# Testing and evaluation on the held-out split
start_time = time.perf_counter()
y_pred = model.predict(X_test)
prediction_time = time.perf_counter() - start_time

# Confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print(f"Training Time: {training_time:.4f} seconds")
print(f"Prediction Time: {prediction_time:.4f} seconds")
print("\nConfusion Matrix:")
print(conf_matrix)

# Headline metrics, already formatted as percentage strings
accuracy_percentage, precision_percentage, recall_percentage, f1_percentage = calculate_and_format_metrics(y_test, y_pred)

print("\n")
print("Accuracy:", accuracy_percentage)
print("Precision:", precision_percentage)
print("Recall:", recall_percentage)
print("F1 Score:", f1_percentage)

print("\nClassification Report:")
print(class_report)

# Persist the fitted model for later use
save_model(model, 'model/DNS_rf_model.pkl')


Training Time: 5.9019 seconds
Prediction Time: 0.2451 seconds

Confusion Matrix:
[[39221     7]
 [   55  8834]]


Accuracy: 99.87%
Precision: 99.92%
Recall: 99.38%
F1 Score: 99.65%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39228
           1       1.00      0.99      1.00      8889

    accuracy                           1.00     48117
   macro avg       1.00      1.00      1.00     48117
weighted avg       1.00      1.00      1.00     48117

Model saved to model/DNS_rf_model.pkl
