# Import libraries

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px

from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col, count, when, lit
from pyspark.sql import functions as F

import plotly.express as px
import plotly.graph_objects as go

import requests
import ast


In [None]:
# Create the Spark session for this notebook (or reuse one that already exists).
session_builder = SparkSession.builder.appName('Malware')
spark = session_builder.getOrCreate()

# **Loading datasets**

## Malware DNS dataset

In [None]:
# Load one Spark DataFrame per yearly malware DNS log ('2018.log' ... '2024.log').
# The df_1..df_7 names are kept because later cells refer to them individually.
df_1, df_2, df_3, df_4, df_5, df_6, df_7 = [
    spark.read.json(f'{log_year}.log') for log_year in range(2018, 2025)
]

In [None]:
# Fields kept from every yearly log; all other fields are dropped.
dns_columns = [
    'icann_tld', 'icann_domain', 'query', 'length', 'qtype_name',
    'rcode_name', 'Z', 'rtt', 'answers', 'TTLs', 'entropy',
]

# Apply the same projection to each yearly frame.
df_1, df_2, df_3, df_4, df_5, df_6, df_7 = [
    yearly_frame.select(*dns_columns)
    for yearly_frame in (df_1, df_2, df_3, df_4, df_5, df_6, df_7)
]

# Preview the 2018 frame.
df_1.show()

In [None]:
# Tag each yearly frame with a constant "year" column: df_1 -> 2018, ..., df_7 -> 2024.
df_1, df_2, df_3, df_4, df_5, df_6, df_7 = [
    yearly_frame.withColumn("year", lit(log_year))
    for log_year, yearly_frame in enumerate(
        (df_1, df_2, df_3, df_4, df_5, df_6, df_7), start=2018
    )
]

# Preview the 2018 frame with its new column.
df_1.show()

In [None]:
# Stack the seven yearly frames, in chronological order, into one malware dataset.
df_m = df_1
for yearly_frame in (df_2, df_3, df_4, df_5, df_6, df_7):
    df_m = df_m.union(yearly_frame)

df_m.show()

In [None]:
# Row id: Spark's generated increasing id, shifted up by one.
row_id = monotonically_increasing_id() + 1

# Attach the id and label every row of this dataset as malware (malware = 1).
df_m = df_m.withColumn("id", row_id).withColumn("malware", lit(1))

df_m.show()

In [None]:
# Put the columns in their final order: year and id first, the malware label last.
final_column_order = [
    'year', 'id', 'icann_tld', 'icann_domain', 'query', 'length', 'qtype_name',
    'rcode_name', 'Z', 'rtt', 'answers', 'TTLs', 'entropy', 'malware',
]
df_m = df_m.select(*final_column_order)
df_m.show()

In [None]:
# Report the row count of every yearly frame, one line per year.
frames_by_year = zip(range(2018, 2025), (df_1, df_2, df_3, df_4, df_5, df_6, df_7))
print(" \n".join(
    f" {log_year} count is: {yearly_frame.count()}"
    for log_year, yearly_frame in frames_by_year
))

In [None]:
# Report the total number of rows in the combined malware dataset.
print(f"malware df count is {df_m.count()}")

## Normal DNS dataset

In [None]:
# Load the log that serves as the "normal" DNS dataset.
normal_log = spark.read.json('top-1m-dns.log')

# Keep only the rows whose `id.resp_h` field equals "8.8.8.8".
resolver_rows = normal_log.filter(normal_log["`id.resp_h`"] == "8.8.8.8")

# Keep the same fields that were kept for the malware logs.
df_n = resolver_rows.select(
    'icann_tld', 'icann_domain', 'query', 'length', 'qtype_name',
    'rcode_name', 'Z', 'rtt', 'answers', 'TTLs', 'entropy',
)

df_n.show()

In [None]:
# Add placeholder year/id columns and the malware = 0 label, then lay the
# columns out in the same order as the malware dataset.
df_n = (
    df_n
    .withColumn("year", lit(0))
    .withColumn("id", lit(0))
    .withColumn("malware", lit(0))
    .select(
        'year', 'id', 'icann_tld', 'icann_domain', 'query', 'length', 'qtype_name',
        'rcode_name', 'Z', 'rtt', 'answers', 'TTLs', 'entropy', 'malware',
    )
)

df_n.show()

# **Data cleaning**

In [None]:
# Inspect the column names and types of the normal dataset.
df_n.printSchema()

In [None]:
# Inspect the column names and types of the malware dataset.
df_m.printSchema()

In [19]:
# Remove malware rows whose domain also appears in the normal dataset.
# The condition is an explicit equality expression, so the left frame's column order is preserved.
same_domain = df_m['icann_domain'] == df_n['icann_domain']
df_m = df_m.join(df_n, on=same_domain, how='left_anti')

# Then remove malware rows whose full query also appears in the normal dataset.
same_query = df_m['query'] == df_n['query']
df_m = df_m.join(df_n, on=same_query, how='left_anti')

In [None]:
# Combine normal and malware rows into one dataset. union() pairs columns by
# position; both frames were given the same column order in earlier cells.
df_final = df_n.union(df_m)
# Total number of rows in the combined dataset.
df_final.count()

In [21]:
# Rebuild the combined dataset (same union as the previous cell).
df_final = df_n.union(df_m)

In [None]:
# Total number of rows in the combined dataset.
df_final.count()

In [19]:
# df_final = df_final.toPandas()
# df_final

In [None]:
# Count the rows where icann_domain is missing: when() yields 1 only for
# null domains, and count() ignores the rows where it yields nothing.
domain_is_missing = col("icann_domain").isNull()
missing_domain_count = count(when(domain_is_missing, 1)).alias("null_count_in_name")
df_final.select(missing_domain_count).show()

In [None]:
# Count the rows where query is missing, using the same when()/count() pattern
# as for icann_domain.
query_is_missing = col("query").isNull()
missing_query_count = count(when(query_is_missing, 1)).alias("null_count_in_name")
df_final.select(missing_query_count).show()

In [None]:
# Discard rows that have no icann_domain, then report how many rows remain.
df_final = df_final.dropna(subset=["icann_domain"])
df_final.count()

In [None]:
# Split the cleaned dataset back into malware rows (label 1) and all other rows.
malware_label = df_final['malware']
df_m = df_final.where(malware_label == 1)
df_n = df_final.where(malware_label != 1)

# Report the size of each half.
malware_total = df_m.count()
normal_total = df_n.count()
print(f" malware count: {malware_total} \n normal count: {normal_total}")

In [None]:
# Show how often each value of the categorical DNS fields occurs in the normal rows.
# The loop iterates over plain column names and uses a variable that is not
# called `col`, so the `col` function imported from pyspark.sql.functions stays
# usable in other cells. show() prints the table itself and returns None, so it
# is not wrapped in print().
for column_name in ('qtype_name', 'rcode_name', 'Z'):
    df_n.groupBy(column_name).count().show()

In [None]:
# Show how often each value of the categorical DNS fields occurs in the malware rows.
# The loop iterates over plain column names and uses a variable that is not
# called `col`, so the `col` function imported from pyspark.sql.functions stays
# usable in other cells. show() prints the table itself and returns None, so it
# is not wrapped in print().
for column_name in ('qtype_name', 'rcode_name', 'Z'):
    df_m.groupBy(column_name).count().show()

In [26]:
# # Step 1: Compute value counts for 'rcode_name' grouped by 'malware'
# value_counts_df = (
#     df_final.groupBy("rcode_name", "malware")
#     .agg(count("*").alias("count"))
#     .orderBy("rcode_name", "malware")
# )

# # Step 2: Convert PySpark DataFrame to Pandas for plotting
# value_counts_pd = value_counts_df.toPandas()

# # Step 3: Pivot the data for grouped bar chart format
# pivot_data = value_counts_pd.pivot(index="rcode_name", columns="malware", values="count").fillna(0)

# # Ensure proper column names
# pivot_data.columns = ["Malware_0", "Malware_1"]

# # Reset index for plotting
# pivot_data.reset_index(inplace=True)

# # Step 4: Create grouped bar chart using Plotly
# fig = go.Figure()

# # Add bars for 'malware = 0' (blue)
# fig.add_trace(go.Bar(
#     x=pivot_data["rcode_name"],
#     y=pivot_data["Malware_0"],
#     name="Malware = 0",
#     marker_color="blue"
# ))

# # Add bars for 'malware = 1' (red)
# fig.add_trace(go.Bar(
#     x=pivot_data["rcode_name"],
#     y=pivot_data["Malware_1"],
#     name="Malware = 1",
#     marker_color="red"
# ))

# # Customize layout
# fig.update_layout(
#     title="Bar Chart of rcode_name Counts by Malware Status",
#     xaxis_title="rcode_name",
#     yaxis_title="Count",
#     barmode="group",
#     template="plotly_white"
# )

# # Show the plot
# fig.show()


In [None]:
# Frequency of each distinct rtt value among the malware rows, most common first.
rtt_frequency = df_m.groupBy('rtt').count()
rtt_frequency.orderBy(F.col("count").desc()).show()

In [28]:
# # Assuming df is your PySpark DataFrame and you have the relevant columns 'malware' and 'rtt'
# # Convert PySpark DataFrame to Pandas DataFrame
# data_pd = df_final.select("malware", "rtt").toPandas()

# # Filter the data where 'rtt' > 0.75
# data_filtered = data_pd[data_pd['rtt'] > 0.75]

# # Create the boxplot using Plotly
# fig = px.box(
#     data_filtered, 
#     x="malware", 
#     y="rtt", 
#     title="Boxplot of RTT > 0.75 vs Malware",
#     labels={"malware": "Malware", "rtt": "RTT (ms)"}
# )

# # Show the plot
# fig.show()


In [None]:
# Preview rows from a seeded (seed=42) random sample of the malware rows, drawn with fraction 0.2.
df_m.sample(fraction=0.2, seed=42).show()

# **Analysis**

## *TTL*

### preprocessing

In [None]:
# Working frame for the TTL analysis: the domain, its TTL list and the malware label.
ttl_columns = ['icann_domain', 'TTLs', 'malware']
df_ttl = df_final.select(*ttl_columns)
df_ttl.show()

In [None]:
# Row count of the TTL frame before rows with nulls are dropped
# (the misspelled "bedfor" in the printed label is corrected).
print(f"count of TTL before drop nulls: {df_ttl.count()}")

In [None]:
# Count the rows with a missing TTL list, separately for normal (0) and malware (1) rows.
ttl_is_missing = df_ttl['TTLs'].isNull()
normal_missing_ttl = df_ttl.filter(ttl_is_missing & (df_ttl['malware'] == 0)).count()
malware_missing_ttl = df_ttl.filter(ttl_is_missing & (df_ttl['malware'] == 1)).count()

print(f"number of null values in normals:   {normal_missing_ttl}")
print(f"number of null values in malwares:  {malware_missing_ttl}")

In [38]:
# Remove every row that has a null in any of the three TTL-frame columns.
df_ttl = df_ttl.dropna(how="any")

In [None]:
# Row count of the TTL frame once rows containing nulls have been removed.
print(f"count of TTL after drop nulls: {df_ttl.count()}")

### unique

In [None]:
# One row per individual TTL value: explode() unpacks each TTLs array.
exploded_df = df_ttl.withColumn("TTL", F.explode("TTLs"))

# Gather all TTL values of a (domain, label) pair back into one sorted array.
sorted_ttl_values = F.sort_array(F.collect_list('TTL')).alias('sorted_TTLs')
df_ttl = exploded_df.groupBy('icann_domain', 'malware').agg(sorted_ttl_values)

df_ttl.show()

In [None]:
# Preview up to 50 rows from a seeded (seed=42) random sample of the grouped TTL frame.
df_ttl.sample(fraction=0.2, seed=42).show(50)

In [None]:
# Number of grouped rows per domain, largest first; a count above 1 means the
# domain appears under more than one malware label.
rows_per_domain = df_ttl.groupBy('icann_domain').count()
rows_per_domain.orderBy("count", ascending=False).show()

In [None]:
# How many grouped TTL rows belong to each class.
ttl_label = df_ttl['malware']
normal_ttl_rows = df_ttl.filter(ttl_label != 1).count()
malware_ttl_rows = df_ttl.filter(ttl_label == 1).count()

print(f"number of normal values:  {normal_ttl_rows}")
print(f"number of malware values: {malware_ttl_rows}")

### new ttl dataframe 

In [45]:
# Per-(domain, label) TTL statistics, computed directly on the sorted array.
ttl_min = F.array_min("sorted_TTLs").alias("min")  # smallest TTL
ttl_max = F.array_max("sorted_TTLs").alias("max")  # largest TTL
# Mean TTL: sum the array with a double accumulator, then divide by its length.
ttl_mean = F.expr(
    "aggregate(sorted_TTLs, 0D, (acc, x) -> acc + x) / size(sorted_TTLs)"
).alias("mean")

# Final layout: domain, raw values, the three statistics, then the label.
df_ttl_result = df_ttl.select(
    'icann_domain', 'sorted_TTLs', ttl_min, ttl_max, ttl_mean, 'malware'
)

In [None]:
# Preview rows from a seeded (seed=42) random sample of the TTL statistics, drawn with fraction 0.2.
df_ttl_result.sample(fraction=0.2, seed=42).show()

### TTL eda

In [None]:
# Bring the TTL statistics to the driver as a pandas DataFrame for plotting.
df_ttl_pd = df_ttl_result.toPandas()

# Human-readable class label: 'Malware' where malware == 1, otherwise 'Normal'.
df_ttl_pd['Group'] = np.where(df_ttl_pd['malware'] == 1, 'Malware', 'Normal')

# Box plot of the per-domain mean TTL, one box per class.
ttl_box_options = {
    'x': 'Group',
    'y': 'mean',
    'title': 'Distribution of Mean Values by Malware Status',
    'labels': {'mean': 'Mean Value', 'Group': 'Malware Status'},
}
fig = px.box(df_ttl_pd, **ttl_box_options)

fig.show()

In [None]:
# Keep only the rows whose mean TTL is below 30000 so the boxes are readable.
df_ttl_filtered = df_ttl_result.where(df_ttl_result['mean'] < 30000)

# Bring the filtered statistics to the driver as a pandas DataFrame.
df_ttl_filtered_pd = df_ttl_filtered.toPandas()

# Human-readable class label: 'Malware' where malware == 1, otherwise 'Normal'.
df_ttl_filtered_pd['Group'] = np.where(
    df_ttl_filtered_pd['malware'] == 1, 'Malware', 'Normal'
)

# Box plot (Plotly Express) of the filtered mean TTL, one box per class.
ttl_filtered_box_options = {
    'x': 'Group',
    'y': 'mean',
    'title': 'Distribution of Mean Values by Malware Status (Under 30k)',
    'labels': {'mean': 'Mean Value', 'Group': 'Malware Status'},
}
fig = px.box(df_ttl_filtered_pd, **ttl_filtered_box_options)

fig.show()

## *RTT*

### preprocessing

In [None]:
# Working frame for the RTT analysis: the domain, its rtt value and the malware label.
rtt_columns = ['icann_domain', 'rtt', 'malware']
df_rtt = df_final.select(*rtt_columns)
df_rtt.show()

In [None]:
# Row count of the RTT frame before rows with nulls are dropped.
# The label now names RTT and "before"; it was copied from the TTL section
# and described the wrong metric and the wrong stage.
print(f"count of RTT before drop nulls: {df_rtt.count()}")

In [None]:
# Count the rows with a missing rtt, separately for normal (0) and malware (1) rows.
rtt_is_missing = df_rtt['rtt'].isNull()
normal_missing_rtt = df_rtt.filter(rtt_is_missing & (df_rtt['malware'] == 0)).count()
malware_missing_rtt = df_rtt.filter(rtt_is_missing & (df_rtt['malware'] == 1)).count()

print(f"number of null values in normals:   {normal_missing_rtt}")
print(f"number of null values in malwares:  {malware_missing_rtt}")

In [42]:
# Remove every row that has a null in any of the three RTT-frame columns.
df_rtt = df_rtt.dropna(how="any")

In [None]:
# Row count of the RTT frame once rows containing nulls have been removed.
# The label now names RTT; it was copied from the TTL section.
print(f"count of RTT after drop nulls: {df_rtt.count()}")

### unique

In [None]:
# Gather all rtt values of a (domain, label) pair into one sorted array.
sorted_rtt_values = F.sort_array(F.collect_list('rtt')).alias('sorted_rtt')
df_rtt = df_rtt.groupBy('icann_domain', 'malware').agg(sorted_rtt_values)

# Preview the grouped frame.
df_rtt.show()

In [None]:
# Number of grouped rows per domain, largest first; a count above 1 means the
# domain appears under more than one malware label.
rtt_rows_per_domain = df_rtt.groupBy('icann_domain').count()
rtt_rows_per_domain.orderBy("count", ascending=False).show()

In [None]:
# How many grouped RTT rows belong to each class.
rtt_label = df_rtt['malware']
normal_rtt_rows = df_rtt.filter(rtt_label != 1).count()
malware_rtt_rows = df_rtt.filter(rtt_label == 1).count()

print(f"number of normal values:  {normal_rtt_rows}")
print(f"number of malware values: {malware_rtt_rows}")

### new rtt dataframe 

In [47]:
# Per-(domain, label) RTT statistics, computed directly on the sorted array.
rtt_min = F.array_min("sorted_rtt").alias("min")  # smallest rtt
rtt_max = F.array_max("sorted_rtt").alias("max")  # largest rtt
# Mean rtt: sum the array with a double accumulator, then divide by its length.
rtt_mean = F.expr(
    "aggregate(sorted_rtt, 0D, (acc, x) -> acc + x) / size(sorted_rtt)"
).alias("mean")

# Final layout: domain, raw values, the three statistics, then the label.
df_rtt_result = df_rtt.select(
    'icann_domain', 'sorted_rtt', rtt_min, rtt_max, rtt_mean, 'malware'
)

In [None]:
# Preview rows from a seeded (seed=42) random sample of the RTT statistics, drawn with fraction 0.2.
df_rtt_result.sample(fraction=0.2, seed=42).show()

### RTT eda

In [None]:
# Bring the RTT statistics to the driver as a pandas DataFrame for plotting.
df_rtt_pd = df_rtt_result.toPandas()

# Human-readable class label: 'Malware' where malware == 1, otherwise 'Normal'.
df_rtt_pd['Group'] = np.where(df_rtt_pd['malware'] == 1, 'Malware', 'Normal')

# Box plot of the per-domain mean rtt, one box per class.
rtt_box_options = {
    'x': 'Group',
    'y': 'mean',
    'title': 'RTT values',
    'labels': {'mean': 'Mean Value', 'Group': 'Malware Status'},
}
fig = px.box(df_rtt_pd, **rtt_box_options)

fig.show()

In [None]:
# Keep only the rows whose mean rtt is below 0.75 so the boxes are readable.
df_rtt_filtered = df_rtt_result.where(df_rtt_result['mean'] < 0.75)

# Bring the filtered statistics to the driver as a pandas DataFrame.
df_rtt_filterd_pd = df_rtt_filtered.toPandas()

# Human-readable class label: 'Malware' where malware == 1, otherwise 'Normal'.
df_rtt_filterd_pd['Group'] = np.where(
    df_rtt_filterd_pd['malware'] == 1, 'Malware', 'Normal'
)

# Box plot of the filtered mean rtt, one box per class.
rtt_filtered_box_options = {
    'x': 'Group',
    'y': 'mean',
    'title': 'RTT filtered values (Under 0.75)',
    'labels': {'mean': 'Mean Value', 'Group': 'Malware Status'},
}
fig = px.box(df_rtt_filterd_pd, **rtt_filtered_box_options)

fig.show()

## *Entropy*

### preprocessing

In [None]:
# Working frame for the entropy analysis: the domain, its entropy value and the malware label.
entropy_columns = ['icann_domain', 'entropy', 'malware']
df_ent = df_final.select(*entropy_columns)
df_ent.show()

In [None]:
# Row count of the entropy frame before rows with nulls are dropped.
# The label now names entropy and "before"; it was copied from the TTL section
# and described the wrong metric and the wrong stage.
print(f"count of entropy before drop nulls: {df_ent.count()}")

In [None]:
# Count the rows with a missing entropy, separately for normal (0) and malware (1) rows.
entropy_is_missing = df_ent['entropy'].isNull()
normal_missing_entropy = df_ent.filter(entropy_is_missing & (df_ent['malware'] == 0)).count()
malware_missing_entropy = df_ent.filter(entropy_is_missing & (df_ent['malware'] == 1)).count()

print(f"number of null values in normals:   {normal_missing_entropy}")
print(f"number of null values in malwares:  {malware_missing_entropy}")

In [52]:
# Remove every row that has a null in any of the three entropy-frame columns.
df_ent = df_ent.dropna(how="any")

In [None]:
# Row count of the entropy frame once rows containing nulls have been removed.
# The label now names entropy; it was copied from the TTL section.
print(f"count of entropy after drop nulls: {df_ent.count()}")

### unique

In [None]:
# Gather all entropy values of a (domain, label) pair into one sorted array.
sorted_entropy_values = F.sort_array(F.collect_list('entropy')).alias('sorted_entropy')
df_ent = df_ent.groupBy('icann_domain', 'malware').agg(sorted_entropy_values)

# Preview the grouped frame.
df_ent.show()

In [None]:
# Number of grouped rows per domain, largest first; a count above 1 means the
# domain appears under more than one malware label.
entropy_rows_per_domain = df_ent.groupBy('icann_domain').count()
entropy_rows_per_domain.orderBy("count", ascending=False).show()

In [None]:
# How many grouped entropy rows belong to each class.
entropy_label = df_ent['malware']
normal_entropy_rows = df_ent.filter(entropy_label != 1).count()
malware_entropy_rows = df_ent.filter(entropy_label == 1).count()

print(f"number of normal values:  {normal_entropy_rows}")
print(f"number of malware values: {malware_entropy_rows}")

### new ent dataframe 

In [57]:
# Per-(domain, label) entropy statistics, computed directly on the sorted array.
entropy_min = F.array_min("sorted_entropy").alias("min")  # smallest entropy
entropy_max = F.array_max("sorted_entropy").alias("max")  # largest entropy
# Mean entropy: sum the array with a double accumulator, then divide by its length.
entropy_mean = F.expr(
    "aggregate(sorted_entropy, 0D, (acc, x) -> acc + x) / size(sorted_entropy)"
).alias("mean")

# Final layout: domain, raw values, the three statistics, then the label.
df_ent_result = df_ent.select(
    'icann_domain', 'sorted_entropy', entropy_min, entropy_max, entropy_mean, 'malware'
)

In [None]:
# Preview rows from a seeded (seed=42) random sample of the entropy statistics, drawn with fraction 0.2.
df_ent_result.sample(fraction=0.2, seed=42).show()

### Ent eda

In [None]:
# Bring the entropy statistics to the driver as a pandas DataFrame for plotting.
df_ent_pd = df_ent_result.toPandas()

# Human-readable class label: 'Malware' where malware == 1, otherwise 'Normal'.
df_ent_pd['Group'] = np.where(df_ent_pd['malware'] == 1, 'Malware', 'Normal')

# Box plot of the per-domain mean entropy, one box per class.
entropy_box_options = {
    'x': 'Group',
    'y': 'mean',
    'title': 'Distribution of Mean Values by Malware Status',
    'labels': {'mean': 'Mean Value', 'Group': 'Malware Status'},
}
fig = px.box(df_ent_pd, **entropy_box_options)

fig.show()