# Building a Model to Detect Randomly Generated Domains

In [None]:
# Import Needed Modules
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import math
import re

In [None]:
# Helper functions for feature creation
def get_tld(domain):
    """Return the top-level domain of ``domain``.

    The name is split on "." and the last label is the TLD. When the last
    label is shorter than 3 characters and the label before it is shorter
    than 4 characters (e.g. "co.uk"), the two labels are treated as one
    compound TLD and returned joined by a dot.

    A compound TLD is only recognised when the name has more than two
    labels, the same rule get_2ld and get_3ld apply. Without that guard a
    single-label name wrapped around via a negative index ("uk" became
    "uk.uk") and a two-label name such as "bit.ly" reported the whole name
    as its TLD.

    Args:
        domain: Dot-separated domain name string.

    Returns:
        The TLD string, e.g. "com" or "co.uk".
    """
    parts = domain.split(".")
    tld = parts[-1]
    if len(parts) > 2 and len(tld) < 3 and len(parts[-2]) < 4:
        tld = parts[-2] + "." + tld
    return tld


def get_2ld(domain):
    """Return the second-level label of ``domain``.

    The second-level label is normally the one just before the last label.
    When the name has more than two labels and ends in a compound TLD (last
    label shorter than 3 characters, the one before it shorter than 4,
    e.g. "co.uk"), the label before the compound TLD is returned instead.

    A single-label name has no second-level label, so "" is returned; the
    unguarded negative index previously handed back the only label.

    Args:
        domain: Dot-separated domain name string.

    Returns:
        The second-level label, or "" when the name has fewer than 2 labels.
    """
    parts = domain.split(".")
    if len(parts) < 2:
        return ""
    second_index = len(parts) - 2
    if len(parts) > 2 and len(parts[-1]) < 3 and len(parts[-2]) < 4:
        # Compound TLD occupies two labels, so step one label further left.
        second_index -= 1
    return parts[second_index]


def get_3ld(domain):
    """Return the third-level label of ``domain``, or "" if there is none.

    The third-level label is the one immediately left of the second-level
    label. When the name ends in a compound TLD (last label shorter than 3
    characters, the one before it shorter than 4, e.g. "co.uk"), the
    second-level label sits one position further left, and so does the
    third-level label.

    A name such as "example.co.uk" has three labels but no third-level
    label; "" is returned for it. The index used to go negative there and
    wrap around to the last label ("uk").

    Args:
        domain: Dot-separated domain name string.

    Returns:
        The third-level label, or "" when the name does not have one.
    """
    parts = domain.split(".")
    if len(parts) < 3:
        return ""
    second_index = len(parts) - 2
    if len(parts[-1]) < 3 and len(parts[-2]) < 4:
        # Compound TLD occupies two labels, so step one label further left.
        second_index -= 1
    third_index = second_index - 1
    if third_index < 0:
        # Nothing left of the second-level label.
        return ""
    return parts[third_index]


def num_parts(domain):
    """Return how many dot-separated labels ``domain`` contains."""
    # Splitting on "." always yields one more piece than there are dots.
    dot_total = domain.count(".")
    return dot_total + 1


def distinct_char(domain):
    """Return the number of different characters appearing in ``domain``."""
    seen = {ch for ch in domain}
    return len(seen)


def count_digits(domain):
    """Return how many characters of ``domain`` are digits (``str.isdigit``)."""
    digit_chars = [ch for ch in domain if ch.isdigit()]
    return len(digit_chars)


def count_dashes(domain):
    """Return the number of "-" characters in ``domain``."""
    return domain.count("-")


def shannonEntropy(domain):
    """Return the Shannon entropy, in bits, of the characters of ``domain``.

    Each distinct character contributes ``p * log2(p)``, where ``p`` is the
    share of the string made up of that character; the negated sum of the
    contributions is returned. An empty string has no characters to sum
    over and yields 0.
    """
    total = len(domain)
    shares = (domain.count(symbol) / total for symbol in set(domain))
    weighted_logs = sum(p * np.log2(p) for p in shares)
    return -weighted_logs


def metricEntropy(domain):
    """Return the Shannon entropy of ``domain`` divided by its length.

    Args:
        domain: The string to measure.

    Returns:
        ``shannonEntropy(domain) / len(domain)``, or 0.0 for an empty
        string, which used to raise ZeroDivisionError.
    """
    strLength = len(domain)
    if strLength == 0:
        # No characters: nothing to normalise by.
        return 0.0
    return shannonEntropy(domain) / strLength

In [None]:
# Import Data
# Load the raw training set from a parquet file into a DataFrame. Later cells
# read the columns 'domain', 'label', 'source' and 'ttl' from it.
# The path is relative, so it resolves against the current working directory.
datafile = "data/training-data-raw.parquet"
df = pd.read_parquet(datafile)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# List of Features
# Create the following 23 features
# 
# 1 Number of Domain Parts
# 2 Length of tld
# 3 Length of 2LD
# 4 Length of 3LD
# 5 Has a 3LD
# 6 Has more than 3LD
# 7 Len tld < 3 (ends in just a country code)
# 8 Is .edu
# 9 Is .gov
# 10 Is .com
# 11 Is .net
# 12 Is .org
# 13 Is .info
# 14 Is .biz
# 15 Distinct Char
# 16 DigitCount
# 17 Has Digit
# 18 Num Dashes
# 19 Has Dash
# 20 Length of anything past 3LD
# 21 Percent Distinct
# 22 Percent Digits
# 23 Metric Entropy

In [None]:
# --- Structure of the name: label count and per-level label lengths ---
df['num_parts'] = df['domain'].apply(num_parts)
df['len_tld'] = df['domain'].apply(lambda x: len(get_tld(x)))
df['len_2ld'] = df['domain'].apply(lambda x: len(get_2ld(x)))
df['len_3ld'] = df['domain'].apply(lambda x: len(get_3ld(x)))
df = df.assign(has_3ld=(df.len_3ld > 0)*1)
df = df.assign(more_than_3ld=(df.num_parts > 3)*1)
df = df.assign(two_letter_tld=(df.len_tld < 3)*1)

# --- TLD family flags ---
# str.contains treats its pattern as a regular expression, so an unescaped
# "." matches ANY character and ".com" also hit names like "welcome.net".
# Each pattern below escapes the dot and requires the label to end at the next
# dot or at the end of the name, so only a whole label such as ".com" counts.
# Non-capturing groups avoid the pandas warning about match groups.
df = df.assign(is_edu=(df.domain.str.contains(r"\.edu(?:\.|$)")*1))
df = df.assign(is_gov=(df.domain.str.contains(r"\.(?:gov|govt|gouv)(?:\.|$)")*1))
df = df.assign(is_com=(df.domain.str.contains(r"\.com(?:\.|$)")*1))
df = df.assign(is_net=(df.domain.str.contains(r"\.net(?:\.|$)")*1))
df = df.assign(is_org=(df.domain.str.contains(r"\.org(?:\.|$)")*1))
df = df.assign(is_info=(df.domain.str.contains(r"\.info(?:\.|$)")*1))
df = df.assign(is_biz=(df.domain.str.contains(r"\.biz(?:\.|$)")*1))

# --- Character-level counts ---
df['distinct_char'] = df['domain'].apply(distinct_char)
df['digit_count'] = df['domain'].apply(count_digits)
df = df.assign(has_digit=(df.digit_count > 0)*1)
df = df.assign(num_dashes=(df.domain.str.count('-')))
df = df.assign(has_dash=(df.domain.str.contains("-")*1))

# Length of everything left of the 3LD. The "- 3" removes the three dots that
# separate the TLD, 2LD, 3LD and the remainder. A four-label name with a
# compound TLD (e.g. "www.example.co.uk") has nothing left of its 3LD and the
# raw arithmetic yields -1, so the value is clipped at 0.
df['length_extra'] = np.where(
    df['more_than_3ld'] == 1,
    (df.domain.str.len() - df.len_tld - df.len_2ld - df.len_3ld - 3).clip(lower=0),
    0)

# --- Ratios and entropy, normalised by the full name length ---
df = df.assign(percent_distinct=df.distinct_char / df.domain.str.len())
df = df.assign(percent_digits=df.digit_count / df.domain.str.len())
df['entropy'] = df['domain'].apply(metricEntropy)

In [None]:
# Inspect the first 20 rows to sanity-check the newly added feature columns.
df.head(20)

In [None]:
# Columns fed to the model, grouped by the kind of signal they carry.
# NOTE(review): 'ttl' is not built in the feature cell above; presumably it
# ships with the loaded data — confirm.
columnList = (
    # label structure
    ['num_parts', 'len_tld', 'len_2ld', 'len_3ld', 'has_3ld', 'more_than_3ld',
     'two_letter_tld']
    # TLD family flags
    + ['is_edu', 'is_gov', 'is_com', 'is_net', 'is_org', 'is_info', 'is_biz']
    # character-level counts
    + ['distinct_char', 'digit_count', 'has_digit', 'num_dashes', 'has_dash']
    # lengths, ratios, entropy, and the DNS ttl column
    + ['length_extra', 'percent_distinct', 'percent_digits', 'entropy', 'ttl']
)


In [None]:
# Class balance: number of non-null 'source' entries for each label value.
baselineCount = df.groupby('label')["source"].count()
print(baselineCount)

In [None]:
# Ratio of the row count for label 1 to the row count for label 0.
# NOTE(review): assumes 'label' takes the integer values 0 and 1 — confirm.
baselineCount[1]/baselineCount[0]

In [None]:
# Same ratio as the previous cell, with an explicit float factor. Under
# Python 3 true division the factor does not change the result.
baselineCount[1]*1.0/baselineCount[0]

In [None]:
# Alternative estimator kept for reference: L2-regularised logistic regression
# with built-in cross-validation over 20 values of C.
# model = linear_model.LogisticRegressionCV(Cs=20, penalty='l2')
# Random forest with 5 trees, each limited to depth 20, leaves allowed to hold
# a single sample.
# NOTE(review): no random_state is passed, so repeated runs can give different
# scores.
model = RandomForestClassifier(
    n_estimators=5, min_samples_leaf=1, max_depth=20) ## Notice the use of hyperparameters here

In [None]:
from sklearn import metrics
# 10-fold cross-validated accuracy of `model` on the selected feature columns.
# (Pass scoring='roc_auc' instead to get per-fold AUC.)
acc = cross_val_score(
    estimator=model,
    X=df[columnList],
    y=df.label,
    cv=10,
    scoring='accuracy',
)
# Per-fold scores first, then their mean.
print("Acc:", acc, sep="")
print("Overall:", acc.mean(), sep="")

In [None]:
# 10-fold cross-validated ROC AUC of `model` on the selected feature columns.
auc = cross_val_score(
    estimator=model,
    X=df[columnList],
    y=df.label,
    cv=10,
    scoring='roc_auc',
)
# Per-fold scores first, then their mean.
print("AUC:", auc, sep="")
print("Overall:", auc.mean(), sep="")