# Classification


In [None]:
import pandas as pd
import numpy as np

# Feature tables read from CSV files in the working directory
# (presumably written earlier with save_dataframe -- confirm).
PROCESSED_AGD_DATASET = pd.read_csv('PROCESSED_AGD_DATASET.csv')
PROCESSED_BENIGN_DATASET = pd.read_csv('PROCESSED_BENIGN_DATASET.csv')

# Family names; each one is matched against the `agd_type` column below.
AGD = [
    'banjori', 'dnschanger', 'fobber', 'kraken', 'monerodownloader', 'murofret',
    'necurs', 'newgoz', 'nymaim', 'padcrypt', 'pushdo', 'pyskpa',
    'qadars', 'ranbyus', 'shiotob', 'simda', 'symmi', 'zloader',
]

# Containers and placeholders that the following cells fill in.
DATAFRAME_AGD, BALANCED_DATAFRAME_AGD = {}, {}
BALANCED_DATAFRAME_BENIGN = None
FINAL_DATAFRAME_AGD = FINAL_DATAFRAME_BENIGN = None
FINAL_DATAFRAME = None
FEATURE_LABELS = None

In [None]:
# !apt install enchant
# !pip install pyenchant

In [None]:
from sklearn.utils import resample

# One DataFrame per AGD family, keyed by the family name found in `agd_type`.
DATAFRAME_AGD.update({
    family: PROCESSED_AGD_DATASET[PROCESSED_AGD_DATASET['agd_type'] == family]
    for family in AGD
})

In [None]:
# Balance the classes: draw the same number of rows from every AGD family,
# then draw a benign sample as large as all AGD samples together.
SAMPLES_PER_FAMILY = 1000
RESAMPLE_SEED = 42  # fixed so that Restart & Run All reproduces the same sample
y = SAMPLES_PER_FAMILY  # original name, kept in case another cell still reads it

for x in AGD:
  family_rows = DATAFRAME_AGD[x]
  # A family with fewer rows than requested can only reach the target size by
  # sampling with replacement. Decide that explicitly instead of retrying
  # inside a bare `except:`, which would also hide unrelated errors.
  BALANCED_DATAFRAME_AGD[x] = resample(
      family_rows,
      replace=len(family_rows) < SAMPLES_PER_FAMILY,
      n_samples=SAMPLES_PER_FAMILY,
      random_state=RESAMPLE_SEED,
  )

# Benign rows are drawn without replacement, so this still raises if the
# benign table holds fewer than len(AGD) * SAMPLES_PER_FAMILY rows.
BALANCED_DATAFRAME_BENIGN = resample(
    PROCESSED_BENIGN_DATASET,
    replace=False,
    n_samples=len(AGD) * SAMPLES_PER_FAMILY,
    random_state=RESAMPLE_SEED,
)

In [None]:
 # Stack the balanced per-family samples into one AGD frame; the benign sample is used as-is.
 FINAL_DATAFRAME_BENIGN = BALANCED_DATAFRAME_BENIGN
 FINAL_DATAFRAME_AGD = pd.concat([BALANCED_DATAFRAME_AGD[family] for family in BALANCED_DATAFRAME_AGD])

In [None]:
# AGD rows first, benign rows after; the original row indices are kept (not reset).
FINAL_DATAFRAME = pd.concat((FINAL_DATAFRAME_AGD, FINAL_DATAFRAME_BENIGN))

In [None]:
# Labels come from `domain_class`; features are every column from the third onwards
# (assumes the CSVs use the ['domain_class', 'agd_type', ...features] layout -- confirm).
LABEL_ARRAY = FINAL_DATAFRAME['domain_class'].to_numpy(copy=True)
FEATURE_ARRAY = FINAL_DATAFRAME.iloc[:, 2:].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
(TRAIN_FEATURES, TEST_FEATURES,
 TRAIN_LABELS, TEST_LABELS) = train_test_split(
    FEATURE_ARRAY, LABEL_ARRAY, test_size=0.2, random_state=42)

# Report the shape of each of the four arrays.
for caption, array in (
    ('Training Features Shape:', TRAIN_FEATURES),
    ('Training Labels Shape:', TRAIN_LABELS),
    ('Testing Features Shape:', TEST_FEATURES),
    ('Testing Labels Shape:', TEST_LABELS),
):
  print(caption, array.shape)

Training Features Shape: (28800, 16)
Training Labels Shape: (28800,)
Testing Features Shape: (7200, 16)
Testing Labels Shape: (7200,)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 2500 trees; random_state is fixed so repeated fits on the same data match.
classifier = RandomForestClassifier(n_estimators = 2500, random_state = 42)
# Last expression of the cell, so the fitted estimator's repr is what the cell displays.
classifier.fit(TRAIN_FEATURES,TRAIN_LABELS)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
# Column names after the two leading label columns, printed next to the
# importance the fitted forest assigns to each of them.
FEATURE_LABELS = FINAL_DATAFRAME_AGD.columns[2:].tolist()
for name, importance in zip(FEATURE_LABELS, classifier.feature_importances_):
    print((name, importance))

('domain_len', 0.13262987525515665)
('vowel_count_to_domain_length', 0.014816314838060467)
('digit_count_to_domain_length', 0.0038526989145963755)
('repeat_character_count_to_domain_length', 0.024813634099783762)
('shanon_entropy', 0.0896919470351042)
('number_of_meaningfull_substring_to_domain_length', 0.0407084100332614)
('longest_meaningfull_substring_to_domain_length', 0.019512802656065174)
('consecutive_digit_ratio', 0.14050869011665063)
('entropy_relative_to_en_words', 0.012791155491739056)
('entropy_relative_to_en_domain', 0.018302858873758827)
('bi_gram_score_relative_to_en_domains', 0.11252048918180244)
('tri_gram_score_relative_to_en_domains', 0.1263748684232117)
('quad_gram_score_relative_to_en_domain', 0.1434532785279071)
('bi_gram_score_relative_to_en_words', 0.05268663871032726)
('tri_gram_score_relative_to_en_words', 0.041383328278917116)
('quad_gram_score_relative_to_en_words', 0.02595300956365801)


In [None]:
# Predicted class label for every held-out row.
pred = classifier.predict(TEST_FEATURES)

In [None]:
from sklearn.metrics import accuracy_score
# Share of held-out rows whose predicted label equals the true label (shown as the cell output).
accuracy_score(TEST_LABELS, pred)

0.97875

AUC (area under the ROC curve) is a desirable metric for the following two reasons:

- AUC is scale-invariant: it measures how well predictions are ranked, rather than their absolute values.
- AUC is classification-threshold-invariant: it measures the quality of the model's predictions irrespective of which classification threshold is chosen.

In [None]:
from sklearn.metrics import roc_auc_score
# Use the second probability column (the class at classifier.classes_[1]) as the ranking score.
class_probabilities = classifier.predict_proba(TEST_FEATURES)
prob_y_2 = [row[1] for row in class_probabilities]
auc = roc_auc_score(TEST_LABELS, prob_y_2)
print("AUC score :", auc)

AUC score : 0.9977988038425003


In [None]:
# PRINT A TREE FROM CLASSIFIER

# Import tools needed for visualization

from sklearn.tree import export_graphviz
import pydot

# Take one fitted tree (index 5) out of the forest.
tree = classifier.estimators_[5]

# Dump that tree as Graphviz source into tree.dot ...
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=FEATURE_LABELS,
    rounded=True,
    precision=1,
)

# ... read the source back (exactly one graph is expected in the file) ...
[graph] = pydot.graph_from_dot_file('tree.dot')

# ... and render it to tree.png.
graph.write_png('tree.png')

# API

In [None]:
# # # !mv *.csv ./DATASET/AGD/
# !pip install pyenchant
# !apt install enchant

In [None]:
# FILE 1 : GLOBAL_VARIABLES
from collections import Counter

# Cleaned / processed tables; nothing is loaded when this cell runs.
# NOTE(review): the first cell of the notebook also assigns PROCESSED_*_DATASET,
# so running this cell afterwards resets them to None.
CLEANED_AGD_DATASET = CLEANED_BENIGN_DATASET = None
PROCESSED_AGD_DATASET = PROCESSED_BENIGN_DATASET = None

# Root directory scanned by find_and_load_dataset().
DATASET_LOCATION = '/content/DATASET'

# Memo tables keyed by the domain string.
PARSED_DOMAIN = dict()
CHARACTER_IN_DOMAIN = dict()
DOMAIN_MEANINGFULL_STRING = dict()

# Character-frequency tables; the 'total' entry accumulates the number of characters seen.
CHARACTER_DISTRIBUTION_ENGLISH_WORDS = Counter(total=0)
CHARACTER_DISTRIBUTION_ENGLISH_DOMAINS = Counter(total=0)

# n-gram frequency tables filled from domains ...
DOMAIN_BIGRAM_DISTRIBUTION = Counter(total=0)
DOMAIN_TRIGRAM_DISTRIBUTION = Counter(total=0)
DOMAIN_QUADGRAM_DISTRIBUTION = Counter(total=0)

# ... and from English words.
ENGLISH_WORD_BIGRAM_DISTRIBUTION = Counter(total=0)
ENGLISH_WORD_TRIGRAM_DISTRIBUTION = Counter(total=0)
ENGLISH_WORD_QUADGRAM_DISTRIBUTION = Counter(total=0)

VOWELS = list('aeiou')

# Feature name -> feature function; populated by a later cell.
FEATURES = dict()

# Raw datasets, grouped by the sub-directory they were loaded from.
DATASETS = {group: {} for group in ('AGD', 'BENIGN', 'EN_WORDS', 'TEST')}

In [None]:
# FILE 2 : DOMAIN PARSING

# Parse a domain name
def parse_domain(domain):
    """Split `domain` on '.' and number its labels from the right.

    Returns a dict mapping level -> lower-cased label, where level 1 is the
    right-most label (e.g. 'www.fb.gh.com' -> {1: 'com', 2: 'gh', 3: 'fb', 4: 'www'}).
    """
    labels_right_to_left = reversed(domain.split('.'))
    return {
        level: label.lower()
        for level, label in enumerate(labels_right_to_left, start=1)
    }

def get_parsed_domain(domain):
  """Return the level -> label mapping of `domain`, parsing it at most once.

  Results are memoised in the module-level PARSED_DOMAIN dict.
  """
  try:
    return PARSED_DOMAIN[domain]
  except KeyError:
    parsed = PARSED_DOMAIN[domain] = parse_domain(domain)
    return parsed

def get_top_level_domain(domain):
  """Return the right-most label of `domain` (level 1 of its parsed form)."""
  levels = get_parsed_domain(domain)
  return levels[1]

def get_second_level_domain(domain):
  """Return the second label from the right of `domain`.

  Raises KeyError when `domain` has fewer than two dot-separated labels.
  """
  levels = get_parsed_domain(domain)
  return levels[2]

def is_parsed(domain):
  """Return True when `domain` already has an entry in PARSED_DOMAIN."""
  already_cached = domain in PARSED_DOMAIN
  return already_cached

# Test
# print(get_parsed_domain('www.fb.gh.com'))
# print(get_parsed_domain('www.lg.mt.trtsfd.in'))
# print(get_top_level_domain('www.fb.gh.com'))
# print(get_second_level_domain('www.fb.gh.com'))

In [None]:
# FILE 3 : MAIN FUNCTIONS

import pandas as pd
import enchant
# Shared en_US spell-check dictionary; isMeaningfull() queries it for every candidate substring.
dictionary = enchant.Dict("en_US")

# CHARACTER COUNT OF A DOMAIN
def characters_in_domain(domain):
  """Return a Counter of the characters of `domain`.

  The Counter is built once per distinct string and memoised in
  CHARACTER_IN_DOMAIN, so repeated calls return the same object.
  """
  counts = CHARACTER_IN_DOMAIN.get(domain)
  if counts is None:
    counts = CHARACTER_IN_DOMAIN[domain] = Counter(domain)
  return counts

# CREATE A DISTRIBUTION

def add_to_distribution(domain,distribution):
  """Fold the character counts of `domain` into `distribution`, in place.

  Args:
    domain: string whose characters are counted.
    distribution: Counter to update; its 'total' entry is increased by the
      number of characters in `domain`.

  The counts are obtained through characters_in_domain(), so a string that
  has not been cached yet is counted on the fly instead of raising KeyError.
  """
  counts = characters_in_domain(domain)
  # Counter's in-place add mutates the caller's object; no return value is needed.
  distribution += counts
  distribution['total'] += sum(counts.values())

# LIST N-GRAM OF STRING
def n_gram(string, n):
    """Return every window of `n` consecutive characters of `string`, left to right.

    Windows that would run past the end of the string are not produced, so a
    string shorter than `n` yields an empty list.
    """
    total = len(string)
    grams = []
    for start in range(total):
        stop = start + n
        if stop > total:
            break  # every later window would overrun as well
        grams.append(string[start:stop])
    return grams

# ADD STRING'S N-GRAM TO DISTRIBUTION
def add_grams(string,isDomain):
  """Add the 2-, 3- and 4-gram counts of `string` to the global n-gram tables.

  A truthy `isDomain` updates the DOMAIN_* tables; otherwise the
  ENGLISH_WORD_* tables are updated.
  """
  global DOMAIN_BIGRAM_DISTRIBUTION, DOMAIN_TRIGRAM_DISTRIBUTION, DOMAIN_QUADGRAM_DISTRIBUTION
  global ENGLISH_WORD_BIGRAM_DISTRIBUTION, ENGLISH_WORD_TRIGRAM_DISTRIBUTION, ENGLISH_WORD_QUADGRAM_DISTRIBUTION

  bigrams, trigrams, quadgrams = (Counter(n_gram(string, size)) for size in (2, 3, 4))

  if isDomain:
    DOMAIN_BIGRAM_DISTRIBUTION += bigrams
    DOMAIN_TRIGRAM_DISTRIBUTION += trigrams
    DOMAIN_QUADGRAM_DISTRIBUTION += quadgrams
  else:
    ENGLISH_WORD_BIGRAM_DISTRIBUTION += bigrams
    ENGLISH_WORD_TRIGRAM_DISTRIBUTION += trigrams
    ENGLISH_WORD_QUADGRAM_DISTRIBUTION += quadgrams

# LIST ALL SUB-STRING OF A STRING
def generate_sub_strings(string):
  """Return the substrings of `string` whose length is at least 3 and less than len(string).

  Substrings are ordered by length, then by start position. The full string
  itself is not included, and strings of three characters or fewer yield [].
  """
  total = len(string)
  return [
      string[start:start + size]
      for size in range(3, total)
      for start in range(total - size + 1)
  ]

# CHECK IF MEANING-FULL
def isMeaningfull(string):
  """Return whether the module-level enchant `dictionary` accepts `string` as a word."""
  is_word = dictionary.check(string)
  return is_word

# EXTRACT FEATURE FROM A DOMAIN
def extract_features(domain):
  """Apply every registered feature function to `domain`.

  Returns the results as a list, in the insertion order of FEATURES.
  """
  values = []
  for feature_fn in FEATURES.values():
    values.append(feature_fn(domain))
  return values

# ADD ADDITIONAL LABEL TO FEATURE'S DATAFRAME
def label_feature(features,domain_class='benign',agd_type='benign'):
  """Return a new list: [domain_class, agd_type] followed by the items of `features`.

  `features` must be a list; it is not modified.
  """
  labels = [domain_class, agd_type]
  return labels + features

# SAVE A DATA FRAME TO DIRECTORY
def save_dataframe(dataframe,name):
  """Write `dataframe` to '<name>.csv' (relative to the working directory) without the index."""
  csv_path = '.'.join((name, 'csv'))
  dataframe.to_csv(csv_path, index=False)

# GENERATE ALL FEATURES FOR THE DATASET
def generate_features(dataset):
    """Build the feature table for `dataset`.

    `dataset` must provide the columns 'domain', 'domain_class' and 'agd_type'.
    The result has one row per input row: the two label columns followed by
    one column per entry of FEATURES, in FEATURES order.
    """
    header = ['domain_class', 'agd_type'] + list(FEATURES.keys())
    rows = [
        label_feature(extract_features(row['domain']), row['domain_class'], row['agd_type'])
        for _, row in dataset.iterrows()
    ]
    return pd.DataFrame(rows, columns=header)


# DIVIDE A VALUE BY LENGTH OF DOMAIN
def divide_by_domain_len(value,domain):
  """Return `value` divided by len(domain); raises ZeroDivisionError for an empty domain."""
  length = len(domain)
  return value / length
#
def relative_entropy(domain,distribution):
  """Score the character distribution of `domain` against a reference table.

  With p(c) the share of character c in `domain` and
  q(c) = distribution[c] / distribution['total'], the result is
  sum over c of -p(c) * log2(p(c) / q(c)).

  Args:
    domain: string to score.
    distribution: Counter of reference character counts with a 'total' entry.

  Returns:
    The score, or 0 when it cannot be computed (for example when a character
    of `domain` has a zero count in `distribution`); in that case the
    offending counts are printed first.
  """
  # Imported here so the function does not depend on a later cell having run.
  from math import log2

  x = symbol_probability(characters_in_domain(domain))
  try:
    return sum(
        -p * log2(p / (distribution[char] / distribution['total']))
        for char, p in x.items()
    )
  # Only arithmetic/lookup failures are treated as "no score"; a bare except
  # would also swallow KeyboardInterrupt during long feature-generation runs.
  except (ZeroDivisionError, ValueError, KeyError):
    print([(i,distribution[i],distribution['total'])  for i,y in x.items() ])
    return 0

#
def symbol_probability(freq):
  """Turn a symbol -> count mapping into a symbol -> relative-frequency dict.

  Each count is divided by the sum of all counts; an empty mapping yields {}.
  """
  total_characters = sum(freq.values())
  probabilities = {}
  for symbol, count in freq.items():
    probabilities[symbol] = count / total_characters
  return probabilities

In [None]:
# FILE 4 : FEATURES

from math import log2

def domain_len(domain):
  """Return the number of characters in `domain`, summed from its cached character counts."""
  return sum(count for count in characters_in_domain(domain).values())

def vowel_count_to_domain_length(domain):
  """Return the share of characters of `domain` that appear in VOWELS (lower-case a, e, i, o, u)."""
  char_counts = characters_in_domain(domain)
  vowel_total = sum(count for char, count in char_counts.items() if char in VOWELS)
  return vowel_total / len(domain)

def digit_count_to_domain_length(domain):
  """Return the share of characters of `domain` for which str.isdigit() is true."""
  char_counts = characters_in_domain(domain)
  digit_total = sum(count for char, count in char_counts.items() if char.isdigit())
  return digit_total / len(domain)

def repeat_character_count_to_domain_length(domain):
  """Return the share of characters of `domain` that belong to a symbol occurring more than once."""
  char_counts = characters_in_domain(domain)
  repeated_total = sum(count for count in char_counts.values() if count > 1)
  return repeated_total / len(domain)

def shanon_entropy(domain):
  """Return sum(-p * log2(p)) over the character probabilities p of `domain`, in bits."""
  probabilities = symbol_probability(characters_in_domain(domain))
  return sum(-p * log2(p) for p in probabilities.values())


def number_of_meaningfull_substring_to_domain_length(domain):
  """Return (number of dictionary-accepted substrings of `domain`) / len(domain).

  Candidates come from generate_sub_strings(); the accepted ones are memoised
  per domain in DOMAIN_MEANINGFULL_STRING.
  """
  words = DOMAIN_MEANINGFULL_STRING.get(domain)
  if words is None:
    words = list(filter(isMeaningfull, generate_sub_strings(domain)))
    DOMAIN_MEANINGFULL_STRING[domain] = words
  return len(words) / len(domain)

def longest_meaningfull_substring_to_domain_length(domain):
  """Return (length of the longest dictionary-accepted substring) / len(domain).

  Candidates come from generate_sub_strings(); the accepted ones are memoised
  per domain in DOMAIN_MEANINGFULL_STRING. A domain with no accepted
  substring scores 0.

  The previous implementation returned the raw length, which contradicted the
  function name and the sibling number_of_meaningfull_substring_to_domain_length.
  NOTE(review): feature tables and models built with the old definition must
  be regenerated before they are used together with this one.
  """
  if domain not in DOMAIN_MEANINGFULL_STRING:
    DOMAIN_MEANINGFULL_STRING[domain] = [ x for x in generate_sub_strings(domain) if isMeaningfull(x)]
  longest = max(map(len, DOMAIN_MEANINGFULL_STRING[domain]), default=0)
  return longest / len(domain)

def entropy_relative_to_en_words(domain):
  """Return relative_entropy() of `domain` against CHARACTER_DISTRIBUTION_ENGLISH_WORDS."""
  reference = CHARACTER_DISTRIBUTION_ENGLISH_WORDS
  return relative_entropy(domain, reference)

def entropy_relative_to_en_domain(domain):
  """Return relative_entropy() of `domain` against CHARACTER_DISTRIBUTION_ENGLISH_DOMAINS."""
  reference = CHARACTER_DISTRIBUTION_ENGLISH_DOMAINS
  return relative_entropy(domain, reference)

def bi_gram_score_relative_to_en_domains(domain):
  """Return the length-normalised 2-gram score of `domain` against the domain table.

  Each 2-gram's count in `domain` is multiplied by its count in
  DOMAIN_BIGRAM_DISTRIBUTION; the products are summed and divided by len(domain).
  """
  gram_counts = Counter(n_gram(domain, 2))
  weighted = sum(count * DOMAIN_BIGRAM_DISTRIBUTION[gram] for gram, count in gram_counts.items())
  return divide_by_domain_len(weighted, domain)

def tri_gram_score_relative_to_en_domains(domain):
  """Return the length-normalised 3-gram score of `domain` against the domain table.

  Each 3-gram's count in `domain` is multiplied by its count in
  DOMAIN_TRIGRAM_DISTRIBUTION; the products are summed and divided by len(domain).
  """
  gram_counts = Counter(n_gram(domain, 3))
  weighted = sum(count * DOMAIN_TRIGRAM_DISTRIBUTION[gram] for gram, count in gram_counts.items())
  return divide_by_domain_len(weighted, domain)

def quad_gram_score_relative_to_en_domain(domain):
  """Return the length-normalised 4-gram score of `domain` against the domain table.

  Each 4-gram's count in `domain` is multiplied by its count in
  DOMAIN_QUADGRAM_DISTRIBUTION; the products are summed and divided by len(domain).
  """
  gram_counts = Counter(n_gram(domain, 4))
  weighted = sum(count * DOMAIN_QUADGRAM_DISTRIBUTION[gram] for gram, count in gram_counts.items())
  return divide_by_domain_len(weighted, domain)

def bi_gram_score_relative_to_en_words(domain):
  """Return the length-normalised 2-gram score of `domain` against the English-word table.

  Each 2-gram's count in `domain` is multiplied by its count in
  ENGLISH_WORD_BIGRAM_DISTRIBUTION; the products are summed and divided by len(domain).
  """
  gram_counts = Counter(n_gram(domain, 2))
  weighted = sum(count * ENGLISH_WORD_BIGRAM_DISTRIBUTION[gram] for gram, count in gram_counts.items())
  return divide_by_domain_len(weighted, domain)

def tri_gram_score_relative_to_en_words(domain):
  """Return the length-normalised 3-gram score of `domain` against the English-word table.

  Each 3-gram's count in `domain` is multiplied by its count in
  ENGLISH_WORD_TRIGRAM_DISTRIBUTION; the products are summed and divided by len(domain).
  """
  gram_counts = Counter(n_gram(domain, 3))
  weighted = sum(count * ENGLISH_WORD_TRIGRAM_DISTRIBUTION[gram] for gram, count in gram_counts.items())
  return divide_by_domain_len(weighted, domain)

def quad_gram_score_relative_to_en_words(domain):
  """Return the length-normalised 4-gram score of `domain` against the English-word table.

  Each 4-gram's count in `domain` is multiplied by its count in
  ENGLISH_WORD_QUADGRAM_DISTRIBUTION; the products are summed and divided by len(domain).
  """
  gram_counts = Counter(n_gram(domain, 4))
  weighted = sum(count * ENGLISH_WORD_QUADGRAM_DISTRIBUTION[gram] for gram, count in gram_counts.items())
  return divide_by_domain_len(weighted, domain)

def consecutive_digit_ratio(domain):
  """Return (number of adjacent character pairs that are identical) / len(domain).

  For example 'aabb' has two such pairs ('aa', 'bb') and scores 2 / 4.
  Raises ZeroDivisionError for an empty domain, like the other ratio features.

  The previous implementation counted the pairs of the literal string 'aaaa'
  instead of `domain`, so every domain scored 3 / len(domain).
  NOTE(review): feature tables and models built with the old definition must
  be regenerated. The name suggests digits, but the predicate (kept from the
  original) compares any two adjacent characters -- confirm the intent.
  """
  repeated_pairs = sum(1 for left, right in zip(domain, domain[1:]) if left == right)
  return repeated_pairs / len(domain)


In [None]:
# ADDING FEATURES
# Register the feature functions. Insertion order matters: extract_features()
# walks FEATURES in this order, and generate_features() names its columns from it.
FEATURES.update({
    # length and character-class ratios
    'domain_len': domain_len,
    'vowel_count_to_domain_length': vowel_count_to_domain_length,
    'digit_count_to_domain_length': digit_count_to_domain_length,
    'repeat_character_count_to_domain_length': repeat_character_count_to_domain_length,
    # entropy of the domain's own characters
    'shanon_entropy': shanon_entropy,
    # dictionary-based features
    'number_of_meaningfull_substring_to_domain_length': number_of_meaningfull_substring_to_domain_length,
    'longest_meaningfull_substring_to_domain_length': longest_meaningfull_substring_to_domain_length,
    'consecutive_digit_ratio': consecutive_digit_ratio,
    # entropy against the reference character tables
    'entropy_relative_to_en_words': entropy_relative_to_en_words,
    'entropy_relative_to_en_domain': entropy_relative_to_en_domain,
    # n-gram scores against the domain tables
    'bi_gram_score_relative_to_en_domains': bi_gram_score_relative_to_en_domains,
    'tri_gram_score_relative_to_en_domains': tri_gram_score_relative_to_en_domains,
    'quad_gram_score_relative_to_en_domain': quad_gram_score_relative_to_en_domain,
    # n-gram scores against the English-word tables
    'bi_gram_score_relative_to_en_words': bi_gram_score_relative_to_en_words,
    'tri_gram_score_relative_to_en_words': tri_gram_score_relative_to_en_words,
    'quad_gram_score_relative_to_en_words': quad_gram_score_relative_to_en_words,
})

In [None]:
# FILE 5: LOAD DATASETS
import os
import pandas as pd

def find_and_load_dataset():
  """Load every CSV found one level below DATASET_LOCATION into DATASETS.

  Each non-hidden sub-directory name is used as the key into DATASETS (it
  must already exist there). Each non-hidden file in it is read with
  pandas, truncated to its first 50000 rows, and stored under the part of
  the file name before the first '.'.
  """
  def is_hidden(file):
    # Dot-prefixed names are treated as hidden.
    return file[0] == '.'

  def load_dataset(directory,class_of):
    with os.scandir(directory) as listing:
      for entry in listing:
        if not entry.is_file() or is_hidden(entry.name):
          continue
        key = entry.name.split('.')[0]
        DATASETS[class_of][key] = pd.read_csv(os.path.join(directory, entry.name))[:50000]

  with os.scandir(DATASET_LOCATION) as listing:
    for entry in listing:
      if entry.is_dir() and not is_hidden(entry.name):
        load_dataset(os.path.join(DATASET_LOCATION, entry.name), entry.name)

def clean_data(x):
  """Reduce the raw domain lists of class `x` to labelled second-level labels.

  Args:
    x: key into DATASETS (for example 'BENIGN'); also written to the
      'domain_class' column of the result.

  Returns:
    DataFrame with columns ['domain', 'domain_class', 'agd_type'], holding the
    second-level label of every domain that was not parsed before, the class
    `x`, and the name of the dataset the domain came from.

  Side effects: renames the single column of each raw dataset to "domain",
  fills the PARSED_DOMAIN / CHARACTER_IN_DOMAIN caches, and prints every
  domain that could not be processed (for example one without a second level).
  """
  rows = []
  for dataset in DATASETS[x]:
    DATASETS[x][dataset].columns = ["domain"]
    for _, record in DATASETS[x][dataset].iterrows():
      domain = record['domain']
      if is_parsed(domain):
        continue
      try:
        get_parsed_domain(domain)
        second_level = get_second_level_domain(domain)
        characters_in_domain(second_level)
        rows.append([second_level, x, dataset])
      # Best-effort skip of malformed rows, but let KeyboardInterrupt and
      # SystemExit through (a bare except would swallow those as well).
      except Exception:
        print(domain)
  return pd.DataFrame(rows, columns=['domain','domain_class','agd_type'])

def clean_words(x):
  """Collect the words of class `x` that are at least four characters long.

  Args:
    x: key into DATASETS (for example 'EN_WORDS'); every dataset under it
      must have a 'word' column.

  Returns:
    DataFrame with a single 'word' column. Each kept word also gets its
    character counts cached through characters_in_domain().

  Non-string cells are skipped: pandas.read_csv turns blank cells and
  NA-like tokens such as "null" into NaN, on which len() would raise TypeError.
  """
  kept = []
  for dataset in DATASETS[x]:
    for word in DATASETS[x][dataset]['word']:
      if not isinstance(word, str):
        continue
      if len(word) >= 4:
        characters_in_domain(word)
        kept.append(word)

  return pd.DataFrame(kept, columns=['word'])

In [None]:
# Fill DATASETS from the CSV files found under DATASET_LOCATION.
find_and_load_dataset()

In [None]:
# Benign domains reduced to their second-level labels, and the English words
# of at least four characters; both calls also warm the character-count cache.
CLEANED_BENIGN_DATASET = clean_data('BENIGN')
CLEANED_WORD_DATASET = clean_words('EN_WORDS')

In [None]:
def pre_processing():
  """Fill the reference character and n-gram tables.

  Every cleaned benign domain feeds CHARACTER_DISTRIBUTION_ENGLISH_DOMAINS and
  the DOMAIN_* n-gram tables; every cleaned word feeds
  CHARACTER_DISTRIBUTION_ENGLISH_WORDS and the ENGLISH_WORD_* n-gram tables.
  """
  for domain in CLEANED_BENIGN_DATASET['domain']:
    add_to_distribution(domain, CHARACTER_DISTRIBUTION_ENGLISH_DOMAINS)
    add_grams(domain, True)

  for word in CLEANED_WORD_DATASET['word']:
    add_to_distribution(word, CHARACTER_DISTRIBUTION_ENGLISH_WORDS)
    add_grams(word, False)

In [None]:
# Build the reference character and n-gram tables from the cleaned benign domains and English words.
pre_processing()

In [None]:
# Copy the count of every non-alphabetic symbol seen in domains (digits, '-', ...)
# into the English-word table. The word table's own 'total' entry is not touched.
for symbol in CHARACTER_DISTRIBUTION_ENGLISH_DOMAINS:
  if symbol.isalpha():
    continue
  CHARACTER_DISTRIBUTION_ENGLISH_WORDS[symbol] = CHARACTER_DISTRIBUTION_ENGLISH_DOMAINS[symbol]

In [None]:
# Class probabilities for the label 'facebook': one row of 16 features, columns follow classifier.classes_.
classifier.predict_proba(np.asarray(extract_features('facebook')).reshape(1, 16))

array([[0.0024, 0.9976]])

In [None]:
# Class probabilities for the label 'sdasdfafg': one row of 16 features, columns follow classifier.classes_.
classifier.predict_proba(np.asarray(extract_features('sdasdfafg')).reshape(1, 16))

array([[0.9632, 0.0368]])

In [None]:
# Class probabilities for the label 'vitalpointz': one row of 16 features, columns follow classifier.classes_.
classifier.predict_proba(np.asarray(extract_features('vitalpointz')).reshape(1, 16))

array([[0.0048, 0.9952]])

In [None]:
# Class probabilities for the label 'happy923eguyfdgk4life': one row of 16 features, columns follow classifier.classes_.
classifier.predict_proba(np.asarray(extract_features('happy923eguyfdgk4life')).reshape(1, 16))

array([[0.726, 0.274]])

In [None]:
# Presumably a liveness check showing that the kernel still executes cells -- confirm.
print('Alive')

Alive
