# Libraries to Import

In [94]:
from urllib.parse import unquote
import base64
import csv
import time
import xml.etree.ElementTree as ET
import pandas as pd
import seaborn as sns

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

sns.set_style("darkgrid")

DEBUG = True
SEED = 343

In [95]:
import os

for d in ['orig','data','output']:
    if not os.path.isdir(d): os.makedirs(d, mode=0, exist_ok=True)

In [96]:
# Parse logs into readable format
class LogParse:
    def __init__(self):
        pass
    def parse_log(self, log_path):
        result = {}
        try:
            with open(log_path):
                pass
        except IOError:
            print("Error, file not found!")
            exit()
        try:
            tree = ET.parse(log_path)
        except Exception:
            print("Error")
            exit()
        root = tree.getroot()

        for reqs in root.findall('item'):
            raw_req = reqs.find('request').text
            raw_req = unquote(raw_req)
            raw_response = reqs.find('response').text
            result[raw_req] = raw_response
        return result

    def parseRawHTTPReq(self, rawreq):
        try:
            raw = rawreq.decode('utf-8')
        except Exception:
            raw = rawreq
        global headers, method, body, path
        headers = {}
        sep = raw.split('\r\n\r', 1)
        if sep[1] != "":
            head = sep[0]
            body = sep[1]
        else:
            head = sep[0]
            body = ""
        count1 = head.split('\n', head.count('\n'))
        method = count1[0].split(' ', 2)[0]
        path = count1[0].split(' ', 2)[1]
        for i in range(1, head.count('\n') + 1):
            slice1 = count1[i].split(': ', 1)
            if slice1[0] != "":
                try:
                    headers[slice1[0]] = slice1[1]
                except:
                    pass
        return headers, method, body, path

# Define words associated with malicious attacks
malwords = ['SLEEP', 'DROP', 'UNION', 'SELECT', 'WAITFOR', 'DELAY', 'ORDER BY', 'GROUP BY', 'DELETE','OR', 'AND', 'WHERE']

# Extract features from parsed logs
def ExtractFeatures(method, path, body, headers, class_flag):
    malwords_count = 0
    single_q = path.count("'") + body.count("'")
    double_q = path.count("\"") + body.count("\"")
    dashes = path.count("--") + body.count("--")
    braces = path.count("(") + body.count("(")
    slash = path.count("/") + body.count("/")
    dots = path.count(".") + body.count(".")
    asterik = path.count("*") + body.count("*")
    for word in malwords:
        malwords_count += path.count(word) + body.count(word)
    for word in headers:
        malwords_count += path.count(word) + body.count(word)
    for header in headers:
        malwords_count += headers[header].count(word) + headers[header].count(word)

    return [method, path.strip(), body.strip(), single_q, double_q, dashes, braces, slash,dots, asterik, malwords_count, class_flag]

In [97]:
# Function to 
def write_to_file(filename, log_path, class_flag):
    f = open(filename, 'w')
    c = csv.writer(f)
    c.writerow(["Method", "Path", "Body", "Single_Q", "Double_Q", "Dashes", "Braces", "Slashes", "Dots", "Asterik",  "Bad_Words", 'target'])
    f.close()
    lp = LogParse()
    result = lp.parse_log(log_path)
    f = open(filename, "a")
    c = csv.writer(f, lineterminator='\n')
    if result is not None:
        for items in result:
            raaw = base64.b64decode(items)
            headers, method, body, path = lp.parseRawHTTPReq(raaw)
            result = ExtractFeatures(method, path, body, headers, class_flag)
            c.writerow(result)
        f.close()
    else:
        print("File cannot be converted!")

In [98]:
# Parsing and converting both good and bad log files
timestr = time.strftime("%Y%m%d-%H%M%S")
extension = ".csv"

# Naming the csv files based on the current time
good_csv_filename = 'good-' + timestr + extension
bad_csv_filename = 'bad-' + timestr + extension

# Log files to be used
good_logs = 'good_logs'
bad_logs = 'burp_bad_traffic_new'

# Class labels to add to the log files
good_class_flag = 0
bad_class_flag = 1

# Write parsed logs to CSV files
write_to_file(good_csv_filename, good_logs, good_class_flag)
write_to_file(bad_csv_filename, bad_logs, bad_class_flag)

In [99]:
# Read and convert csv files to dataframes
good_data = pd.read_csv(good_csv_filename)
bad_data = pd.read_csv(bad_csv_filename)

# Merging both datasets into one
data = pd.concat([good_data, bad_data], axis=0)
print(data.shape)
data

(3093, 12)


Unnamed: 0,Method,Path,Body,Single_Q,Double_Q,Dashes,Braces,Slashes,Dots,Asterik,Bad_Words,target
0,GET,/6195b252_3973263ec9bb3e4b938c5c86e3e974cdd8e2...,,0,0,0,0,6,2,0,0,0
1,GET,/v1/jwplayer6/ping.gif?h=2100168025&e=t&n=8063...,,0,0,0,0,3,9,0,0,0
2,GET,/6195b252_3973263ec9bb3e4b938c5c86e3e974cdd8e2...,,0,0,0,0,6,2,0,0,0
3,GET,/6195b252_3973263ec9bb3e4b938c5c86e3e974cdd8e2...,,0,0,0,0,6,2,0,0,0
4,GET,/v1/jwplayer6/ping.gif?h=-1588266826&e=ret&n=1...,,0,0,0,0,3,9,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1066,GET,/'%22%3e%3csvg/onload%3dfetch%60//qjd4gbcdu3ud...,,1,0,0,0,5,3,0,0,1
1067,GET,/swagger/javascript%3a/*%3c/script%3e%3cimg/on...,,2,0,0,4,20,5,2,0,1
1068,GET,/javascript%3a/*%3c/script%3e%3cimg/onerror%3d...,,2,0,0,4,20,6,2,0,1
1069,GET,/swagger/'%22%3e%3csvg/onload%3dfetch%60//4z4i...,,1,0,0,0,5,2,0,0,1


In [100]:
# Save data to original data folder for later use
data.to_csv("orig/data.csv", index=False)

In [101]:
# Split of good/bad data
data.target.value_counts(normalize=0)

0    2022
1    1071
Name: target, dtype: int64

In [102]:
# Defining the features and target
target = 'target'
features = data.columns.to_list()
features.remove(target)

## Data Cleaning

In [103]:
data.dtypes

Method       object
Path         object
Body         object
Single_Q      int64
Double_Q      int64
Dashes        int64
Braces        int64
Slashes       int64
Dots          int64
Asterik       int64
Bad_Words     int64
target        int64
dtype: object

In [104]:
# Displaying the missing values
print(f"Shape {data.shape}")

for c in data.columns:
    if data[c].isna().sum() > 0:
        print(f"Missing Values for column - {c}:", data[c].isna().sum())

Shape (3093, 12)
Missing Values for column - Body: 3028


In [105]:
# Missing data is converted to an empty string
data['Body'] = data.Body.fillna("")
print(f'Missing values {data.isna().sum().sum()}')

Missing values 0


In [106]:
# Dropping duplicate rows
data = data.drop_duplicates(keep='first')
data.shape

(2569, 12)

In [107]:
# Dropping conflicting cases (cases with identical features but differing class labels)
data = data.drop_duplicates(subset=features, keep=False)
data.shape

(2489, 12)

In [108]:
# Searching for and dropping columns with only one unique value.
for c in data.columns:
    if data[c].value_counts().count() == 1:
        print(f"Dropped column - {c} as it only has one unique value")
        data.drop(columns=c, inplace=True)
data.shape

(2489, 12)

In [109]:
data.target.value_counts()

0    1934
1     555
Name: target, dtype: int64

# Save cleaned data to file

In [110]:
data.to_csv("data/cleaned_data.csv", index=False)