<a href="https://colab.research.google.com/github/Venckus/toai_firewall_10mk4/blob/master/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# %%time
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd

In [2]:
# fastai imports
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# fastai imports (not needed here?)
# from fastai.imports import *
# from fastai.tabular import *

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [4]:
# google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# install formatter
pip install nb_black

# Read all files in `data` directory

In [5]:
!pwd

/content


In [6]:
!ls -la 'drive/My Drive/Colab Notebooks/AIacademy/class-C/project/'

total 41
drwx------ 2 root root  4096 Oct  4 18:31 data
-rw------- 1 root root     8 Oct  4 18:31 .gitignore
-rw------- 1 root root 35052 Oct 11 13:59 project.ipynb
-rw------- 1 root root   784 Oct  4 18:31 read_data.py
-rw------- 1 root root   128 Oct  4 18:31 readme.md


In [7]:
# read all data files
mypath = 'drive/My Drive/Colab Notebooks/AIacademy/class-C/project/data'
data_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
print(data_files)

['access.log.7', 'access.log.2', 'access.log.12', 'access.log.9', 'access.log', 'access.log.14', 'access.log.3', 'access.log.8', 'access.log.6', 'access.log.11', 'access.log.10', 'access.log.13', 'access.log.4', 'access.log.1', 'access.log.5']


## Add all files data into one Pandas dataframe

In [8]:
%%time
li = []
for file in data_files:
    tmp = pd.read_csv(f'{mypath}/{file}', sep='\s+', header=None)
    li.append(tmp)
df = pd.concat(li, axis=0, ignore_index=True)

CPU times: user 70.4 ms, sys: 2.93 ms, total: 73.3 ms
Wall time: 3.32 s


# Explore and prepare data

In [None]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,45.146.164.186,-,-,[19/Sep/2020:06:38:27,+0200],POST /api/jsonws/invoke HTTP/1.1,404,209,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1,184.105.247.194,-,-,[19/Sep/2020:06:48:34,+0200],GET / HTTP/1.1,200,13,-,-
2,45.146.164.179,-,-,[19/Sep/2020:06:57:20,+0200],\x03\x00\x00/*\xE0\x00\x00\x00\x00\x00Cookie: ...,400,182,-,-
3,172.104.108.109,-,-,[19/Sep/2020:07:17:08,+0200],GET / HTTP/1.1,200,396,-,Mozilla/5.0
4,199.244.88.132,-,-,[19/Sep/2020:07:39:56,+0200],GET / HTTP/1.1,200,13,-,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)...


In [None]:
df.shape

(2900, 10)

### remove columns holding useless data '-' and rows with request = nan

In [None]:
df[1].describe()

In [9]:
rm_colls =[1,2,4,8]
df.drop(df.columns[rm_colls], axis=1, inplace=True)

Give names to columns

In [10]:
columns = {0:'IP',3:'date',5:'request',6:'code',7:'body_bytes',9:'browser'}
df.rename(columns=columns,inplace=True)

remove rows with `request` = nan

In [11]:
df = df[df['request'].notna()]

remove '[' from date_time column

In [12]:
df['date'] = df['date'].str[1:]

In [13]:
df.head()

Unnamed: 0,IP,date,request,code,body_bytes,browser
0,45.146.164.186,19/Sep/2020:06:38:27,POST /api/jsonws/invoke HTTP/1.1,404,209,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
1,184.105.247.194,19/Sep/2020:06:48:34,GET / HTTP/1.1,200,13,-
2,45.146.164.179,19/Sep/2020:06:57:20,\x03\x00\x00/*\xE0\x00\x00\x00\x00\x00Cookie: ...,400,182,-
3,172.104.108.109,19/Sep/2020:07:17:08,GET / HTTP/1.1,200,396,Mozilla/5.0
4,199.244.88.132,19/Sep/2020:07:39:56,GET / HTTP/1.1,200,13,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3)...


# (2) Data labeling

The source of access logs server is where the Python Flask application runs.

## (2.1) Functions used to filter malicious requests using keywords 

In [14]:
def find_bad(tmp_df, phrase):
    'Find rows by keyword phrase using regex'
    new_df = tmp_df[tmp_df.requests.str.contains(phrase, regex=True, na=False)]
    return new_df

In [15]:
def build_phrase(list):
    tmp = ''
    for word in list:
        if tmp == '': tmp = f'{word}'
        else: tmp = f'{tmp}|{word}'
    return tmp

### (2.1.1) all bad requests

In [17]:
bad_requests_phrase = build_phrase([
    'php', 'wp', 'cgi', 'shell',
    'ls+', 'cd\+', 'rm\+', '\-rf', 'wget\+', 'cmod\+', 'chown\+',
    '\?index=','example.com', 'ZINFO', 'fig'
])
bad_requests_phrase

'php|wp|cgi|shell|ls+|cd\\+|rm\\+|\\-rf|wget\\+|cmod\\+|chown\\+|\\?index=|example.com|ZINFO|fig'

In [18]:
bad_requests = df[df.request.str.contains(bad_requests_phrase, regex=True, na=False)]

How many bad requests found in overall dataset?

In [19]:
bad_requests.shape

(1110, 6)

TODO - do we have overlaping results because of few keywords in one request? filter out dublicated rows

## (2.2) Tests on filtering bad requests

### (2.2.1) Find malitious requests accessing *.php files

In [None]:
# php_requests = df.loc[df.request.str.contains('^.php')]
# df.request.str.contains('^.php^')
bad_php = df[df.request.str.contains('php', regex=True, na=False)]

In [None]:
bad_php.shape

(1033, 6)

### (2.2.2) Find malitiuos requests trying to execute shell commands

In [None]:
bad_shell = df[df.request.str.contains('shell|cd\+|rm\+|\-rf|wget\+|chmod\+|chown\+|ls\+', regex=True, na=False)]

In [None]:
bad_shell.shape

(34, 6)

### (2.2.3) Find malitious requests of CGI

In [None]:
bad_cgi = df[df.request.str.contains('cgi', regex=True, na=False)]

In [None]:
bad_cgi.shape

(34, 6)

### (2.2.4) Other malitious

In [None]:
bad_other = df[df.request.str.contains('\?index=|example\.com|ZINFO|fig', regex=True, na=False)]

In [None]:
bad_other.shape

(69, 6)

## (2.3) Good requests

In [20]:
%%time
good_requests = bad_requests.merge(df, how='outer', indicator=True).loc[lambda x : x['_merge']=='right_only']

In [None]:
good_requests.shape

(1754, 7)

In [None]:
good_requests.tail()

Unnamed: 0,IP,date,request,code,body_bytes,browser,_merge
2867,74.120.14.50,22/Sep/2020:05:54:53,GET / HTTP/1.1,200,13,Mozilla/5.0 (compatible; CensysInspect/1.1; +h...,right_only
2868,91.193.5.58,22/Sep/2020:06:10:15,GET / HTTP/1.1,400,182,-,right_only
2869,148.72.132.87,22/Sep/2020:06:11:42,GET / HTTP/1.1,200,13,libwww-perl/6.46,right_only
2870,167.248.133.35,22/Sep/2020:06:22:54,GET / HTTP/1.1,200,612,-,right_only
2871,167.248.133.35,22/Sep/2020:06:22:54,GET / HTTP/1.1,200,396,Mozilla/5.0 (compatible; CensysInspect/1.1; +h...,right_only


# (3) Model

## (3.1) Splitting the date (training / validation)

In [21]:
yBad = [1 for i in range(0, len(bad_requests))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(good_requests))]

In [22]:
df['is_safe'] = df.request.isin(good_requests.request).astype(int) # should it be bool?

In [None]:
df.tail(10)
# df['is_safe'].dtypes

Unnamed: 0,IP,date,request,code,body_bytes,browser,is_safe
2890,45.168.58.218,22/Sep/2020:05:16:19,GET / HTTP/1.1,200,612,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,1
2891,45.146.164.186,22/Sep/2020:05:20:36,GET /?a=fetch&content=<php>die(@md5(HelloThink...,200,396,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,0
2892,45.146.164.186,22/Sep/2020:05:22:59,GET /index.php?s=/Index/\x5Cthink\x5Capp/invok...,404,209,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,0
2893,128.14.134.170,22/Sep/2020:05:47:36,GET /Telerik.Web.UI.WebResource.axd?type=rau H...,404,209,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,1
2894,74.120.14.50,22/Sep/2020:05:54:52,GET / HTTP/1.1,200,13,-,1
2895,74.120.14.50,22/Sep/2020:05:54:53,GET / HTTP/1.1,200,13,Mozilla/5.0 (compatible; CensysInspect/1.1; +h...,1
2896,91.193.5.58,22/Sep/2020:06:10:15,GET / HTTP/1.1,400,182,-,1
2897,148.72.132.87,22/Sep/2020:06:11:42,GET / HTTP/1.1,200,13,libwww-perl/6.46,1
2898,167.248.133.35,22/Sep/2020:06:22:54,GET / HTTP/1.1,200,612,-,1
2899,167.248.133.35,22/Sep/2020:06:22:54,GET / HTTP/1.1,200,396,Mozilla/5.0 (compatible; CensysInspect/1.1; +h...,1


In [23]:
queries = df['request'].to_list()

In [24]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf=True, ngram_range=(1,3)) #converting data to vectors
X = vectorizer.fit_transform(queries)

In [25]:
y = df['is_safe'].to_list()

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
lgs = LogisticRegression(class_weight={1: 2 * len(good_requests) / len(bad_requests), 0: 1.0})

In [28]:
%%time
lgs.fit(X_train, y_train)

CPU times: user 76.9 ms, sys: 2.39 ms, total: 79.3 ms
Wall time: 83.1 ms


LogisticRegression(C=1.0, class_weight={0: 1.0, 1: 3.1603603603603605},
                   dual=False, fit_intercept=True, intercept_scaling=1,
                   l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)

# Evaluation

In [29]:
predicted = lgs.predict(X_test)

In [30]:
fpr, tpr, _ = metrics.roc_curve(y_test, (lgs.predict_proba(X_test)[:, 1]))
auc = metrics.auc(fpr, tpr)

In [31]:
print(len(df.IP))
print(len(bad_requests))
print("Baseline Constant negative: %.6f" % (len(df.IP) / (len(df.IP) + len(bad_requests))))

2864
1110
Baseline Constant negative: 0.720684


In [32]:
print("Accuracy: %f" % lgs.score(X_test, y_test))  #checking the accuracy
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
print("AUC: %f" % auc)

Accuracy: 0.989529
Precision: 0.985836
Recall: 0.997135
F1-Score: 0.991453
AUC: 0.999859


In [38]:
predicted

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,