In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import itertools

import env

In [2]:
# Build the connection URL from the credentials kept in the local `env`
# module, then pull every row of the `api_access` table in the `logs` database.
url = 'mysql+pymysql://{}:{}@{}/logs'.format(env.user, env.password, env.host)
df = pd.read_sql(sql='SELECT * FROM api_access', con=url)


# function to deal with parsing one entry in our log data
def parse_log_entry(entry):
    """Split one raw access-log line into its named fields.

    The line is split on whitespace and fields are picked by position, so
    a line with fewer tokens than expected raises ``IndexError`` and a
    non-numeric size token raises ``ValueError``.

    Parameters
    ----------
    entry : str
        A single log line.

    Returns
    -------
    pd.Series
        Indexed by ``ip``, ``timestamp``, ``request_method``,
        ``request_path``, ``http_version``, ``status_code``, ``size`` and
        ``user_agent``. ``size`` is an ``int``; every other value is a string.
    """
    tokens = entry.split()

    # Token 3 carries one leading character (e.g. "[") that is dropped; only
    # the first ":" becomes a space so the date and the time are separated.
    when = tokens[3][1:].replace(':', ' ', 1)

    # Everything from token 11 onward is the user agent; it may itself
    # contain spaces, so re-join the pieces and strip the double quotes.
    agent = ' '.join(tokens[11:]).replace('"', '')

    return pd.Series({
        'ip': tokens[0],
        'timestamp': when,
        'request_method': tokens[5][1:],   # drop the leading character
        'request_path': tokens[6],
        'http_version': tokens[7][:-1],    # drop the trailing character
        'status_code': tokens[8],          # kept as a string
        'size': int(tokens[9]),
        'user_agent': agent,
    })

# Parse every raw entry into its own row of named fields. Only the parsed
# columns are kept; the raw `entry` text is not carried along.
df = df['entry'].apply(parse_log_entry)
df.head()

Unnamed: 0,ip,timestamp,request_method,request_path,http_version,status_code,size,user_agent
0,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0
1,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0
2,97.105.19.58,16/Apr/2019 19:34:44,GET,/api/v1/sales?page=82,HTTP/1.1,200,510103,python-requests/2.21.0
3,97.105.19.58,16/Apr/2019 19:34:46,GET,/api/v1/sales?page=83,HTTP/1.1,200,510003,python-requests/2.21.0
4,97.105.19.58,16/Apr/2019 19:34:48,GET,/api/v1/sales?page=84,HTTP/1.1,200,511963,python-requests/2.21.0


In [3]:
# Preview the first five parsed rows.
df.head(n=5)

Unnamed: 0,ip,timestamp,request_method,request_path,http_version,status_code,size,user_agent
0,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0
1,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0
2,97.105.19.58,16/Apr/2019 19:34:44,GET,/api/v1/sales?page=82,HTTP/1.1,200,510103,python-requests/2.21.0
3,97.105.19.58,16/Apr/2019 19:34:46,GET,/api/v1/sales?page=83,HTTP/1.1,200,510003,python-requests/2.21.0
4,97.105.19.58,16/Apr/2019 19:34:48,GET,/api/v1/sales?page=84,HTTP/1.1,200,511963,python-requests/2.21.0


Use basic probability to identify anomalous requests. Using the methods covered in this lesson, examine the rest of the features in the api access logs data set.

In [4]:
# Synthetic data: four hand-written requests added to the real log so the
# later probability tables contain some unusual values (IPs that occur once,
# 301 status codes, very large response sizes).
new = pd.DataFrame([
    ["95.31.18.119", "21/Apr/2019 10:02:41", "GET", "/api/v1/items/", "HTTP/1.1", '200', 1153005, "python-requests/2.21.0"],
    ["95.31.16.121", "17/Apr/2019 19:36:41", "GET", "/api/v1/sales?page=79/", "HTTP/1.1", '301', 1005, "python-requests/2.21.0"],
    ["97.105.15.120", "18/Apr/2019 19:42:41", "GET", "/api/v1/sales?page=79/", "HTTP/1.1", '301', 2560, "python-requests/2.21.0"],
    ["97.105.19.58", "19/Apr/2019 19:42:41", "GET", "/api/v1/sales?page=79/", "HTTP/1.1", '200', 2056327, "python-requests/2.21.0"],
], columns=df.columns)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps each frame's original index labels.
# NOTE: re-running this cell adds the synthetic rows again.
df = pd.concat([df, new])
df.head()

Unnamed: 0,ip,timestamp,request_method,request_path,http_version,status_code,size,user_agent
0,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/sales?page=81,HTTP/1.1,200,512495,python-requests/2.21.0
1,97.105.19.58,16/Apr/2019 19:34:42,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0
2,97.105.19.58,16/Apr/2019 19:34:44,GET,/api/v1/sales?page=82,HTTP/1.1,200,510103,python-requests/2.21.0
3,97.105.19.58,16/Apr/2019 19:34:46,GET,/api/v1/sales?page=83,HTTP/1.1,200,510003,python-requests/2.21.0
4,97.105.19.58,16/Apr/2019 19:34:48,GET,/api/v1/sales?page=84,HTTP/1.1,200,511963,python-requests/2.21.0


In [5]:
# Add the response size in megabytes (bytes / 1024 / 1024), parse the
# timestamp strings into datetimes, and make timestamp the index.
df = (
    df.assign(
        size_mb=df['size'] / 1024 / 1024,
        timestamp=pd.to_datetime(df['timestamp']),
    )
    .set_index('timestamp')
)

In [6]:
# To detect anomalies, start by counting how many requests each IP made.
df['ip'].value_counts()

97.105.19.58      11999
173.173.113.51     1059
72.181.113.170      613
72.181.105.81       246
68.201.219.223       21
24.26.242.9          21
35.175.171.137        2
52.87.230.102         2
70.121.214.34         2
95.31.16.121          1
45.23.250.16          1
3.88.129.158          1
34.207.64.242         1
54.172.14.223         1
52.90.165.200         1
97.105.15.120         1
52.91.30.150          1
54.145.52.184         1
35.174.209.2          1
95.31.18.119          1
3.92.201.136          1
34.229.70.250         1
Name: ip, dtype: int64

In [7]:
# Next, express the same counts as each IP's share of all requests
# (a proportion between 0 and 1).
df['ip'].value_counts(normalize=True)

97.105.19.58      0.858420
173.173.113.51    0.075762
72.181.113.170    0.043855
72.181.105.81     0.017599
68.201.219.223    0.001502
24.26.242.9       0.001502
35.175.171.137    0.000143
52.87.230.102     0.000143
70.121.214.34     0.000143
95.31.16.121      0.000072
45.23.250.16      0.000072
3.88.129.158      0.000072
34.207.64.242     0.000072
54.172.14.223     0.000072
52.90.165.200     0.000072
97.105.15.120     0.000072
52.91.30.150      0.000072
54.145.52.184     0.000072
35.174.209.2      0.000072
95.31.18.119      0.000072
3.92.201.136      0.000072
34.229.70.250     0.000072
Name: ip, dtype: float64

In [8]:
def value_counts_and_frequencies(s: pd.Series, dropna: bool = True) -> pd.DataFrame:
    """Tabulate how often each distinct value of ``s`` occurs.

    Parameters
    ----------
    s : pd.Series
        The values to tabulate.
    dropna : bool, default True
        Passed to ``Series.value_counts``. When True, missing values are
        left out of both the counts and the denominator of ``proba``; when
        False, they get their own row. (This argument used to be ignored
        and missing values were always included.)

    Returns
    -------
    pd.DataFrame
        Indexed by the distinct values of ``s``, most frequent first, with
        a ``count`` column (number of occurrences) and a ``proba`` column
        (``count`` divided by the total count).
    """
    # Count once and derive the proportions from the same Series, so the two
    # columns always share an identical index and ordering.
    counts = s.value_counts(dropna=dropna)
    return pd.DataFrame({
        'count': counts,
        'proba': counts / counts.sum(),
    })

In [9]:
# Count and share-of-total for every IP address, in one table.
ip_df = value_counts_and_frequencies(df['ip'])
ip_df.head(5)

Unnamed: 0,count,proba
97.105.19.58,11999,0.85842
173.173.113.51,1059,0.075762
72.181.113.170,613,0.043855
72.181.105.81,246,0.017599
68.201.219.223,21,0.001502


In [10]:
# Conditional probabilities: within each IP, the share of its requests that
# returned each status code, i.e. P(status_code | ip).
status_given_ip = (
    df.groupby('ip')['status_code']
    .value_counts(normalize=True)
    .reset_index(name='proba_status_given_ip')
)
status_given_ip.head()

Unnamed: 0,ip,status_code,proba_status_given_ip
0,173.173.113.51,200,1.0
1,24.26.242.9,200,1.0
2,3.88.129.158,200,1.0
3,3.92.201.136,200,1.0
4,34.207.64.242,200,1.0


In [11]:
# Look for anomalies: keep the (ip, status_code) rows whose probability is
# below 1, i.e. IPs that received more than one distinct status code.
status_given_ip.loc[status_given_ip['proba_status_given_ip'].lt(1)]

Unnamed: 0,ip,status_code,proba_status_given_ip
17,72.181.113.170,200,0.996737
18,72.181.113.170,499,0.003263
22,97.105.19.58,200,0.998833
23,97.105.19.58,499,0.001167


In [12]:
# Every request made by one of the IPs flagged above, ordered by status code.
df.loc[df['ip'].eq('72.181.113.170')].sort_values('status_code')

Unnamed: 0_level_0,ip,request_method,request_path,http_version,status_code,size,user_agent,size_mb
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-17 02:12:07,72.181.113.170,GET,/api/v1/items,HTTP/1.1,200,3561,python-requests/2.21.0,0.003396
2019-04-17 03:41:59,72.181.113.170,GET,/api/v1/sales?page=112,HTTP/1.1,200,510142,python-requests/2.21.0,0.486509
2019-04-17 03:42:01,72.181.113.170,GET,/api/v1/sales?page=113,HTTP/1.1,200,511094,python-requests/2.21.0,0.487417
2019-04-17 03:42:03,72.181.113.170,GET,/api/v1/sales?page=114,HTTP/1.1,200,511291,python-requests/2.21.0,0.487605
2019-04-17 03:42:05,72.181.113.170,GET,/api/v1/sales?page=115,HTTP/1.1,200,510140,python-requests/2.21.0,0.486507
...,...,...,...,...,...,...,...,...
2019-04-17 03:28:08,72.181.113.170,GET,/api/v1/sales?page=101,HTTP/1.1,200,510509,python-requests/2.21.0,0.486859
2019-04-17 03:28:11,72.181.113.170,GET,/api/v1/sales?page=102,HTTP/1.1,200,512612,python-requests/2.21.0,0.488865
2019-04-17 03:52:08,72.181.113.170,GET,/api/v1/sales?page=183,HTTP/1.1,200,308243,python-requests/2.21.0,0.293963
2019-04-17 03:42:44,72.181.113.170,GET,/api/v1/sales?page=133,HTTP/1.1,499,0,python-requests/2.21.0,0.000000


Cases where the probability is < 100%
Status codes other than 200

In [13]:
# Keep only the rows whose status code is something other than '200'
# (status codes are stored as strings).
status_given_ip.loc[status_given_ip['status_code'].ne('200')]

Unnamed: 0,ip,status_code,proba_status_given_ip
18,72.181.113.170,499,0.003263
19,95.31.16.121,301,1.0
21,97.105.15.120,301,1.0
23,97.105.19.58,499,0.001167


In [14]:
# Number of requests per status code across the whole log.
df['status_code'].value_counts()

200    13960
499       16
301        2
Name: status_code, dtype: int64

In [15]:
# List the columns available for the feature-by-feature checks that follow.
df.columns

Index(['ip', 'request_method', 'request_path', 'http_version', 'status_code',
       'size', 'user_agent', 'size_mb'],
      dtype='object')

### some features to find probability
'request_method', 'request_path', 
'request_method','http_version'
'request_method','status_code'
'request_method','user_agent'
'request_path', 'http_version'
'http_version', 'status_code'
'http_version', 'user_agent'
'request_path', 'user_agent'
'request_path', 'status_code'
'status_code','user_agent'
'user_agent','ip'

('request_method', 'request_path', 'http_version', 'status_code','user_agent','ip')

In [16]:
# Count and share-of-total for every request method.
request_method_df = value_counts_and_frequencies(df['request_method'])
request_method_df.head(5)

Unnamed: 0,count,proba
GET,13978,1.0


In [17]:
# so all request methods are the same so let's move on to the next feature

In [22]:
# Count and share-of-total for every request path.
request_path_df = value_counts_and_frequencies(df['request_path'])
request_path_df.head(5)

Unnamed: 0,count,proba
/api/v1/sales?page=2,709,0.050723
/api/v1/items,464,0.033195
/api/v1/items?page=2,291,0.020818
/api/v1/items?page=3,219,0.015667
/api/v1/stores,162,0.01159


In [None]:
# head() shows only the five most frequent request paths (there are many more distinct paths); /api/v1/sales?page=2 has the most occurrences

In [23]:
# Conditional probabilities: within each request path, the share of requests
# that returned each status code, i.e. P(status_code | request_path).
status_given_request_path = (
    df.groupby('request_path')['status_code']
    .value_counts(normalize=True)
    .reset_index(name='proba_status_given_request_path')
)
status_given_request_path.head()

Unnamed: 0,request_path,status_code,proba_status_given_request_path
0,/,200,1.0
1,/api/V1/HiZach!,200,1.0
2,/api/v1,200,1.0
3,/api/v1/,200,1.0
4,/api/v1//api/v1/items,200,1.0


In [29]:
# Look for anomalies: keep every row whose probability is below 1, i.e.
# request paths that returned more than one distinct status code.
status_given_request_path.loc[
    status_given_request_path['proba_status_given_request_path'].lt(1)
]

Unnamed: 0,request_path,status_code,proba_status_given_request_path
9,/api/v1/items,200,0.991379
10,/api/v1/items,499,0.008621
43,/api/v1/sales?page=115,200,0.983607
44,/api/v1/sales?page=115,499,0.016393
64,/api/v1/sales?page=133,200,0.983333
65,/api/v1/sales?page=133,499,0.016667
122,/api/v1/sales?page=2,200,0.995769
123,/api/v1/sales?page=2,499,0.004231
125,/api/v1/sales?page=21,200,0.986301
126,/api/v1/sales?page=21,499,0.013699


In [30]:
# Drop every row whose status code is '499', treating all of its
# occurrences as anomalies, and preview what remains.
status_given_request_path.loc[
    status_given_request_path['status_code'].ne('499')
].head()

Unnamed: 0,request_path,status_code,proba_status_given_request_path
0,/,200,1.0
1,/api/V1/HiZach!,200,1.0
2,/api/v1,200,1.0
3,/api/v1/,200,1.0
4,/api/v1//api/v1/items,200,1.0


In [20]:
# Count and share-of-total for every HTTP version.
http_version_df = value_counts_and_frequencies(df['http_version'])
http_version_df.head(5)

Unnamed: 0,count,proba
HTTP/1.1,13978,1.0


In [None]:
# so all versions are the same also

In [31]:
# Count and share-of-total for every user agent.
user_agent_df = value_counts_and_frequencies(df['user_agent'])
user_agent_df.head(5)

Unnamed: 0,count,proba
python-requests/2.21.0,12005,0.85885
python-requests/2.20.1,1911,0.136715
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",34,0.002432
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0,8,0.000572
Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots),7,0.000501


In [None]:
# head() shows only the five most common user agents (there are more than five in total); python-requests/2.21.0 accounts for about 86% of requests

In [None]:
# Count and share-of-total for every status code.
status_code_df = value_counts_and_frequencies(df['status_code'])
status_code_df.head(5)

In [33]:
# Conditional probabilities: within each user agent, the share of requests
# that returned each status code, i.e. P(status_code | user_agent).
status_given_user_agent = (
    df.groupby('user_agent')['status_code']
    .value_counts(normalize=True)
    .reset_index(name='proba_status_given_user_agent')
)
status_given_user_agent.head()

Unnamed: 0,user_agent,status_code,proba_status_given_user_agent
0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; ...,200,1.0
1,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3...,200,1.0
2,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3...,200,1.0
3,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4...,200,1.0
4,Python-urllib/3.7,200,1.0


In [34]:
# Look for anomalies: keep every row whose probability is below 1, i.e.
# user agents that received more than one distinct status code.
status_given_user_agent.loc[
    status_given_user_agent['proba_status_given_user_agent'].lt(1)
]

Unnamed: 0,user_agent,status_code,proba_status_given_user_agent
7,python-requests/2.20.1,200,0.997907
8,python-requests/2.20.1,499,0.002093
9,python-requests/2.21.0,200,0.998834
10,python-requests/2.21.0,499,0.001
11,python-requests/2.21.0,301,0.000167


In [35]:
# From the table above, any entry with a status code other than 200 is
# treated as an anomaly (here: 499 and 301). Keep only the rows whose status
# code is '200', which removes those anomalies.
status_given_user_agent.loc[status_given_user_agent['status_code'].eq('200')]

Unnamed: 0,user_agent,status_code,proba_status_given_user_agent
0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; ...,200,1.0
1,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3...,200,1.0
2,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3...,200,1.0
3,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4...,200,1.0
4,Python-urllib/3.7,200,1.0
5,Slackbot 1.0 (+https://api.slack.com/robots),200,1.0
6,Slackbot-LinkExpanding 1.0 (+https://api.slack...,200,1.0
7,python-requests/2.20.1,200,0.997907
9,python-requests/2.21.0,200,0.998834
