# Honeypot attacks exploration

## Loading data

In [1]:
# import libraries
import numpy as np
import pandas as pd
import glob
import os
import datetime
import IP2Location
import geoip2.database
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from scipy.stats import chisquare
%matplotlib inline

In [2]:
# データを読み込む　
# load data
cols = ['date', 'time', 'access_ip', 'host_ip', 'request_line', 'status_code', 'match_result', 'request_all']
df = pd.DataFrame()
for filename in glob.glob('data/*.txt'):
    df_new = pd.read_csv(filename, sep = ' ', header = None, names = cols, parse_dates = True)
    df = pd.concat([df, df_new])

## Assessing and cleaning data

In [3]:
# データのはじめの５行を表示する 
# show first 5 rows
df.head()

Unnamed: 0,date,time,access_ip,host_ip,request_line,status_code,match_result,request_all
0,[2020-03-01,00:01:37+0900],134.209.184.77,closedbeta.net:80,GET /blog/wp-login.php HTTP/1.1,200,1011,R0VUIC9ibG9nL3dwLWxvZ2luLnBocCBIVFRQLzEuMQpIb3...
1,[2020-03-01,00:01:38+0900],134.209.184.77,closedbeta.net:80,POST /blog/closedbeta.net/blog/wp-login.php HT...,200,1011,UE9TVCAvYmxvZy9jbG9zZWRiZXRhLm5ldC9ibG9nL3dwLW...
2,[2020-03-01,00:11:53+0900],193.106.30.99,closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogY2xvc2VkYmV0YS5uZX...
3,[2020-03-01,00:19:47+0900],165.22.222.119,closedbeta.net:80,POST /code.conflicts.php HTTP/1.1,200,1037,UE9TVCAvY29kZS5jb25mbGljdHMucGhwIEhUVFAvMS4xCk...
4,[2020-03-01,00:19:55+0900],189.240.124.61,closedbeta.net:80,POST /work.clases.php HTTP/1.1,200,1037,UE9TVCAvd29yay5jbGFzZXMucGhwIEhUVFAvMS4xCkFjY2...


In [4]:
# size of the dataset
df.shape

(563883, 8)

In [5]:
# basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 563883 entries, 0 to 199532
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   date          563883 non-null  object
 1   time          563883 non-null  object
 2   access_ip     563883 non-null  object
 3   host_ip       563883 non-null  object
 4   request_line  563883 non-null  object
 5   status_code   563883 non-null  int64 
 6   match_result  563883 non-null  object
 7   request_all   563883 non-null  object
dtypes: int64(1), object(7)
memory usage: 38.7+ MB


No missing values. 欠損値がない

In [6]:
# check duplicates　重複した値（あたい
df.duplicated().sum()

499

重複した値は４９９行あります

In [7]:
# duplicated records
df[df.duplicated()]

Unnamed: 0,date,time,access_ip,host_ip,request_line,status_code,match_result,request_all
962,[2020-03-02,12:50:29+0900],81.180.119.230,blank:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKCg==
1481,[2020-03-03,02:22:23+0900],178.128.23.112,closedbeta.net:80,GET /wp-admin/profile.php HTTP/1.1,200,False,R0VUIC93cC1hZG1pbi9wcm9maWxlLnBocCBIVFRQLzEuMQ...
1597,[2020-03-03,04:41:51+0900],100.11.48.113,blank:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKCg==
1599,[2020-03-03,04:41:52+0900],100.11.48.113,blank:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKCg==
1886,[2020-03-03,17:26:31+0900],142.93.87.106,closedbeta.net:80,GET / HTTP/1.0,200,False,R0VUIC8gSFRUUC8xLjAKSG9zdDogY2xvc2VkYmV0YS5uZX...
...,...,...,...,...,...,...,...,...
197143,[2020-06-29,01:44:35+0900],62.210.180.62,mta-sts.closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogbXRhLXN0cy5jbG9zZW...
197256,[2020-06-29,03:03:34+0900],62.210.172.100,backup.closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogYmFja3VwLmNsb3NlZG...
197718,[2020-06-29,07:13:32+0900],111.231.200.218,120.51.157.176:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKQWNjZXB0OiAqLyoKQWNjZXB0LU...
198037,[2020-06-29,11:31:18+0900],62.210.141.218,backup.closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogYmFja3VwLmNsb3NlZG...


In [8]:
# take a look at one of the duplicated records
df[(df.time == '12:50:29+0900') & (df.access_ip == '81.180.119.230')]

Unnamed: 0,date,time,access_ip,host_ip,request_line,status_code,match_result,request_all


In [9]:
# drop duplicates
df.drop_duplicates(inplace=True)

In [10]:
# double check duplicates
df.duplicated().sum()

0

In [11]:
# clean the date column
df['date'] = df.date.str.lstrip('[')
df.head()

Unnamed: 0,date,time,access_ip,host_ip,request_line,status_code,match_result,request_all
0,2020-03-01,00:01:37+0900],134.209.184.77,closedbeta.net:80,GET /blog/wp-login.php HTTP/1.1,200,1011,R0VUIC9ibG9nL3dwLWxvZ2luLnBocCBIVFRQLzEuMQpIb3...
1,2020-03-01,00:01:38+0900],134.209.184.77,closedbeta.net:80,POST /blog/closedbeta.net/blog/wp-login.php HT...,200,1011,UE9TVCAvYmxvZy9jbG9zZWRiZXRhLm5ldC9ibG9nL3dwLW...
2,2020-03-01,00:11:53+0900],193.106.30.99,closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogY2xvc2VkYmV0YS5uZX...
3,2020-03-01,00:19:47+0900],165.22.222.119,closedbeta.net:80,POST /code.conflicts.php HTTP/1.1,200,1037,UE9TVCAvY29kZS5jb25mbGljdHMucGhwIEhUVFAvMS4xCk...
4,2020-03-01,00:19:55+0900],189.240.124.61,closedbeta.net:80,POST /work.clases.php HTTP/1.1,200,1037,UE9TVCAvd29yay5jbGFzZXMucGhwIEhUVFAvMS4xCkFjY2...


In [12]:
# clean the time column
df['time'] = df.time.str.rstrip(']')
df.head()

Unnamed: 0,date,time,access_ip,host_ip,request_line,status_code,match_result,request_all
0,2020-03-01,00:01:37+0900,134.209.184.77,closedbeta.net:80,GET /blog/wp-login.php HTTP/1.1,200,1011,R0VUIC9ibG9nL3dwLWxvZ2luLnBocCBIVFRQLzEuMQpIb3...
1,2020-03-01,00:01:38+0900,134.209.184.77,closedbeta.net:80,POST /blog/closedbeta.net/blog/wp-login.php HT...,200,1011,UE9TVCAvYmxvZy9jbG9zZWRiZXRhLm5ldC9ibG9nL3dwLW...
2,2020-03-01,00:11:53+0900,193.106.30.99,closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogY2xvc2VkYmV0YS5uZX...
3,2020-03-01,00:19:47+0900,165.22.222.119,closedbeta.net:80,POST /code.conflicts.php HTTP/1.1,200,1037,UE9TVCAvY29kZS5jb25mbGljdHMucGhwIEhUVFAvMS4xCk...
4,2020-03-01,00:19:55+0900,189.240.124.61,closedbeta.net:80,POST /work.clases.php HTTP/1.1,200,1037,UE9TVCAvd29yay5jbGFzZXMucGhwIEhUVFAvMS4xCkFjY2...


## Feture engineering

### month, day, hour_JP

In [13]:
# extract month, day from date
df['month'] = df['date'].apply(lambda x: x[5:7]).astype(int)
month_dict = {1: 'Janurary', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August'}
df['month'] = df.month.map(month_dict)
df['day'] = df['date'].apply(lambda x: x[8:10]).astype(int)
df.head()

Unnamed: 0,date,time,access_ip,host_ip,request_line,status_code,match_result,request_all,month,day
0,2020-03-01,00:01:37+0900,134.209.184.77,closedbeta.net:80,GET /blog/wp-login.php HTTP/1.1,200,1011,R0VUIC9ibG9nL3dwLWxvZ2luLnBocCBIVFRQLzEuMQpIb3...,March,1
1,2020-03-01,00:01:38+0900,134.209.184.77,closedbeta.net:80,POST /blog/closedbeta.net/blog/wp-login.php HT...,200,1011,UE9TVCAvYmxvZy9jbG9zZWRiZXRhLm5ldC9ibG9nL3dwLW...,March,1
2,2020-03-01,00:11:53+0900,193.106.30.99,closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogY2xvc2VkYmV0YS5uZX...,March,1
3,2020-03-01,00:19:47+0900,165.22.222.119,closedbeta.net:80,POST /code.conflicts.php HTTP/1.1,200,1037,UE9TVCAvY29kZS5jb25mbGljdHMucGhwIEhUVFAvMS4xCk...,March,1
4,2020-03-01,00:19:55+0900,189.240.124.61,closedbeta.net:80,POST /work.clases.php HTTP/1.1,200,1037,UE9TVCAvd29yay5jbGFzZXMucGhwIEhUVFAvMS4xCkFjY2...,March,1


In [14]:
# extract hour from time (JP local time)
df['hour_JP'] = df['time'].apply(lambda x: x[0:2]).astype(int)
df.head()

Unnamed: 0,date,time,access_ip,host_ip,request_line,status_code,match_result,request_all,month,day,hour_JP
0,2020-03-01,00:01:37+0900,134.209.184.77,closedbeta.net:80,GET /blog/wp-login.php HTTP/1.1,200,1011,R0VUIC9ibG9nL3dwLWxvZ2luLnBocCBIVFRQLzEuMQpIb3...,March,1,0
1,2020-03-01,00:01:38+0900,134.209.184.77,closedbeta.net:80,POST /blog/closedbeta.net/blog/wp-login.php HT...,200,1011,UE9TVCAvYmxvZy9jbG9zZWRiZXRhLm5ldC9ibG9nL3dwLW...,March,1,0
2,2020-03-01,00:11:53+0900,193.106.30.99,closedbeta.net:80,GET / HTTP/1.1,200,False,R0VUIC8gSFRUUC8xLjEKSG9zdDogY2xvc2VkYmV0YS5uZX...,March,1,0
3,2020-03-01,00:19:47+0900,165.22.222.119,closedbeta.net:80,POST /code.conflicts.php HTTP/1.1,200,1037,UE9TVCAvY29kZS5jb25mbGljdHMucGhwIEhUVFAvMS4xCk...,March,1,0
4,2020-03-01,00:19:55+0900,189.240.124.61,closedbeta.net:80,POST /work.clases.php HTTP/1.1,200,1037,UE9TVCAvd29yay5jbGFzZXMucGhwIEhUVFAvMS4xCkFjY2...,March,1,0


### country, country_code, ip_country

In [16]:
# load the IP2Location database
database = IP2Location.IP2Location(os.path.join("data", "IP2LOCATION-LITE-DB11.IPV6.BIN"))

In [None]:
# map ip to country
df['country'] = df.access_ip.apply(lambda ip: database.get_all(ip).country_long)
df.head(1)

In [None]:
# map ip to country
df['country_code'] = df.access_ip.apply(lambda ip: database.get_all(ip).country_short)
df.head(1)

In [None]:
# combine ip with the country code
df['ip_country'] = df['access_ip'] + '-' + df['country_code']
df.head(1)

### region, city

In [None]:
# map ip to region
df['region'] = df.access_ip.apply(lambda ip: database.get_all(ip).region)
df.head(1)

In [None]:
# map ip to city
df['city'] = df.access_ip.apply(lambda ip: database.get_all(ip).city)
df.head(1)

### latitude, longitude

In [None]:
# get the latitude and longitude of the IP
df['latitude'] = df.access_ip.apply(lambda ip: database.get_all(ip).latitude)
df['longitude'] = df.access_ip.apply(lambda ip: database.get_all(ip).longitude)
df.head(1)

### timezone, hour_Local

In [None]:
# get the time zone of ip
df['timezone'] = df.access_ip.apply(lambda ip: database.get_all(ip).timezone)
df.head(1)

In [None]:
# value_count of timezone
df.timezone.value_counts()

In [None]:
# drop the record with a value '-' for timezone
df = df[df.timezone != '-']
df.timezone.value_counts()

In [None]:
# extract only the integer of the timezone
df['timezone'] = df.timezone.apply(lambda x: int(x.split(':')[0]))

In [None]:
# define a function to get the local time
def get_local_hour(hour_JP, timezone, timezone_JP=9):
    """
    this function get the local hour from hour in JP, timezone in JP and local timezone

    input arguments:

    hour_JP: the hour in JP
    timezone: the local timezone
    timezoneJP: the timezone in Japan, default, 9

    return:

    the hour in local time
    """
    local_hour = hour_JP - (timezone_JP - timezone)
    if local_hour < 0:
        local_hour += 24
    elif local_hour > 24:
        local_hour -= 24
    return local_hour
     

In [None]:
# test the get_local_hour function
get_local_hour(13, 8) == 12, get_local_hour(13, 0) == 4, get_local_hour(13, -5) == 23


In [None]:
# apply the get_local_hour function to the dataframe to get the local hour of the access
df['hour_Local'] = df.apply(lambda x: get_local_hour(x['hour_JP'], x['timezone']), axis=1)
df.head()

### method, http_version

In [None]:
# create a method column from request_line
df['method'] = df['request_line'].apply(lambda x: x.split(' ')[0])
df.head(1)

In [None]:
# get the HTTP version from request_line
df['http_version'] = df['request_line'].apply(lambda x: x.split(' ')[-1])
df.head(1)

## Exploratory Data Analysis

### date

In [None]:
# convert the data type of date from object to datetime
df['date'] = pd.to_datetime(df['date'])
df.info()

In [None]:
# value counts of date
df.date.value_counts()

In [None]:
# line plot of access trend over time
plt.figure(figsize=(18,6))
df.date.value_counts().plot()
plt.xlabel('Date')
plt.ylabel('Number of Access')
plt.title('Number of Attacks Over Time')
plt.savefig('plot/1.pdf');



### access_ip

In [None]:
# value counts
df.access_ip.value_counts()

In [None]:
# number of unique access ip
print('There are {} IPs in in total'.format(df.access_ip.nunique()))

### request_line

In [None]:
# value counts

df.request_line.value_counts()

### status_code

In [None]:
# value counts
df.status_code.value_counts()

### match_result

In [None]:
# value count
df.match_result.value_counts()

### month

In [None]:
# value count
df.month.value_counts()

In [None]:
# bar chart
plt.figure(figsize=(10,6))
df.month.value_counts().plot(kind='bar')
plt.xlabel('Month')
plt.ylabel('Number of Access')
plt.title('Number of Access by Month');

### day

In [None]:
# value count
df.day.value_counts()

In [None]:
# plot of Number of Access Over Days
df_day = df.day.value_counts().reset_index()
df_day.columns = ['day', 'count']
plt.figure(figsize=(10, 6))
sns.lineplot(x='day', y='count', data=df_day)
plt.xlabel('Day')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Days');

### hour_JP 

In [None]:
# value count
df.hour_JP.value_counts()

In [None]:
# plot of number of access over hour in japan time
df_hour_JP = df.hour_JP.value_counts().reset_index()
df_hour_JP.columns = ['hour_JP', 'count']
plt.figure(figsize=(10, 6))
sns.lineplot(x='hour_JP', y='count', data=df_hour_JP)
plt.xlabel('Hour in Japan Time')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Hour in Japan Time');

### hour_Local

In [None]:
# value count
df.hour_Local.value_counts()

In [None]:
# plot of number of access over hour in local time
df_hour_Local = df.hour_Local.value_counts().reset_index()
df_hour_Local.columns = ['hour_Local', 'count']
plt.figure(figsize=(10, 6))
sns.lineplot(x='hour_Local', y='count', data=df_hour_Local)
plt.xlabel('Hour in Local Time')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Hour in Local Time');

### country

In [None]:
# value counts
df.country.value_counts()

In [None]:
# unique values
df.country.unique()

In [None]:
# top 10 country
df_country = df.country.value_counts().reset_index()
df_country.columns = ['country', 'count']
top10_country = df_country.iloc[0:10, ]
top10_country.at[0, 'country'] = 'US'
top10_country.at[9, 'country'] = 'UK'
top10_country

In [None]:
# plot of top 10 number of access by country
plt.figure(figsize=(10,6))
plt.bar(x = top10_country['country'], height = top10_country['count'])
plt.xlabel('Country')
plt.ylabel('Number of Access')
plt.title('Top 10 Number of Access by Country');


### region

In [None]:
# value count

df.region.value_counts()

In [None]:
# number of unique regions

df.region.nunique()

In [None]:
# top 10 region
df_region = df.region.value_counts().reset_index()
df_region.columns = ['region', 'count']
top10_region = df_region.iloc[0:10, ]
top10_region

In [None]:
# plot of top 10 number of access by region
plt.figure(figsize=(10,6))
plt.bar(x = top10_region['region'], height = top10_region['count'])
plt.xlabel('Region')
plt.ylabel('Number of Access')
plt.title('Top 10 Number of Access by Region')
plt.xticks(rotation=15);

### city

In [None]:
# value counts
df.city.value_counts()

In [None]:
# number of unique cities
df.city.nunique()

In [None]:
# top 10 city
df_city = df.city.value_counts().reset_index()
df_city.columns = ['city', 'count']
top10_city = df_city.iloc[0:10, ]
top10_city

In [None]:
# plot of top 10 number of access by city
plt.figure(figsize=(10,6))
plt.bar(x = top10_city['city'], height = top10_city['count'])
plt.xlabel('City')
plt.ylabel('Number of Access')
plt.title('Top 10 Number of Access by City')
plt.xticks(rotation=15);

### world map - Tableau

### method

In [None]:
# value counts
df.method.value_counts()

In [None]:
# bar chart of number of requests by method
plt.figure(figsize=(10, 6))
df.method.value_counts().plot(kind='bar')
plt.xlabel('Method')
plt.ylabel('Number of Requests')
plt.title('Number of Requests by Method')
plt.xticks(rotation = 45);

### http version

In [None]:
# value counts
df['http_version'].value_counts()

In [None]:
# bar chart of proportion of attacks by http version
plt.figure(figsize=(6, 6))
(df['http_version'].value_counts() / df.shape[0]).plot(kind = 'bar')
plt.xlabel('HTTP Version')
plt.ylabel('Proportion')
plt.title('Proportion of Attacks by Http Version')
plt.xticks(rotation = 0);

## Time Related Variables Exploration

In this part, we explore time related variables, including date, month, day, hour_JP, hour_Local

### date

In [None]:
# line plot of access trend over time
plt.figure(figsize=(18,6))
df.date.value_counts().plot()
plt.xlabel('Date')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Time')
plt.savefig('plot/1.png');
# plt.setp(plt.gca().xaxis.get_majorticklabels(),
#          'rotation', 90)
# plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=10))

A peak appears at around 2020-06-21.

### month

In [None]:
# bar chart
plt.figure(figsize=(10,6))
(df.month.value_counts()/df.shape[0]).plot(kind='bar')
plt.xlabel('Month')
plt.ylabel('Proportion')
plt.title('Proportion of Access by Month')
plt.xticks(rotation=0)
plt.savefig('plot/2.png');

June has the most attacks, June/July and Aug have more attacks than the other months.

In [None]:
# one way chi square test

chisq, p = chisquare(df.month.value_counts())

print('The p value is {}\n'.format(p))
if p < 0.05:
    print('p is very small, there is a significant difference')
else:
    print('p is relatively large, there is no significant difference')

### day

In [None]:
# plot of Number of Access Over Days
df_day = df.day.value_counts().reset_index()
df_day.columns = ['day', 'count']
plt.figure(figsize=(18, 6))
sns.lineplot(x='day', y='count', data=df_day)
plt.xlabel('Day')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Days')
plt.xticks(np.arange(0, 32, 2))
plt.savefig('plot/3.png');

Overall, more attacks happen on 12 and 21.

### hour_JP

In [None]:
# plot of number of access over hour in japan time
df_hour_JP = df.hour_JP.value_counts().reset_index()
df_hour_JP.columns = ['hour_JP', 'count']
plt.figure(figsize=(18, 6))
sns.lineplot(x='hour_JP', y='count', data=df_hour_JP)
plt.xlabel('Hour in Japan Time')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Hour in Japan Time')
plt.xticks(np.arange(0, 24, 1))
plt.savefig('plot/4.png');

The park appears at 2AM, 10AM and 14PM

### hour_Local

In [None]:
# plot of number of access over hour in local time
df_hour_Local = df.hour_Local.value_counts().reset_index()
df_hour_Local.columns = ['hour_Local', 'count']
plt.figure(figsize=(18, 6))
sns.lineplot(x='hour_Local', y='count', data=df_hour_Local)
plt.xlabel('Hour in Local Time')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Hour in Local Time')
plt.xticks(np.arange(0, 24, 1))
plt.savefig('plot/5.png');

The peak appears at 4AM, 6AM and 20pm

In [None]:
df.country.unique()

### hour_Local - US

In [None]:
# plot of number of access over hour in US local time
df_hour_US_Local = df[df.country=='United States of America'].hour_Local.value_counts().reset_index()
df_hour_US_Local.columns = ['hour_Local', 'count']
plt.figure(figsize=(18, 6))
sns.lineplot(x='hour_Local', y='count', data=df_hour_US_Local)
plt.xlabel('Hour in Local Time')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Hour in US Local Time')
plt.xticks(np.arange(0, 24, 1))
plt.savefig('plot/11.png');

For attacks from US, the peaks are at 1AM and 22PM

### hour_Local - Ireland

In [None]:
# plot of number of access over hour in Ireland local time
df_hour_Ireland_Local = df[df.country=='Ireland'].hour_Local.value_counts().reset_index()
df_hour_Ireland_Local.columns = ['hour_Local', 'count']
plt.figure(figsize=(18, 6))
sns.lineplot(x='hour_Local', y='count', data=df_hour_Ireland_Local)
plt.xlabel('Hour in Local Time')
plt.ylabel('Number of Access')
plt.title('Number of Access Over Hour in Ireland Local Time')
plt.xticks(np.arange(0, 24, 1))
plt.savefig('plot/12.png');

For attacks from Ireland, the peaks appear at 8AM, 19PM and 21PM

## Location Related Variables Exploration

In this part, we explore location related variables, including ip_country, country, city

### ip_country

In [None]:
# top 10 IPs
df.groupby(['ip_country', 'country']).size().sort_values(ascending=False).head(10)

In [None]:
# bar chart of top 10 IPs
plt.figure(figsize=(12, 4))
df.ip_country.value_counts().head(10).sort_values().plot(kind='barh')
plt.xlabel('Number of Attacks')
plt.ylabel('IP')
plt.title('Horizantle Bar Chart of Access by IP')
plt.savefig('plot/6.png');

### country

In [None]:
# top 10 country
df_country = df.country.value_counts().reset_index()
df_country.columns = ['country', 'count']
top10_country = df_country.iloc[0:10, ]
top10_country.at[0, 'country'] = 'US'
top10_country.at[9, 'country'] = 'UK'
top10_country

In [None]:
# plot of top 10 number of access by country
plt.figure(figsize=(10,6))
plt.bar(x = top10_country['country'], height = top10_country['count'])
plt.xlabel('Country')
plt.ylabel('Number of Access')
plt.title('Top 10 Number of Access by Country')
plt.savefig('plot/7.png');

### city

In [None]:
# top 10 city
df_city = df.city.value_counts().reset_index()
df_city.columns = ['city', 'count']
top10_city = df_city.iloc[0:10, ]
top10_city

In [None]:
# plot of top 10 number of access by city
plt.figure(figsize=(10,6))
plt.bar(x = top10_city['city'], height = top10_city['count'])
plt.xlabel('City')
plt.ylabel('Number of Access')
plt.title('Top 10 Number of Access by City')
plt.xticks(rotation=15)
plt.savefig('plot/8.png');

## Technique Related Variables Exploration

In this part, we explore technique related variables, including method and http_version

### method


In [None]:
# bar chart of number of requests by method
plt.figure(figsize=(10, 6))
df.method.value_counts().plot(kind='bar')
plt.xlabel('Method')
plt.ylabel('Number of Access')
plt.title('Number of Access by Method')
plt.xticks(rotation = 45)
plt.savefig('plot/9.png');

### http_version

In [None]:
# bar chart of proportion of attacks by http version
plt.figure(figsize=(6, 6))
(df['http_version'].value_counts() / df.shape[0]).plot(kind = 'bar')
plt.xlabel('HTTP Version')
plt.ylabel('Proportion')
plt.title('Proportion of Access by Http Version')
plt.xticks(rotation = 0)
plt.yticks(np.arange(0, 1.01, 0.05))
plt.savefig('plot/10.png');

In [None]:
# export the dataframe as csv file
df.to_csv('data/honeypot.csv')

## Reference

- [IP2Location™ LITE IP-COUNTRY-REGION-CITY-LATITUDE-LONGITUDE-ZIPCODE-TIMEZONE Database](https://lite.ip2location.com/database/ip-country-region-city-latitude-longitude-zipcode-timezone)
- [ValueError: invalid literal for int() with base 10](https://stackoverflow.com/questions/1841565/valueerror-invalid-literal-for-int-with-base-10)
- [python pandas- apply function with two arguments to columns](https://stackoverflow.com/questions/34279378/python-pandas-apply-function-with-two-arguments-to-columns)