In [1]:
#
# Import all packages
#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pydot

## Models used in the assignment
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, validation_curve
from sklearn.preprocessing import StandardScaler
from io import StringIO
from sklearn.tree import export_graphviz
from sklearn.metrics import silhouette_score#from apyori import apriori


import warnings
# To ignore any future warnings
warnings.filterwarnings("ignore")


In [2]:
# read the dataset and set skipinitialspace to true to be able to .replace
df = pd.read_csv('./Weblog.csv', skipinitialspace=True, encoding='latin')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15789 entries, 0 to 15788
Data columns (total 4 columns):
IP address    15789 non-null object
Timestamp     15789 non-null object
Request       15789 non-null object
Staus         15789 non-null int64
dtypes: int64(1), object(3)
memory usage: 493.5+ KB
None


In [3]:
# load logs from wdata
wdata = open('./Weblog.csv', 'r').readlines()

# print the first 3 lines
print('\n'.join(wdata[:3]))

IP address,Timestamp,Request,Staus

10.128.2.1,[29/Nov/2017:06:58:55,GET /login.php HTTP/1.1,200

10.128.2.1,[29/Nov/2017:06:59:02,POST /process.php HTTP/1.1,302



In [4]:
##Put the dataset into a table for easier viewing

# set names of pandas dataframe
names=['IP Address', 'TimeStamp', 'Request', 'Status']
# read the dataframe
df = pd.read_csv('./Weblog.csv', names=names, header=None)
df.drop(0, inplace=True)  # drop the row with index 0, on axis 0 (row-wise)
df.head()

Unnamed: 0,IP Address,TimeStamp,Request,Status
1,10.128.2.1,[29/Nov/2017:06:58:55,GET /login.php HTTP/1.1,200
2,10.128.2.1,[29/Nov/2017:06:59:02,POST /process.php HTTP/1.1,302
3,10.128.2.1,[29/Nov/2017:06:59:03,GET /home.php HTTP/1.1,200
4,10.131.2.1,[29/Nov/2017:06:59:04,GET /js/vendor/moment.min.js HTTP/1.1,200
5,10.130.2.1,[29/Nov/2017:06:59:06,GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1,200


In [5]:
#Preprocessing

#Split the request column into seperate and more readable columns.
def extract_method_and_protocol(row):
    # function to extract HTTP request method and protocol from a request string
    request_splits = row['Request'].split()  # split request string by space
    row['Method'] = request_splits[0]
    row['Protocol'] = request_splits[-1]
    row['Request'] = ' '.join(request_splits[1:-1])  # stitch remaining request string back
    return row

df = df.apply(extract_method_and_protocol, axis=1)

#Drop the protocol column
del df['Protocol']

# show the result
df.head()

Unnamed: 0,IP Address,TimeStamp,Request,Status,Method
1,10.128.2.1,[29/Nov/2017:06:58:55,/login.php,200,GET
2,10.128.2.1,[29/Nov/2017:06:59:02,/process.php,302,POST
3,10.128.2.1,[29/Nov/2017:06:59:03,/home.php,200,GET
4,10.131.2.1,[29/Nov/2017:06:59:04,/js/vendor/moment.min.js,200,GET
5,10.130.2.1,[29/Nov/2017:06:59:06,/bootstrap-3.3.7/js/bootstrap.js,200,GET


In [6]:
#Remove all unsuccessful requests
df['Status'] = df['Status'].astype(int)
df['Datetime'] = pd.to_datetime(df['TimeStamp'], format='[%d/%b/%Y:%H:%M:%S')  # set date time to pandas datatime obj
df = df.drop(['TimeStamp'], axis=1)

df = df[df['Status'] == 200]
print("After unsuccessful requests", len(df))

After unsuccessful requests 11330


In [7]:
from collections import defaultdict
import datetime

#Make a copy just in case
df2 = df.copy()
#Sort based upon the time
df2.sort_values(by='Datetime', inplace = True)
#initiate session ID and User ID to 0
session_id = 0
user_id = 0

# create a dictionaries to hold last access information
last_access = defaultdict(lambda:datetime.datetime.utcfromtimestamp(0))

# dictionary to find previous session, user ID and steps assigned to a specific date/ip/browser key
session_dict = defaultdict(lambda:1)
user_id_dict = defaultdict(lambda:1)
session_steps = defaultdict(lambda:1)

# function to be applied row wise
# for each row, produce session, user ID and path traversal
def get_log_user_info(row):
    # access global variables shared between all rows
    global session_id, user_id, session_dict, user_id_dict, session_steps, last_access
    
    session_key = str(row['Datetime'].date()) + '_' + row['IP Address']  # date + IP key for finding session
    user_key = str(row['Datetime'].date()) + '_' + row['IP Address'] # date + IP + browser key for finding user
    time_diff_session = row['Datetime'] - last_access[session_key]  # session time diff
    time_diff_user = row['Datetime'] - last_access[user_key]  # user time diff
    
    # if the time diff from previous session is > 30 mins, assign new session ID
    if time_diff_session.total_seconds() > 1800:
        session_id += 1
        session_dict[session_key] = session_id
    
    # if the time diff from previous session is > 60 mins, assign new user ID
    if time_diff_user.total_seconds() > 3600:
        user_id += 1
        user_id_dict[user_key] = user_id
        
    # update last access for session and user
    last_access[session_key] = row['Datetime']
    last_access[user_key] = row['Datetime']
    
    # assign extracted info from the row
    row['Session'] = session_dict[session_key]
    row['Step'] = session_steps[row['Session']]
    row['User_ID'] = user_id_dict[user_key]
    session_steps[row['Session']] += 1
    return row
# apply function above to get a new dataframe with added information
df2 = df2.apply(get_log_user_info, axis=1)


In [8]:
df.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11330 entries, 1 to 15789
Data columns (total 5 columns):
IP Address    11330 non-null object
Request       11330 non-null object
Status        11330 non-null int32
Method        11330 non-null object
Datetime      11330 non-null datetime64[ns]
dtypes: datetime64[ns](1), int32(1), object(3)
memory usage: 486.8+ KB


Unnamed: 0,IP Address,Request,Status,Method,Datetime,Session,Step,User_ID
4313,10.130.2.1,/,200,GET,2017-11-07 23:59:19,1,2,1
5431,10.130.2.1,/,200,GET,2017-11-07 23:59:19,1,3,1
5433,10.129.2.1,/login.php,200,GET,2017-11-08 00:39:07,2,1,2
4315,10.129.2.1,/login.php,200,GET,2017-11-08 00:39:07,2,2,2
5435,10.129.2.1,/login.php,200,GET,2017-11-08 01:36:22,3,1,2
