<h1> Saumay Agrawal </h1>
<h2> 16BCE1151 </h2>

In [1]:
import numpy as np
import pandas as pd
from time import mktime, strptime
from tabulate import tabulate

In [2]:
# Method to divide the logs sessions based on IP values
# to create sessions for different users
def divideByIP(log):
    sessions = []
    for ip in log.IP.unique():
        dftemp = log.loc[log.IP==ip, :]
        sessions.append(dftemp)
    return sessions

# Method to divide the logs sessions based on time using h1 value
# to create sessions for different users
def divideByH1(log, h1):
    h1 = h1*60
    start = [0, log.Epoch[log.index[0]]]
    sessions = []
    for index, row in log.iterrows():
        if (row['Epoch']-start[1])>h1:
            dftemp = log.loc[start[0]:index-1, :]
            start = [index, row['Epoch']]
            sessions.append(dftemp)
    dftemp = log.loc[start[0]:, :]
    sessions.append(dftemp)
    return sessions

# Method to sessionize the entire log
def getSessions(log, h1):
    sessions = []
    for x in divideByIP(log):
        sessions.append(divideByH1(x, h1))
    return sessions

# Method to covert time values into epoch values based on given format string
def addEpoch(log, timestr):
    log['Epoch'] = None
    log['Epoch'] = list(map(lambda x: mktime(strptime(x, timestr)), log['Time']))
    return log

# Method to shorten URL to 30 characters for ease of printing sessions
def shortenURL(log):
    log['URL'] = list(map(lambda x: x[:30], log['URL']))
    return log

# Method to print sessions
def printSessions(sessions):
    nusers = len(sessions)
    print('Sessions generated for {} users.\n'.format(nusers))
    for user in range(nusers):
        print('-'*100)
        print('Sessions of user #{}'.format(user+1))
        print('-'*100)
        print()
        nsess = len(sessions[user])
        for session in range(nsess):
            table = sessions[user][session].loc[:, ['IP', 'Time', 'URL']]
            table = tabulate(table.values, headers=['IP', 'Time', 'URL'], tablefmt='orgtbl')
            print(table)
            print()
#         print('='*100)
        print()

In [3]:

# Loading and preprocessing log file for question 1
log = pd.read_csv('dataset.csv', header=None)
log.columns=['IP', 'Time', 'URL', 'Status', 'Bytes']
log = shortenURL(log)
timestr = '[%d/%b/%Y:%H:%M:%S%z]'
log = addEpoch(log, timestr)
log.head()

Unnamed: 0,IP,Time,URL,Status,Bytes,Epoch
0,172.20.112.25,[02/Feb/2000:10:22:02-0500],GET /airmedia/images/welcome.g,200,3081.0,949467122.0
1,172.20.112.25,[02/Feb/2000:10:22:02-0500],GET /airmedia/images/edit_off.,200,268.0,949467122.0
2,172.20.112.25,[02/Feb/2000:10:22:02-0500],GET /airmedia/images/pubconten,200,1388.0,949467122.0
3,172.20.112.25,[02/Feb/2000:10:23:02-0500],GET /airmedia/gateway/1.htmlHT,304,252.0,949467182.0
4,172.20.112.25,[02/Feb/2000:10:23:02-0500],GET /airmedia/gateway/images/6,304,428.0,949467182.0


In [4]:
# Printing sessions from log file for question 1
h1 = 30
sessions = getSessions(log, h1)
printSessions(sessions)

Sessions generated for 2 users.

----------------------------------------------------------------------------------------------------
Sessions of user #1
----------------------------------------------------------------------------------------------------

              IP                         Time                             URL
0  172.20.112.25  [02/Feb/2000:10:22:02-0500]  GET /airmedia/images/welcome.g
1  172.20.112.25  [02/Feb/2000:10:22:02-0500]  GET /airmedia/images/edit_off.
2  172.20.112.25  [02/Feb/2000:10:22:02-0500]  GET /airmedia/images/pubconten
3  172.20.112.25  [02/Feb/2000:10:23:02-0500]  GET /airmedia/gateway/1.htmlHT
4  172.20.112.25  [02/Feb/2000:10:23:02-0500]  GET /airmedia/gateway/images/6
5  172.20.112.25  [02/Feb/2000:10:22:01-0500]  GET /airmedia/images/topic1_on
6  172.20.112.25  [02/Feb/2000:10:22:02-0500]  GET /airmedia/images/create_on

               IP                         Time                             URL
15  172.20.112.25  [02/Feb/2000:13:10:07

In [5]:
# Loading log file for question 2
log = pd.read_csv('weblog.csv')
log = shortenURL(log)
log.head()

Unnamed: 0,IP,Time,URL,Staus
0,10.128.2.1,[29/Nov/2017:06:58:55,GET /login.php HTTP/1.1,200
1,10.128.2.1,[29/Nov/2017:06:59:02,POST /process.php HTTP/1.1,302
2,10.128.2.1,[29/Nov/2017:06:59:03,GET /home.php HTTP/1.1,200
3,10.131.2.1,[29/Nov/2017:06:59:04,GET /js/vendor/moment.min.js H,200
4,10.130.2.1,[29/Nov/2017:06:59:06,GET /bootstrap-3.3.7/js/bootst,200


In [6]:
# Checking for null values
log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16007 entries, 0 to 16006
Data columns (total 4 columns):
IP       16007 non-null object
Time     16007 non-null object
URL      16007 non-null object
Staus    16007 non-null object
dtypes: object(4)
memory usage: 500.3+ KB


In [7]:
# Verifying the format of IP addresses
log.IP.unique()

array(['10.128.2.1', '10.131.2.1', '10.130.2.1', '10.129.2.1',
       '10.131.0.1', 'chmod:', 'rm:', 'timeout:', 'sh:', 'a.out:', '[Tue',
       '[Wed', '[Thu', '[Fri', '[Sat', '[Mon'], dtype=object)

In [8]:
# Keeping only valid IP addresses
log = log.loc[log.IP.isin(log.IP.unique()[:5])]
log.IP.unique()

array(['10.128.2.1', '10.131.2.1', '10.130.2.1', '10.129.2.1',
       '10.131.0.1'], dtype=object)

In [9]:
# Appending the epoch values
fstring = '[%d/%b/%Y:%H:%M:%S'
log['Epoch'] = list(map(lambda x: mktime(strptime(x, fstring)), log['Time']))
log.head()

Unnamed: 0,IP,Time,URL,Staus,Epoch
0,10.128.2.1,[29/Nov/2017:06:58:55,GET /login.php HTTP/1.1,200,1511919000.0
1,10.128.2.1,[29/Nov/2017:06:59:02,POST /process.php HTTP/1.1,302,1511919000.0
2,10.128.2.1,[29/Nov/2017:06:59:03,GET /home.php HTTP/1.1,200,1511919000.0
3,10.131.2.1,[29/Nov/2017:06:59:04,GET /js/vendor/moment.min.js H,200,1511919000.0
4,10.130.2.1,[29/Nov/2017:06:59:06,GET /bootstrap-3.3.7/js/bootst,200,1511919000.0


In [10]:
# Printing sessions for question 2
h1 = 30
sessions = getSessions(log, h1)
printSessions(sessions)

Sessions generated for 5 users.

----------------------------------------------------------------------------------------------------
Sessions of user #1
----------------------------------------------------------------------------------------------------

           IP                   Time                             URL
0  10.128.2.1  [29/Nov/2017:06:58:55         GET /login.php HTTP/1.1
1  10.128.2.1  [29/Nov/2017:06:59:02      POST /process.php HTTP/1.1
2  10.128.2.1  [29/Nov/2017:06:59:03          GET /home.php HTTP/1.1
6  10.128.2.1  [29/Nov/2017:06:59:19  GET /js/jquery.min.js HTTP/1.1

            IP                   Time                             URL
21  10.128.2.1  [29/Nov/2017:13:38:20  GET /css/font-awesome.min.css 
23  10.128.2.1  [29/Nov/2017:13:38:20     GET /css/style.css HTTP/1.1
27  10.128.2.1  [29/Nov/2017:13:38:21  GET /bootstrap-3.3.7/js/bootst
40  10.128.2.1  [29/Nov/2017:13:49:26  GET /countdown.php?name=RUET%2
41  10.128.2.1  [29/Nov/2017:13:49:32          G

              IP                   Time                       URL
8085  10.128.2.1  [12/Dec/2017:08:44:32  GET /robots.txt HTTP/1.1
8086  10.128.2.1  [12/Dec/2017:08:44:32            GET / HTTP/1.1
8087  10.128.2.1  [12/Dec/2017:08:44:32   GET /login.php HTTP/1.1

              IP                   Time                             URL
8089  10.128.2.1  [12/Dec/2017:09:23:24         GET /login.php HTTP/1.1
8090  10.128.2.1  [12/Dec/2017:09:23:24  GET /css/bootstrap.min.css HTT
8094  10.128.2.1  [12/Dec/2017:09:23:24     GET /css/style.css HTTP/1.1
8105  10.128.2.1  [12/Dec/2017:09:24:03  GET /js/vendor/moment.min.js H
8106  10.128.2.1  [12/Dec/2017:09:24:15       GET /archive.php HTTP/1.1
8107  10.128.2.1  [12/Dec/2017:09:24:18       GET /contest.php HTTP/1.1
8108  10.128.2.1  [12/Dec/2017:09:24:21  GET /contestproblem.php?name=R
8109  10.128.2.1  [12/Dec/2017:09:24:24  GET /standings.php?id=16 HTTP/
8110  10.128.2.1  [12/Dec/2017:09:24:36  GET /contestsubmission.php?id=
8111  10.128.2.

              IP                   Time                             URL
9223  10.128.2.1  [21/Dec/2017:22:22:58          GET /home.php HTTP/1.1
9225  10.128.2.1  [21/Dec/2017:22:22:59  GET /css/bootstrap.min.css HTT
9228  10.128.2.1  [21/Dec/2017:22:22:59      GET /css/main.css HTTP/1.1
9230  10.128.2.1  [21/Dec/2017:22:22:59  GET /js/vendor/modernizr-2.8.3
9233  10.128.2.1  [21/Dec/2017:22:23:02  GET /fonts/fontawesome-webfont
9234  10.128.2.1  [21/Dec/2017:22:23:04      GET /img/ruet.png HTTP/1.1
9235  10.128.2.1  [21/Dec/2017:22:23:50      POST /process.php HTTP/1.1
9236  10.128.2.1  [21/Dec/2017:22:23:50  GET /login.php?value=fail HTTP
9237  10.128.2.1  [21/Dec/2017:22:24:01      POST /process.php HTTP/1.1
9238  10.128.2.1  [21/Dec/2017:22:24:01  GET /login.php?value=fail HTTP
9253  10.128.2.1  [21/Dec/2017:22:25:11  GET /js/vendor/moment.min.js H

              IP                   Time                             URL
9258  10.128.2.1  [22/Dec/2017:18:59:23                  GET / 

               IP                   Time             URL
15502  10.128.2.1  [26/Feb/2018:00:44:50  GET / HTTP/1.1

               IP                   Time                             URL
15516  10.128.2.1  [26/Feb/2018:03:53:29      GET /css/main.css HTTP/1.1
15518  10.128.2.1  [26/Feb/2018:03:53:29     GET /css/style.css HTTP/1.1
15520  10.128.2.1  [26/Feb/2018:03:53:29  GET /bootstrap-3.3.7/js/bootst
15523  10.128.2.1  [26/Feb/2018:03:53:30      GET /img/ruet.png HTTP/1.1

               IP                   Time                             URL
15540  10.128.2.1  [26/Feb/2018:06:46:45                  GET / HTTP/1.1
15542  10.128.2.1  [26/Feb/2018:06:46:45  GET /css/bootstrap.min.css HTT
15544  10.128.2.1  [26/Feb/2018:06:46:47  GET /css/normalize.css HTTP/1.
15545  10.128.2.1  [26/Feb/2018:06:46:47      GET /css/main.css HTTP/1.1
15548  10.128.2.1  [26/Feb/2018:06:46:47                  GET / HTTP/1.1
15549  10.128.2.1  [26/Feb/2018:06:46:47  GET /js/vendor/jquery-1.12.0.m
15550  1

              IP                   Time                             URL
3695  10.131.2.1  [01/Dec/2017:07:03:31  GET /js/vendor/jquery-1.12.0.m
3702  10.131.2.1  [01/Dec/2017:07:03:37     GET /css/style.css HTTP/1.1
3718  10.131.2.1  [01/Dec/2017:07:03:43  GET /js/vendor/jquery-1.12.0.m
3719  10.131.2.1  [01/Dec/2017:07:03:43  GET /bootstrap-3.3.7/js/bootst
3724  10.131.2.1  [01/Dec/2017:07:04:09     GET /css/style.css HTTP/1.1
3725  10.131.2.1  [01/Dec/2017:07:04:09      GET /css/main.css HTTP/1.1
3730  10.131.2.1  [01/Dec/2017:07:04:09  GET /js/vendor/moment.min.js H
3736  10.131.2.1  [01/Dec/2017:07:04:31  GET /css/font-awesome.min.css 
3738  10.131.2.1  [01/Dec/2017:07:04:31  GET /bootstrap-3.3.7/js/bootst
3742  10.131.2.1  [01/Dec/2017:07:04:32  GET /css/font-awesome.min.css 
3745  10.131.2.1  [01/Dec/2017:07:04:32  GET /js/vendor/jquery-1.12.0.m
3751  10.131.2.1  [01/Dec/2017:07:04:32  GET /js/vendor/moment.min.js H

              IP                   Time                        


              IP                   Time                             URL
3691  10.130.2.1  [01/Dec/2017:07:03:31  GET /css/normalize.css HTTP/1.
3694  10.130.2.1  [01/Dec/2017:07:03:31     GET /css/style.css HTTP/1.1
3703  10.130.2.1  [01/Dec/2017:07:03:37  GET /js/vendor/moment.min.js H
3705  10.130.2.1  [01/Dec/2017:07:03:37  GET /js/vendor/jquery-1.12.0.m
3713  10.130.2.1  [01/Dec/2017:07:03:43  GET /js/vendor/modernizr-2.8.3
3716  10.130.2.1  [01/Dec/2017:07:03:43  GET /css/font-awesome.min.css 
3722  10.130.2.1  [01/Dec/2017:07:04:09  GET /js/vendor/modernizr-2.8.3
3727  10.130.2.1  [01/Dec/2017:07:04:09  GET /bootstrap-3.3.7/js/bootst
3731  10.130.2.1  [01/Dec/2017:07:04:31  GET /contestproblem.php?name=R
3732  10.130.2.1  [01/Dec/2017:07:04:31  GET /css/bootstrap.min.css HTT
3734  10.130.2.1  [01/Dec/2017:07:04:31  GET /js/vendor/modernizr-2.8.3
3735  10.130.2.1  [01/Dec/2017:07:04:31  GET /css/normalize.css HTTP/1.
3744  10.130.2.1  [01/Dec/2017:07:04:32      GET /css/main.css 


              IP                   Time                             URL
8693  10.130.2.1  [16/Dec/2017:16:52:53  GET /css/normalize.css HTTP/1.
8694  10.130.2.1  [16/Dec/2017:16:52:54     GET /css/style.css HTTP/1.1
8696  10.130.2.1  [16/Dec/2017:16:52:55      GET /css/main.css HTTP/1.1

              IP                   Time                             URL
8702  10.130.2.1  [16/Dec/2017:18:03:43          GET /home.php HTTP/1.1
8703  10.130.2.1  [16/Dec/2017:18:03:44         GET /login.php HTTP/1.1
8704  10.130.2.1  [16/Dec/2017:18:03:45  GET /css/bootstrap.min.css HTT
8709  10.130.2.1  [16/Dec/2017:18:03:48  GET /js/vendor/modernizr-2.8.3
8712  10.130.2.1  [16/Dec/2017:18:04:09  GET /fonts/fontawesome-webfont
8713  10.130.2.1  [16/Dec/2017:18:04:19         GET /login.php HTTP/1.1
8714  10.130.2.1  [16/Dec/2017:18:04:24      GET /img/ruet.png HTTP/1.1
8715  10.130.2.1  [16/Dec/2017:18:04:33         GET /index.php HTTP/1.1
8716  10.130.2.1  [16/Dec/2017:18:04:35         GET /login.php

               IP                   Time                             URL
14975  10.130.2.1  [22/Feb/2018:06:25:17  GET /js/vendor/modernizr-2.8.3
14990  10.130.2.1  [22/Feb/2018:06:26:33  GET /login.php?value=fail HTTP
14991  10.130.2.1  [22/Feb/2018:06:26:38      POST /process.php HTTP/1.1
14992  10.130.2.1  [22/Feb/2018:06:26:38  GET /login.php?value=fail HTTP
14997  10.130.2.1  [22/Feb/2018:06:30:04      POST /process.php HTTP/1.1
14998  10.130.2.1  [22/Feb/2018:06:30:04  GET /login.php?value=fail HTTP
14999  10.130.2.1  [22/Feb/2018:06:30:08      POST /process.php HTTP/1.1
15000  10.130.2.1  [22/Feb/2018:06:30:08  GET /login.php?value=fail HTTP
15001  10.130.2.1  [22/Feb/2018:06:30:13      POST /process.php HTTP/1.1
15002  10.130.2.1  [22/Feb/2018:06:30:13  GET /login.php?value=fail HTTP
15020  10.130.2.1  [22/Feb/2018:06:31:58    GET /adminpanel.php HTTP/1.1
15021  10.130.2.1  [22/Feb/2018:06:31:59          GET /home.php HTTP/1.1
15022  10.130.2.1  [22/Feb/2018:06:32:12          G

              IP                   Time                             URL
3331  10.129.2.1  [30/Nov/2017:18:40:28  GET /contestproblem.php?name=R
3332  10.129.2.1  [30/Nov/2017:18:40:33  GET /details.php?id=41 HTTP/1.
3333  10.129.2.1  [30/Nov/2017:18:40:36  GET /editcontestproblem.php?id
3334  10.129.2.1  [30/Nov/2017:18:40:41  GET /profile.php?user=shawon H
3336  10.129.2.1  [30/Nov/2017:18:40:54                  GET / HTTP/1.1
3349  10.129.2.1  [30/Nov/2017:18:42:11  GET /description.php?id=76 HTT
3350  10.129.2.1  [30/Nov/2017:18:42:18       GET /archive.php HTTP/1.1
3351  10.129.2.1  [30/Nov/2017:18:42:21  GET /allsubmission.php HTTP/1.
3352  10.129.2.1  [30/Nov/2017:18:42:27  GET /allsubmission.php?page=2 
3353  10.129.2.1  [30/Nov/2017:18:42:32  GET /profile.php?user=nasif25 
3354  10.129.2.1  [30/Nov/2017:18:42:46       GET /contest.php HTTP/1.1
3362  10.129.2.1  [30/Nov/2017:18:44:27  GET /description.php?id=76 HTT
3404  10.129.2.1  [30/Nov/2017:18:54:57  GET /contestproblem.php

              IP                   Time                             URL
3544  10.131.0.1  [30/Nov/2017:20:33:50       GET /contest.php HTTP/1.1
3547  10.131.0.1  [30/Nov/2017:20:33:53  GET /contestproblem.php?name=R
3548  10.131.0.1  [30/Nov/2017:20:33:59  GET /details.php?id=44 HTTP/1.
3549  10.131.0.1  [30/Nov/2017:20:34:22          GET /home.php HTTP/1.1
3563  10.131.0.1  [30/Nov/2017:20:56:38       GET /archive.php HTTP/1.1
3564  10.131.0.1  [30/Nov/2017:20:56:42  GET /archive.php?page=2 HTTP/1
3565  10.131.0.1  [30/Nov/2017:20:56:47          GET /home.php HTTP/1.1
3566  10.131.0.1  [30/Nov/2017:20:56:58       GET /contest.php HTTP/1.1
3567  10.131.0.1  [30/Nov/2017:20:57:01          GET /home.php HTTP/1.1

              IP                   Time                             URL
3590  10.131.0.1  [01/Dec/2017:00:41:00      POST /process.php HTTP/1.1
3591  10.131.0.1  [01/Dec/2017:00:41:00          GET /home.php HTTP/1.1
3592  10.131.0.1  [01/Dec/2017:00:41:05  GET /contestproblem.ph

              IP                   Time                             URL
8633  10.131.0.1  [16/Dec/2017:15:09:35         GET /login.php HTTP/1.1
8634  10.131.0.1  [16/Dec/2017:15:09:36  GET /css/bootstrap.min.css HTT
8635  10.131.0.1  [16/Dec/2017:15:09:36  GET /css/normalize.css HTTP/1.
8638  10.131.0.1  [16/Dec/2017:15:09:36      GET /css/main.css HTTP/1.1
8640  10.131.0.1  [16/Dec/2017:15:09:37  GET /js/vendor/modernizr-2.8.3
8642  10.131.0.1  [16/Dec/2017:15:09:37  GET /bootstrap-3.3.7/js/bootst
8643  10.131.0.1  [16/Dec/2017:15:09:40  GET /fonts/fontawesome-webfont
8644  10.131.0.1  [16/Dec/2017:15:09:48         GET /login.php HTTP/1.1
8645  10.131.0.1  [16/Dec/2017:15:09:53         GET /login.php HTTP/1.1

              IP                   Time                             URL
8652  10.131.0.1  [16/Dec/2017:16:20:40                  GET / HTTP/1.1
8653  10.131.0.1  [16/Dec/2017:16:20:40         GET /login.php HTTP/1.1
8654  10.131.0.1  [16/Dec/2017:16:20:41  GET /css/bootstrap.min

15084  10.131.0.1  [22/Feb/2018:10:40:44  GET /login.php HTTP/1.1

               IP                   Time                             URL
15090  10.131.0.1  [22/Feb/2018:12:41:22  GET /css/normalize.css HTTP/1.
15094  10.131.0.1  [22/Feb/2018:12:41:23  GET /bootstrap-3.3.7/js/bootst
15110  10.131.0.1  [22/Feb/2018:12:43:59         GET /login.php HTTP/1.1
15111  10.131.0.1  [22/Feb/2018:12:44:05      POST /process.php HTTP/1.1
15112  10.131.0.1  [22/Feb/2018:12:44:06          GET /home.php HTTP/1.1
15113  10.131.0.1  [22/Feb/2018:12:44:06  GET /bootstrap-3.3.7/js/bootst
15114  10.131.0.1  [22/Feb/2018:12:44:07  GET /js/vendor/moment.min.js H
15121  10.131.0.1  [22/Feb/2018:13:01:20  GET /css/font-awesome.min.css 
15122  10.131.0.1  [22/Feb/2018:13:01:21      GET /css/main.css HTTP/1.1
15126  10.131.0.1  [22/Feb/2018:13:01:21  GET /bootstrap-3.3.7/js/bootst
15127  10.131.0.1  [22/Feb/2018:13:01:21  GET /js/vendor/jquery-1.12.0.m
15128  10.131.0.1  [22/Feb/2018:13:01:23      GET /img/ru