In [1]:
import pandas as pd
import arff
# Load the request log from disk; the CSV's "index" column becomes the row index.
print("Hello")
csv_path = "./data.csv"
df = pd.read_csv(csv_path, index_col="index")
# Show the first rows as a quick check of the parsed columns.
print(df.head())

Hello
              host       time method                           url  response  \
index                                                                          
0      ***.novo.dk  805465029    GET                     /ksc.html       200   
1      ***.novo.dk  805465031    GET    /images/ksclogo-medium.gif       200   
2      ***.novo.dk  805465051    GET  /images/MOSAIC-logosmall.gif       200   
3      ***.novo.dk  805465053    GET     /images/USA-logosmall.gif       200   
4      ***.novo.dk  805465054    GET    /images/NASA-logosmall.gif       200   

       bytes  
index         
0       7067  
1       5866  
2        363  
3        234  
4        786  


In [2]:
# Keep only page requests: URLs that contain ".html" or that end with "/".
# regex=False makes ".html" a literal substring; interpreted as a regular
# expression the leading "." would match any character (e.g. "xhtml").
# NOTE(review): the original additionally AND-ed a ".htm" test, which can never
# exclude a URL that already matches ".html", so it is dropped here. If plain
# ".htm" pages were meant to be kept as well, widen this mask - confirm.
is_html_page = df.url.str.contains(".html", case=False, regex=False)
is_directory = df.url.str.endswith("/")
remove_media = df[is_html_page | is_directory]
print("==================================")
# Keep only responses with status 200; the distinct status codes are printed
# before and after the filter.
print(remove_media["response"].unique())
remove_media = remove_media[remove_media["response"] == 200]
print(remove_media["response"].unique())
print("==================================")
# Keep only GET requests; the distinct methods are printed before and after.
print(remove_media["method"].unique())
remove_media = remove_media[remove_media["method"] == "GET"]
print(remove_media["method"].unique())
print("==================================")


[200 304 501 404]
[200]
['GET' 'HEAD']
['GET']


In [3]:
# Bucket the filtered requests by client host and report how many hosts exist.
grouped_by_host = remove_media.groupby(by=["host"])
print(len(grouped_by_host))

# Discard hosts that contributed a single row, then report the remaining
# number of hosts.
data = grouped_by_host.filter(lambda host_rows: len(host_rows) > 1)
grouped_more_action = data.groupby(by=["host"])
print(len(grouped_more_action))
print("==================================")

# Difference between each row's "time" and the previous row of the same host;
# the first row of every host has no predecessor and is set to 0.
per_host_time = grouped_more_action["time"]
divided_by_diff = per_host_time.diff().fillna(0)

33501
21292


In [4]:
# Attach the per-host time difference to every row.
data["difference"] = divided_by_diff

# Mark rows whose difference to the host's previous row exceeds 30 minutes
# ("time" is interpreted as seconds by the to_datetime call below).
gap_limit = 30 * 60
data["session_change"] = data["difference"] > gap_limit
data["time_stamp"] = pd.to_datetime(data["time"], unit="s")

# A session id is the group number of (host, 30-minute bin of time_stamp).
# NOTE(review): "session_change" is not used to build these ids; sessions are
# fixed 30-minute bins rather than gap-based - confirm this is intended.
half_hour_bins = pd.Grouper(key="time_stamp", freq="30min")
grouped_user_session = data.groupby(by=["host", half_hour_bins])
data["session"] = grouped_user_session.ngroup()

# Keep only sessions that contain more than one row.
grouped_by_session = data.groupby(by=["session"])
data = grouped_by_session.filter(lambda session_rows: len(session_rows) > 1)

# Number of rows in the session, repeated on each of its rows.
session_column = data.groupby(by=["session"])["session"]
data["session_actions"] = session_column.transform(lambda ids: len(ids))

print("==================================")





In [5]:
# Keep only sessions that contain more than one row.
sessions_by_id = data.groupby(by=["session"])
data = sessions_by_id.filter(lambda session_rows: len(session_rows) > 1)

# Session duration: last "time" value minus first "time" value of the session
# (in row order), repeated on each of the session's rows.
times_by_session = data.groupby(by=["session"])["time"]
data["session_time"] = times_by_session.transform(
    lambda stamps: stamps.iloc[-1] - stamps.iloc[0]
)

print("==================================")



In [6]:
# Session duration divided by the number of (non-null) rows in that session,
# repeated on each of the session's rows.
durations_by_session = data.groupby(by=["session"])["session_time"]
data["session_average_per_page"] = durations_by_session.transform(
    lambda durations: durations / durations.count()
)

print("==================================")



In [7]:
# Number of distinct URLs before the filter.
print(len(data.url.unique()))

# Keep a URL only if its rows make up at least 0.5% of all rows.
max_length = data["time"].count()
min_share = 0.005
rows_by_url = data.groupby(by=["url"])
data = rows_by_url.filter(
    lambda url_rows: min_share <= url_rows["time"].count() / max_length
)

# Number of distinct URLs after the filter.
print(len(data.url.unique()))

print("==================================")

135
41


In [8]:
# Add one boolean column per remaining URL: True on the rows whose "url"
# equals that URL. The comparison is evaluated per row, so no grouping is
# needed; a direct vectorized comparison replaces the per-group transform.
for site in data.url.unique():
    print(site)
    data[site] = data["url"] == site
    # Show which flag values occur for this URL.
    print(data[site].unique())


/ksc.html
[ True False]
/shuttle/missions/missions.html
[False  True]
/shuttle/resources/orbiters/columbia.html
[False  True]
/shuttle/missions/sts-69/mission-sts-69.html
[False  True]
/shuttle/countdown/
[False  True]
/shuttle/countdown/liftoff.html
[False  True]
/shuttle/countdown/lps/fr.html
[False  True]
/shuttle/missions/sts-71/mission-sts-71.html
[False  True]
/
[False  True]
/history/history.html
[False  True]
/shuttle/countdown/countdown.html
[False  True]
/facilities/tour.html
[False  True]
/shuttle/missions/sts-70/mission-sts-70.html
[False  True]
/shuttle/missions/sts-71/movies/movies.html
[False  True]
/shuttle/resources/orbiters/discovery.html
[False  True]
/shuttle/resources/orbiters/endeavour.html
[False  True]
/shuttle/missions/sts-69/images/images.html
[False  True]
/history/apollo/apollo.html
[False  True]
/history/apollo/apollo-13/apollo-13.html
[False  True]
/shuttle/resources/orbiters/challenger.html
[False  True]
/shuttle/missions/51-l/mission-51-l.html
[False  Tr

In [9]:

print("CREATING ARFF SESSIONS")

chosen_pages = data.url.unique()

# Drop the request-level columns; the session-level columns and the per-URL
# flag columns remain.
request_level_columns = [
    "method",
    "time_stamp",
    "response",
    "time",
    "host",
    "url",
    "bytes",
    "difference",
    "session_change",
]
sessions = data.drop(request_level_columns, axis=1)

grouped_sessions = sessions.groupby(by=["session"])

# One entry per session is collected into each of these lists.
session_ids = []
action_counts = []
durations = []
page_averages = []
all_sites = {site: [] for site in chosen_pages}

for _, group in grouped_sessions:
    # The first value of each session-level column is taken for the session.
    session_ids.append(group["session"].iloc[0])
    action_counts.append(group["session_actions"].iloc[0])
    durations.append(group["session_time"].iloc[0])
    # The average is truncated to an integer.
    page_averages.append(int(group["session_average_per_page"].iloc[0]))
    for site_name, flags in all_sites.items():
        # True when at least one row of the session has this URL's flag set.
        flags.append(bool(group[site_name].any()))

# Assemble the table: fixed columns first, then one column per URL.
unique_sessions = pd.DataFrame(
    {
        "Session": session_ids,
        "Actions": action_counts,
        "Time": durations,
        "PageAverageTime": page_averages,
        **all_sites,
    }
)

print(unique_sessions.head())
print(unique_sessions.count())

# Write the session table to an ARFF file.
arff.dump(
    "sessions_30_min.arff",
    unique_sessions.values,
    relation="sessions_30_min",
    names=unique_sessions.columns,
)

CREATING ARFF SESSIONS
   Session  Actions  Time  PageAverageTime  /ksc.html  \
0        0        3   352              117       True   
1        1        4   292               73      False   
2        2        3   360              120      False   
3        4        2    82               41      False   
4        6        2   107               53      False   

   /shuttle/missions/missions.html  /shuttle/resources/orbiters/columbia.html  \
0                             True                                       True   
1                            False                                      False   
2                            False                                      False   
3                            False                                      False   
4                            False                                      False   

   /shuttle/missions/sts-69/mission-sts-69.html  /shuttle/countdown/  \
0                                         False                False   
1  

In [13]:
# (Re)create one boolean column per remaining URL: True on the rows whose
# "url" equals that URL. The comparison is evaluated per row, so grouping by
# host does not change the result; a direct vectorized comparison is used.
for site in data.url.unique():
    print(site)
    data[site] = data["url"] == site
    # Show which flag values occur for this URL.
    print(data[site].unique())

/ksc.html
[ True False]
/shuttle/missions/missions.html
[False  True]
/shuttle/resources/orbiters/columbia.html
[False  True]
/shuttle/missions/sts-69/mission-sts-69.html
[False  True]
/shuttle/countdown/
[False  True]
/shuttle/countdown/liftoff.html
[False  True]
/shuttle/countdown/lps/fr.html
[False  True]
/shuttle/missions/sts-71/mission-sts-71.html
[False  True]
/
[False  True]
/history/history.html
[False  True]
/shuttle/countdown/countdown.html
[False  True]
/facilities/tour.html
[False  True]
/shuttle/missions/sts-70/mission-sts-70.html
[False  True]
/shuttle/missions/sts-71/movies/movies.html
[False  True]
/shuttle/resources/orbiters/discovery.html
[False  True]
/shuttle/resources/orbiters/endeavour.html
[False  True]
/shuttle/missions/sts-69/images/images.html
[False  True]
/history/apollo/apollo.html
[False  True]
/history/apollo/apollo-13/apollo-13.html
[False  True]
/shuttle/resources/orbiters/challenger.html
[False  True]
/shuttle/missions/51-l/mission-51-l.html
[False  Tr

In [15]:
print("CREATING ARFF HOST")

chosen_pages = data.url.unique()

# Drop the request-level columns and the session id; "host", the session-level
# columns and the per-URL flag columns remain.
users = data.drop(
    [
        "session",
        "method",
        "time_stamp",
        "response",
        "time",
        "url",
        "bytes",
        "difference",
        "session_change",
    ],
    axis=1,
)

grouped_users = users.groupby(by=["host"])
unique_users = pd.DataFrame()
print(users.head())

# One entry per host is collected into each of these lists.
h = []
savg = []
all_sites_user = {site: [] for site in chosen_pages}

for _, user_group in grouped_users:
    h.append(user_group["host"].unique()[0])
    # NOTE(review): this takes the first distinct value among the host's rows
    # (truncated to int). A host with several sessions has several values, so
    # the others are ignored - confirm whether an aggregate was intended.
    savg.append(int(user_group["session_average_per_page"].unique()[0]))
    for site_name in all_sites_user:
        # True when at least one row of the host has this URL's flag set.
        site_flags = any(user_group[site_name].unique())
        all_sites_user[site_name].append(site_flags)

unique_users["Host"] = h
unique_users["AvgTimePerPage"] = savg

for site_key in all_sites_user:
    unique_users[site_key] = all_sites_user[site_key]

print(unique_users.head())
# Per-column non-null counts, matching the summary printed for the session
# table. The previous any(unique_users) only iterated the column labels and
# therefore said nothing about the data.
print(unique_users.count())

# Write the host table to an ARFF file.
arff.dump(
    "users_30_min.arff",
    unique_users.values,
    relation="users_30_min",
    names=unique_users.columns,
)


CREATING ARFF HOST
              host  session_actions  session_time  session_average_per_page  \
index                                                                         
0      ***.novo.dk                3           352                117.333333   
6      ***.novo.dk                3           352                117.333333   
12     ***.novo.dk                3           352                117.333333   
13     ***.novo.dk                4           292                 73.000000   
18     ***.novo.dk                4           292                 73.000000   

       /ksc.html  /shuttle/missions/missions.html  \
index                                               
0           True                            False   
6          False                             True   
12         False                            False   
13         False                            False   
18         False                            False   

       /shuttle/resources/orbiters/columbia.html  \
ind

In [12]:
# Print the distinct values found in each column of the host table.
for column_name in unique_users.columns:
    distinct_values = unique_users[column_name].unique()
    print(distinct_values)

['***.novo.dk' '007.thegap.com' '01-dynamic-c.wokingham.luna.net' ...
 '204.250.59.66' '204.250.60.62' '204.250.63.209']
[117 120  26  36  46  37  34  59  61  35  20  71 137  16  23  39  10  32
 238  54  78  21  66  41  42   3  11  22 138  28  31  70 183   1 140  80
  14 266 121 125  47   8  24  30  79  56  72 174  45   9  17 133 112  13
   6 225  29 168  90   4  44 173 326 180  18  38  58  48  40  19  99  27
  77  12  52  60 494 158  63 195  69  75 108 102  53  51   7 334 162   2
 127  15  33 254 122  67  96  50 144 110 146 211 333  25  49 336 132 129
  76 517 105 281 279 433 104  81 123  92 153 193  43  73 175  83 118 328
 296 156 189  62  55  64 190 276  91 451 124   5 116 126  74 319 286 244
 340 100 179  88 210  65 218 197 231 131 176  93 143 142  82 182 101  85
 247 448  84 119 113 270  98 165 206  95 114 185 234 222 228 515 107 139
 161 606 327 559 106 152 103 240 167 303 202 220 186 227 154 235 383 558
 135 434 388 297 111 330 394 145 316 295 324 215 213 290 229  87 275 321
 45