In [13]:
import pandas as pd
import arff
# read the file
print("Hello")
df = pd.read_csv("./data.csv", index_col="index")
print(df.head())

Hello
              host       time method                           url  response  \
index                                                                          
0      ***.novo.dk  805465029    GET                     /ksc.html       200   
1      ***.novo.dk  805465031    GET    /images/ksclogo-medium.gif       200   
2      ***.novo.dk  805465051    GET  /images/MOSAIC-logosmall.gif       200   
3      ***.novo.dk  805465053    GET     /images/USA-logosmall.gif       200   
4      ***.novo.dk  805465054    GET    /images/NASA-logosmall.gif       200   

       bytes  
index         
0       7067  
1       5866  
2        363  
3        234  
4        786  


In [14]:
# removing all not needed GET requests
remove_media = df[
    df.url.str.contains(".html", case=False) & df.url.str.contains(".htm", case=False)
    | df.url.str.endswith("/")
]
print("==================================")
# only 200
print(remove_media["response"].unique())
remove_media = remove_media[remove_media["response"] == 200]
print(remove_media["response"].unique())
print("==================================")
print(remove_media["method"].unique())
remove_media = remove_media[remove_media["method"] == "GET"]
print(remove_media["method"].unique())
print("==================================")


[200 304 501 404]
[200]
['GET' 'HEAD']
['GET']


In [15]:
# grouping by host (client user)
grouped_by_host = remove_media.groupby(by=["host"])
print(len(grouped_by_host))
# removing session with only one action
data = grouped_by_host.filter(lambda x: len(x) > 1)
print(len(data.groupby(by=["host"])))
print("==================================")
grouped_more_action = data.groupby(by=["host"])

divided_by_diff = (
    grouped_more_action["time"]
    .transform(
        lambda x: x.diff(),
    )
    .fillna(0)
)

33501
21292


In [16]:
data["difference"] = divided_by_diff

data["session_change"] = [
    True if diff > 30 * 60 else False for diff in data["difference"]
]
data["time_stamp"] = pd.to_datetime(data["time"], unit="s")

grouped_user_session = data.groupby(
    by=["host", pd.Grouper(key="time_stamp", freq="30min")]
)
data["session"] = grouped_user_session.ngroup()

grouped_by_session = data.groupby(by=["session"])
data = grouped_by_session.filter(lambda x: len(x) > 1)

data["session_actions"] = data.groupby(by=["session"])["session"].transform(
    lambda x: len(x)
)

print("==================================")





In [17]:
data = data.groupby(by=["session"]).filter(lambda x: len(x) > 1)
data["session_time"] = data.groupby(by=["session"])["time"].transform(
    lambda x: x.iloc[-1] - x.iloc[0]
)

print("==================================")



In [18]:
data["session_average_per_page"] = data.groupby(by=["session"])[
    "session_time"
].transform(lambda x: x / x.count())

print("==================================")



In [19]:
print(len(data.url.unique()))
max_length = data["time"].count()
data = data.groupby(by=["url"]).filter(
    lambda x: (x["time"].count() / max_length) >= 0.005
)
print(len(data.url.unique()))

print("==================================")

135
41


In [20]:
for site in data.url.unique():
    # print(site)
    data[site] = data.groupby(by=["session"])["url"].transform(lambda x: site == x)
    # print(data[site].unique())
# print(data.head(45))
# print(data.info())


In [21]:

print("CREATING ARFF SESSIONS")
chosen_pages = data.url.unique()
sessions = data.drop(
    ["method","time_stamp","response","time", "host", "url", "bytes", "difference", "session_change"], axis=1
)
# print(sessions.head())

grouped_sessions = sessions.groupby(by=["session"])
unique_sessions = pd.DataFrame()

s = []
t = []
a = []
p = []
all_sites = {}
for site in chosen_pages:
    all_sites[site] = []


for _, group in grouped_sessions:
    s.append(group["session"].unique()[0])
    a.append(group["session_actions"].unique()[0])
    t.append(group["session_time"].unique()[0])
    p.append(group["session_average_per_page"].unique()[0])
    for site_name in all_sites:
        all_sites[site_name] = any(group[site_name].unique())

unique_sessions["Session"] = s
unique_sessions["Actions"] = a
unique_sessions["Time"] = t
unique_sessions["PageAverageTime"] = p


for site_key in all_sites:
    unique_sessions[site_key] = all_sites[site_key]

print(unique_sessions.head())
print(unique_sessions.count())

arff.dump(
    "sessions_60_min.arff",
    unique_sessions.values,
    relation="sessions_60_min",
    names=unique_sessions.columns,
)

CREATING ARFF SESSIONS
   Session  Actions  Time  PageAverageTime  /ksc.html  \
0        0        3   352       117.333333       True   
1        1        4   292        73.000000       True   
2        2        4   909       227.250000       True   
3        3        2    82        41.000000       True   
4        4        3   119        39.666667       True   

   /shuttle/missions/missions.html  /shuttle/resources/orbiters/columbia.html  \
0                            False                                      False   
1                            False                                      False   
2                            False                                      False   
3                            False                                      False   
4                            False                                      False   

   /shuttle/missions/sts-69/mission-sts-69.html  /shuttle/countdown/  \
0                                         False                False   
1  

In [22]:
print("CREATING ARFF HOST")
chosen_pages = data.url.unique()

users = data.drop(
    [
        "session",
        "method",
        "time_stamp",
        "response",
        "time",
        "url",
        "bytes",
        "difference",
        "session_change",
    ],
    axis=1,
)

grouped_users = users.groupby(by=["host"])
unique_users = pd.DataFrame()
print(users.head())

h = []
sac = []
st = []
savg = []
all_sites_user = {}
for site in chosen_pages:
    all_sites_user[site] = []

for _, user_group in grouped_users:
    h.append(user_group["host"].unique()[0])
    savg.append(user_group["session_average_per_page"].unique()[0])
    for site_name in all_sites:
        all_sites_user[site_name] = any(group[site_name].unique())



CREATING ARFF HOST
              host  session_actions  session_time  session_average_per_page  \
index                                                                         
0      ***.novo.dk                3           352                117.333333   
6      ***.novo.dk                3           352                117.333333   
12     ***.novo.dk                3           352                117.333333   
13     ***.novo.dk                4           292                 73.000000   
18     ***.novo.dk                4           292                 73.000000   

       /ksc.html  /shuttle/missions/missions.html  \
index                                               
0           True                            False   
6          False                             True   
12         False                            False   
13         False                            False   
18         False                            False   

       /shuttle/resources/orbiters/columbia.html  \
ind

In [23]:
unique_users["Host"] = h
unique_users["AvgTimePerPage"] = savg
for site_key in all_sites_user:
    unique_users[site_key] = all_sites_user[site_key]

In [24]:
print(unique_users.head())
print(unique_users.count())

arff.dump(
    "users_60_min.arff",
    unique_users.values,
    relation="users_60_min",
    names=unique_users.columns,
)

                              Host  AvgTimePerPage  /ksc.html  \
0                      ***.novo.dk      117.333333       True   
1                   007.thegap.com      227.250000       True   
2  01-dynamic-c.wokingham.luna.net       26.000000       True   
3  02-dynamic-c.wokingham.luna.net       36.500000       True   
4  03-dynamic-c.wokingham.luna.net       46.000000       True   

   /shuttle/missions/missions.html  /shuttle/resources/orbiters/columbia.html  \
0                            False                                      False   
1                            False                                      False   
2                            False                                      False   
3                            False                                      False   
4                            False                                      False   

   /shuttle/missions/sts-69/mission-sts-69.html  /shuttle/countdown/  \
0                                         False   