In [None]:
import pandas as pd
import numpy as np
import datetime
import socket
import struct
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'


In [None]:
# Column names for the log file, in field order; they are passed to read_csv
# through `names=` below.
headerString = "date time time-taken c-ip cs-username cs-auth-group x-exception-id sc-filter-result cs-categories cs(Referer) sc-status s-action cs-method rs(Content-Type) cs-uri-scheme cs-host cs-uri-port cs-uri-path cs-uri-query cs-uri-extension cs(User-Agent) s-ip sc-bytes cs-bytes x-virus-id"
header = headerString.split(" ")
# Fields are separated by runs of whitespace. The pattern is a raw string so that
# "\s" reaches the regex engine instead of being parsed as an (invalid) string
# escape sequence. Anything from a '#' onwards is treated as a comment and skipped.
raw_data = pd.read_csv('./Data/message.txt', delimiter=r"\s+", index_col=False, encoding="utf-8", comment='#', names=header)


In [3]:
# df1 is a reproducible 50% sample of the raw log; df2 is a 50% sample of df1,
# taken before any column is removed from df1.
df1 = raw_data.sample(frac=0.5, random_state=200)
df2 = df1.sample(frac=0.5, random_state=200)

# Remove these columns from df1 (cs-username, cs-auth-group, date, ...).
df1 = df1.drop(
    [
        'cs-username', 'cs-auth-group', 'date',
        'time-taken', 'cs-method',
        'rs(Content-Type)', 'cs-uri-path',
        'cs-uri-query', 'cs-uri-extension', 'cs(User-Agent)', 's-ip', 'x-virus-id',
    ],
    axis=1,
)


In [4]:

def preprocessing(data):
    """Drop incomplete rows and blank out the '-' placeholder in 'x-exception-id'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an 'x-exception-id' column.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` that have no missing value in any column, with every
        'x-exception-id' equal to '-' replaced by the empty string.
    """
    complete_rows = data.dropna()
    is_placeholder = complete_rows['x-exception-id'] == '-'
    complete_rows.loc[is_placeholder, 'x-exception-id'] = ""
    return complete_rows


In [5]:
def compute_similarity(string1,stand_val):
    """Return the positions at which `string1` equals `stand_val`.

    Parameters
    ----------
    string1 : array-like
        Values to compare (the caller in this file passes a column's `.values`).
    stand_val : scalar
        Reference value each element is compared against with `==`.

    Returns
    -------
    numpy.ndarray
        First-axis positions (0-based, positional, not index labels) where the
        comparison is true.
    """
    is_equal = string1 == stand_val
    # np.where with a single argument returns one index array per axis;
    # only the first axis is kept.
    first_axis_hits, *_ = np.where(is_equal)
    return first_axis_hits


def apply_sim(data,column_name,one_line_df,new_col_name):
    """Flag the rows of `data` whose `column_name` value equals the reference row's.

    The comparison is done element by element, so the flag lands on the matching
    row whatever the index of `data` looks like (rows dropped, shuffled, not
    starting at 0, ...). Matching positional offsets against index labels only
    gives the right rows for a pristine 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate; it is modified in place.
    column_name : str
        Column to compare.
    one_line_df : pandas.DataFrame
        Reference frame; the first value of its `column_name` column is used.
    new_col_name : str
        Name of the integer column to write (1 = equal to the reference, 0 = not).

    Returns
    -------
    pandas.DataFrame or int
        `data` with the new column, or 0 (after printing a message) when
        `column_name` is not a column of `data`.
    """
    if column_name not in data.columns:
        print("Wrong column name")
        return 0

    reference_value = one_line_df[column_name].values[0]
    is_match = data[column_name].eq(reference_value)
    # Assign positionally so the result never depends on index alignment.
    data[new_col_name] = is_match.astype(int).to_numpy()
    return data


In [6]:
def compute_all_sims(data,y):
    """Score each row of `data` by how many reference fields it shares with `y`.

    For each of a fixed list of columns, `apply_sim` writes a 0/1 flag into the
    'sim' column of `data`, and that flag is added to the running total in
    'final_sim'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate; it is modified in place. If it already has a
        'final_sim' column, the new flags are added on top of it.
    y : pandas.DataFrame
        Reference frame passed through to `apply_sim` (its first row is used).

    Returns
    -------
    pandas.DataFrame
        `data`, with 'sim' (flags of the last column processed) and 'final_sim'.
    """
    all_cols = ['cs-categories', 'sc-status', 's-action', 'cs-uri-scheme', 'cs-uri-port']
    for col in all_cols:
        outcome = apply_sim(data, col, y, 'sim')
        if not isinstance(outcome, pd.DataFrame):
            # apply_sim rejected the column and left 'sim' untouched; adding it
            # would count the previous column twice (or fail if 'sim' is absent).
            continue
        if 'final_sim' not in data.columns:
            data['final_sim'] = data['sim']
        else:
            data['final_sim'] = data['final_sim'] + data['sim']
    return data


In [7]:
def add_urlsize(data):
    """Add a "url_size" column holding the text length of each "cs(Referer)" value.

    Every value is converted with `str()` before measuring, so non-string values
    are measured by their string form. `data` is modified in place and returned.
    """
    referers = data["cs(Referer)"]
    data["url_size"] = referers.map(str).map(len)
    return data

def add_hostfreq(data):
    """Add a "cs-host-frequency" column: the number of rows sharing each row's "cs-host".

    The count is taken from the "cs-host" column itself, so the function needs no
    other column to be present (in particular it does not require "url_size" or
    "s-action"). `data` is modified in place and returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "cs-host" column.

    Returns
    -------
    pandas.DataFrame
        `data` with the added "cs-host-frequency" column. Rows whose host is
        missing get a missing frequency.
    """
    host_counts = data["cs-host"].value_counts()
    data["cs-host-frequency"] = data["cs-host"].map(host_counts)
    return data






In [33]:
def normalization(data):
    """Return a copy of `data` with its numeric features standardised (z-score).

    'sc-bytes' and 'cs-bytes' are first converted to numbers. Rows where either
    value is missing or cannot be parsed are dropped, so that the remaining
    columns hold real numeric values rather than validity flags. Then
    'sc-bytes', 'cs-bytes', 'url_size' and 'cs-host-frequency' are each replaced
    by (value - mean) / standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the four columns named above; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns. A column with zero spread yields NaN,
        because the division is not guarded.
    """
    byte_cols = ['sc-bytes', 'cs-bytes']
    df4 = data.copy()
    for col in byte_cols:
        # errors='coerce' turns unparsable entries into NaN instead of raising.
        df4[col] = pd.to_numeric(df4[col], errors='coerce')
    # Keep only rows whose byte counts are real numbers.
    df4 = df4.dropna(subset=byte_cols).copy()

    for col in byte_cols + ['url_size', 'cs-host-frequency']:
        df4[col] = (df4[col] - df4[col].mean()) / df4[col].std()
    return df4


def normalization2(data):
    """Return a copy of `data` with selected columns rescaled to [0, 1] (min-max).

    'sc-bytes' and 'cs-bytes' are first converted to numbers, and rows where
    either value is missing or cannot be parsed are dropped. Then 'final_sim',
    'sc-bytes' and 'cs-bytes' are each replaced by (value - min) / (max - min).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'final_sim', 'sc-bytes' and 'cs-bytes' columns; it is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns. A column whose min equals its max
        yields NaN, because the division is not guarded.
    """
    byte_cols = ['sc-bytes', 'cs-bytes']
    df4 = data.copy()
    for col in byte_cols:
        # Byte counts may arrive as text; errors='coerce' maps junk to NaN.
        df4[col] = pd.to_numeric(df4[col], errors='coerce')
    df4 = df4.dropna(subset=byte_cols).copy()

    for col in ['final_sim'] + byte_cols:
        lowest = df4[col].min()
        highest = df4[col].max()
        df4[col] = (df4[col] - lowest) / (highest - lowest)
    return df4



def make_dummies(data, string):
    """Replace column `string` of `data` with its one-hot indicator columns.

    The indicator columns are named '<string>_<value>' and are appended after the
    remaining original columns. A new frame is returned; `data` keeps its column.
    """
    indicators = pd.get_dummies(data[string], prefix=string)
    remaining = data.drop(string, axis=1)
    return pd.concat([remaining, indicators], axis=1)



In [34]:

# y is the last row of df1, kept as a one-row frame.
y = df1.tail(1)
y
# df3: reproducible 40% sample of df1 with a fresh 0..n-1 index.
df3 = df1.sample(frac=0.4, random_state=2000).reset_index(drop=True)
# Row count before and after preprocessing().
print(len(df3))
df3 = preprocessing(df3)
print(len(df3))
# One-hot encode the two categorical columns, one after the other.
df3 = make_dummies(make_dummies(df3, 'sc-filter-result'), 'x-exception-id')
df3

208366
207774


Unnamed: 0,time,c-ip,cs-categories,cs(Referer),sc-status,s-action,cs-uri-scheme,cs-host,cs-uri-port,sc-bytes,...,sc-filter-result_OBSERVED,sc-filter-result_PROXIED,x-exception-id_,x-exception-id_dns_server_failure,x-exception-id_dns_unresolved_hostname,x-exception-id_internal_error,x-exception-id_invalid_request,x-exception-id_policy_denied,x-exception-id_tcp_error,x-exception-id_unsupported_protocol
0,21:17:48,77323a5e082ea451,unavailable,http://www.mygaypalace.com/pics.html,200,TCP_MISS,http,thumbs.mygaypalace.com,80,20957,...,1,0,1,0,0,0,0,0,0,0
1,20:43:03,ce4f02e53b9760d5,unavailable,http://static.xvideos.com/swf/xv-player.swf,400,TCP_NC_MISS,http,porn771.xvideos.com,80,221,...,1,0,1,0,0,0,0,0,0,0
2,20:48:19,3f78ab0fdf2a1aaf,unavailable,-,0,TCP_ERR_MISS,http,208.73.210.29,80,0,...,1,0,1,0,0,0,0,0,0,0
3,21:23:00,dd60f0927674ae4a,unavailable,http://www.01-sy.com/administration/ProductDet...,200,TCP_NC_MISS,http,www.01-sy.com,80,914,...,1,0,1,0,0,0,0,0,0,0
4,20:35:51,ee73ceddd52050f0,unavailable,http://www.xhamster.com/movies/101714/swaedyas...,200,TCP_HIT,http,static.xhamster.com,80,1115,...,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208361,20:57:13,b96e84cbed2e51f2,unavailable,http://www.phonesreview.co.uk/2011/02/01/3dboa...,200,TCP_HIT,http,static.phonesreview.co.uk,80,1037,...,1,0,1,0,0,0,0,0,0,0
208362,20:38:16,2056caa323b6174a,unavailable,http://porn322.xvideos.com/videos/flv/7/f/5/,400,TCP_NC_MISS,http,porn322.xvideos.com,80,221,...,1,0,1,0,0,0,0,0,0,0
208363,20:54:04,6d98469a3f1de6f4,unavailable,http://www.anakbnet.com/video/file.php?f=321,200,TCP_HIT,http,www.anakbnet.com,80,23384,...,1,0,1,0,0,0,0,0,0,0
208364,20:51:32,830cfe77c2e3546b,unavailable;unavailable,-,0,TCP_ERR_MISS,-,-,0,0,...,0,0,0,0,0,1,0,0,0,0


In [32]:
# The similarity step is disabled here, so this cell adds no 'final_sim' column.
#compute_all_sims(df3,y)
df3 = df3.pipe(add_urlsize).pipe(add_hostfreq)
dfn = df3.pipe(normalization)
# dfn1: reproducible 10% sample of the normalised frame.
dfn1 = dfn.sample(frac=0.1, random_state=1337)


In [11]:
from sklearn.cluster import DBSCAN

#dfn=dfn.dropna()
# Fit DBSCAN on the numeric features plus the one-hot indicator columns of dfn1.
clusters = DBSCAN(eps=0.7, min_samples=100)
clusters.fit(
    dfn1[[
        'cs-bytes', 'sc-bytes', 'url_size', 'cs-host-frequency',
        'sc-filter-result_DENIED', 'sc-filter-result_OBSERVED', 'sc-filter-result_PROXIED',
        'x-exception-id_', 'x-exception-id_dns_server_failure',
        'x-exception-id_dns_unresolved_hostname', 'x-exception-id_internal_error',
        'x-exception-id_invalid_request', 'x-exception-id_policy_denied',
        'x-exception-id_tcp_error', 'x-exception-id_unsupported_protocol',
    ]]
)
# Store the cluster label of every row as a string.
dfn1['labels'] = list(map(str, clusters.labels_))

dfn1

Unnamed: 0,time,c-ip,cs-categories,cs(Referer),sc-status,s-action,cs-uri-scheme,cs-host,cs-uri-port,sc-bytes,...,x-exception-id_dns_server_failure,x-exception-id_dns_unresolved_hostname,x-exception-id_internal_error,x-exception-id_invalid_request,x-exception-id_policy_denied,x-exception-id_tcp_error,x-exception-id_unsupported_protocol,url_size,cs-host-frequency,labels
127261,20:35:27,e3773eab4436f15c,unavailable,http://tag.admeld.com/ad/iframe/70/ugo/728x90/...,200,TCP_NC_MISS,http,adserving.cpxinteractive.com,80,-0.037168,...,0,0,0,0,0,0,0,4.861212,0.150530,-1
207252,20:56:42,1ed5b33c56949d77,unavailable,http://forum.jsoftj.com/t6078.html,200,TCP_HIT,http,img66.imageshack.us,80,-0.032412,...,0,0,0,0,0,0,0,-0.256696,-0.676385,0
62689,21:13:23,0e8fe56c260eb807,unavailable,http://sn109w.snt109.mail.live.com/default.asp...,200,TCP_HIT,http,js.wlxrs.com,80,0.055723,...,0,0,0,0,0,0,0,0.139641,-0.676385,0
49644,21:06:37,dbe03e4dd0527450,unavailable,http://translate.google.com/?hl=ar&tab=wT,204,TCP_NC_MISS,http,csi.gstatic.com,80,-0.037853,...,0,0,0,0,0,0,0,-0.136072,-0.599147,0
110529,20:43:50,c1230109a1b3dc53,unavailable,http://www.pichunter.com/,200,TCP_NC_MISS,http,www.pichunter.com,80,-0.037802,...,0,0,0,0,0,0,0,-0.411784,-0.664720,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91243,20:55:16,6e6c5d8f2ef009cd,unavailable,http://www.momsxboys.com/,200,TCP_HIT,http,momsxboys.csiporn.com,80,-0.021583,...,0,0,0,0,0,0,0,-0.411784,-0.640446,0
37163,21:16:34,671296c79085992c,unavailable,http://x36.iloveim.com/build_3.9.2.1/comet.html,200,TCP_NC_MISS,http,x36.iloveim.com,80,-0.037821,...,0,0,0,0,0,0,0,-0.032679,-0.362075,0
188374,21:24:41,ba56445656234351,unavailable,http://www.artonline.tv/artrss/ARTSportRss.aspx,304,TCP_HIT,http,www.artonline.tv,80,-0.037825,...,0,0,0,0,0,0,0,-0.032679,-0.647697,0
175560,21:00:00,0e8fe56c260eb807,unavailable,http://www.google.com/search?hl=ar&gbv=2&tbm=i...,200,TCP_MISS,http,t0.gstatic.com,80,-0.032989,...,0,0,0,0,0,0,0,1.449274,-0.614279,0


In [12]:
# Non-null count of every column within each cluster label.
dfn1.groupby(by='labels').count()

Unnamed: 0_level_0,time,c-ip,cs-categories,cs(Referer),sc-status,s-action,cs-uri-scheme,cs-host,cs-uri-port,sc-bytes,...,x-exception-id_,x-exception-id_dns_server_failure,x-exception-id_dns_unresolved_hostname,x-exception-id_internal_error,x-exception-id_invalid_request,x-exception-id_policy_denied,x-exception-id_tcp_error,x-exception-id_unsupported_protocol,url_size,cs-host-frequency
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,481,481,481,481,481,481,481,481,481,481,...,481,481,481,481,481,481,481,481,481,481
0,17421,17421,17421,17421,17421,17421,17421,17421,17421,17421,...,17421,17421,17421,17421,17421,17421,17421,17421,17421,17421
1,1800,1800,1800,1800,1800,1800,1800,1800,1800,1800,...,1800,1800,1800,1800,1800,1800,1800,1800,1800,1800
2,819,819,819,819,819,819,819,819,819,819,...,819,819,819,819,819,819,819,819,819,819
3,123,123,123,123,123,123,123,123,123,123,...,123,123,123,123,123,123,123,123,123,123
4,133,133,133,133,133,133,133,133,133,133,...,133,133,133,133,133,133,133,133,133,133


In [18]:
# Rows assigned to cluster "1", restricted to the label and the clustering features.
dfn1.loc[
    dfn1['labels'] == "1",
    [
        'cs-bytes', 'sc-bytes', 'labels',
        'url_size', 'cs-host-frequency',
        'sc-filter-result_DENIED', 'sc-filter-result_OBSERVED',
        'sc-filter-result_PROXIED', 'x-exception-id_',
        'x-exception-id_dns_server_failure', 'x-exception-id_dns_unresolved_hostname',
        'x-exception-id_internal_error', 'x-exception-id_invalid_request',
        'x-exception-id_policy_denied', 'x-exception-id_tcp_error',
        'x-exception-id_unsupported_protocol',
    ],
]

Unnamed: 0,cs-bytes,sc-bytes,labels,url_size,cs-host-frequency,sc-filter-result_DENIED,sc-filter-result_OBSERVED,sc-filter-result_PROXIED,x-exception-id_,x-exception-id_dns_server_failure,x-exception-id_dns_unresolved_hostname,x-exception-id_internal_error,x-exception-id_invalid_request,x-exception-id_policy_denied,x-exception-id_tcp_error,x-exception-id_unsupported_protocol
207252,-0.025716,-0.032412,0,-0.256696,-0.676385,0,1,0,1,0,0,0,0,0,0,0
62689,-0.022999,0.055723,0,0.139641,-0.676385,0,1,0,1,0,0,0,0,0,0,0
49644,-0.027427,-0.037853,0,-0.136072,-0.599147,0,1,0,1,0,0,0,0,0,0,0
110529,0.003722,-0.037802,0,-0.411784,-0.664720,0,1,0,1,0,0,0,0,0,0,0
206880,-0.025867,-0.037868,0,-0.084376,1.132234,0,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91243,-0.025967,-0.021583,0,-0.411784,-0.640446,0,1,0,1,0,0,0,0,0,0,0
37163,0.011169,-0.037821,0,-0.032679,-0.362075,0,1,0,1,0,0,0,0,0,0,0
188374,-0.015249,-0.037825,0,-0.032679,-0.647697,0,1,0,1,0,0,0,0,0,0,0
175560,-0.018067,-0.032989,0,1.449274,-0.614279,0,1,0,1,0,0,0,0,0,0,0


In [None]:

# Keep the rows whose 'cs-bytes' value is below 1, then plot it against 'final_sim'.
# NOTE(review): 'final_sim' is only created by compute_all_sims(), whose call is
# commented out in the cell that builds dfn1; confirm it was run, otherwise this
# lookup raises KeyError.
dfn2 = dfn1[dfn1['cs-bytes'] < 1]
plt.scatter(x=dfn2['final_sim'], y=dfn2['cs-bytes'])

In [None]:
# Narrow dfn2 further to rows whose 'sc-bytes' value is below 1 and plot it
# against 'final_sim'.
# NOTE(review): 'final_sim' is only created by compute_all_sims(), whose call is
# commented out in the cell that builds dfn1; confirm it was run first.
dfn2 = dfn2[dfn2['sc-bytes'] < 1]

plt.scatter(x=dfn2['final_sim'], y=dfn2['sc-bytes'])