# Libraries

In [32]:
import pandas as pd
import numpy as np
import datetime
import pandas as pd
import  numpy as np
from urllib.parse import urlparse
import httpagentparser

# Raise pandas' display limits: console width, number of columns and number
# of rows shown when a frame is printed.
pd.options.display.width = 400
pd.options.display.max_columns = 40
pd.options.display.max_rows = 2000

import matplotlib.pyplot as plt
#pd.options.mode.chained_assignment = None  # default='warn'


## Preprocessing tools and removing useless logs

In [33]:
def preprocessing(df):
    """Blank out "-" placeholders and keep rows with numeric byte counters.

    Every cell equal to "-" is replaced by "" directly in ``df`` (the
    caller's frame is modified). Rows whose ``cs-bytes`` or ``sc-bytes``
    value cannot be parsed as a number are then filtered out.

    Returns the filtered frame; surviving rows keep their index labels.
    """
    df.replace("-", "", inplace=True)
    # errors='coerce' turns anything unparseable (including "") into NaN.
    cs_is_number = pd.to_numeric(df['cs-bytes'], errors='coerce').notna()
    sc_is_number = pd.to_numeric(df['sc-bytes'], errors='coerce').notna()
    return df[cs_is_number & sc_is_number]

def remove_rows(data):
    """Drop rows that carry an exception id or that are mostly empty.

    Only rows whose ``x-exception-id`` is "" are kept. For each of them an
    emptiness counter is built:

    * +1 when ``cs-host`` is "",
    * +1 for every other column equal to "" (``cs-uri-path`` counts when it
      equals "/" instead). The referer-* columns and ``cs-host`` are left out
      of this pass.

    ``x-exception-id`` itself takes part in the pass and is always "" at
    that point, so every surviving row starts with a counter of at least 1.
    Rows whose counter reaches 6 are removed.

    Returns a new frame without the ``x-exception-id`` column; the caller's
    frame is not modified.
    """
    # Explicit copy: the counter column is written below, and writing into a
    # filtered slice triggers pandas' chained-assignment warning.
    data = data.loc[data['x-exception-id'] == ""].copy()

    col_name_skip = {'referer-protocol', 'referer-domain', 'referer-path', 'cs-host'}
    counted = [col for col in data.columns if col not in col_name_skip]

    data['empty'] = np.where(data['cs-host'] == "", 1.0, 0.0)
    for col in counted:
        blank = "/" if col == 'cs-uri-path' else ""
        data.loc[data[col] == blank, 'empty'] += 1

    data = data.loc[data['empty'] < 6]
    return data.drop(columns=['empty', 'x-exception-id'])




## functions

In [34]:
def strange_status(a):
    """Add a ``strange_status`` flag derived from ``sc-status``.

    ``sc-status`` is converted to numbers in place (values that cannot be
    parsed become NaN). The flag is 0 when the status is below 399 and 1
    otherwise, so NaN statuses are flagged as well.

    Returns the same frame ``a``.
    """
    status = pd.to_numeric(a['sc-status'], errors='coerce')
    a['sc-status'] = status
    # "not below 399" rather than ">= 399": the former is also true for NaN.
    a['strange_status'] = (~(status < 399)).astype(int)
    return a

def strange_port(data, tab_port, tab_method):
    """Add a ``strange_port`` score based on port and request method.

    The score is 0 when ``cs-uri-port`` is listed in ``tab_port``. For any
    other port it is 1, raised to 2 when ``cs-method`` is listed in
    ``tab_method``.

    Returns the same frame ``data``.
    """
    unlisted_port = ~data['cs-uri-port'].isin(tab_port)
    listed_method = data['cs-method'].isin(tab_method)
    data['strange_port'] = unlisted_port.astype(int) + (unlisted_port & listed_method).astype(int)
    return data



In [35]:
def parse_url(data, column_toparse, scheme=True, domain=True, path=True, params=False, query=False, fragment=False):
    """Split a URL column into ``referer-*`` component columns.

    Each value of ``data[column_toparse]`` is parsed with ``urlparse``. The
    six components are written to ``referer-protocol``, ``referer-domain``,
    ``referer-path``, ``referer-params``, ``referer-query`` and
    ``referer-fragment`` on ``data`` itself; the returned frame keeps only
    the components whose flag is true.

    Parameters
    ----------
    data : DataFrame holding the column to parse.
    column_toparse : name of the column containing the URLs.
    scheme, domain, path, params, query, fragment : which components to keep.

    Returns
    -------
    A new DataFrame with the selected component columns, or ``0`` (after
    printing a message) when ``column_toparse`` is not a column of ``data``.
    """
    if column_toparse not in data.columns:
        print("Column name not found")
        return 0

    names = ['referer-protocol', 'referer-domain', 'referer-path', 'referer-params', 'referer-query', 'referer-fragment']
    choices = [scheme, domain, path, params, query, fragment]

    parsed = data[column_toparse].map(urlparse)
    # Filled column by column rather than through zip(*parsed), which cannot
    # be unpacked into six targets when the frame has no rows.
    for position, name in enumerate(names):
        data[name] = [parts[position] for parts in parsed]

    # Computed unconditionally, so it is also defined when every flag is off.
    dropped = [name for name, wanted in zip(names, choices) if not wanted]
    return data.drop(columns=dropped)
def find_same_referer(data):
    """Mark rows whose host differs from the referer's domain.

    Sets ``changed`` to 1 where ``cs-host`` is not equal to
    ``referer-domain``; rows left without a value get 0.

    Returns the same frame ``data``.
    """
    host_differs = data['cs-host'].ne(data['referer-domain'])
    data.loc[host_differs, 'changed'] = 1
    data['changed'] = data['changed'].fillna(0)
    return data



In [42]:
def compute_url_size(df, poids):
    """Score each row by how unusually long its URL is.

    The URL size is ``len(str(cs-host)) + len(str(cs-uri-path))``. Sizes up
    to the 0.9 quantile score 0, sizes at or above the 0.99 quantile score
    ``poids``, and sizes in between are interpolated linearly. The score is
    stored in ``url_size``.

    When both quantiles coincide (for example when every URL has the same
    length) the interpolation would divide by zero; in that case only sizes
    strictly above the common threshold score ``poids`` and all others 0.

    Parameters
    ----------
    df : DataFrame with ``cs-host`` and ``cs-uri-path`` columns.
    poids : weight, i.e. the maximum score a row can receive.

    Returns
    -------
    The same frame ``df`` with the ``url_size`` column set.
    """
    host_len = df["cs-host"].map(lambda value: len(str(value)))
    path_len = df["cs-uri-path"].map(lambda value: len(str(value)))
    sizes = (host_len + path_len).astype(float)

    threshold1 = sizes.quantile(0.9)
    threshold2 = sizes.quantile(0.99)
    span = threshold2 - threshold1

    if span > 0:
        raw = poids * (sizes - threshold1) / span
        # Same as max(0, min(poids, raw)) applied element-wise.
        score = raw.clip(upper=poids).clip(lower=0)
    else:
        # Degenerate (or undefined) thresholds: no interpolation possible.
        score = (sizes > threshold2).astype(float) * poids

    df['url_size'] = score
    return df



def is_big_cs_bytes(df, quantile=0.9):
    """Flag rows whose ``cs-bytes`` reaches the given quantile.

    ``cs-bytes`` values are converted with ``int`` and compared with their
    own ``quantile``; ``bigcs`` is True where the value is greater than or
    equal to it.

    Returns the same frame ``df``.
    """
    sent = df["cs-bytes"].map(int)
    cutoff = sent.quantile(quantile)
    df["bigcs"] = sent >= cutoff
    return df



def extension_superior_than(df, size, quantile, tab_extension):
    """Score how unusual each row's URI extension is.

    Rules are applied in this order, later ones overriding earlier ones;
    the result is then doubled and stored in ``extension_strange``:

    1. 0 when ``cs-uri-extension-frequency`` is above its ``quantile``,
       1 otherwise;
    2. 0.5 when the extension is longer than ``size`` characters;
    3. 1 when the extension is listed in ``tab_extension``.

    The helper column ``len-extension`` (length of ``str(extension)``) is
    added to the frame as well.

    Returns the same frame ``df``.
    """
    value = df["cs-uri-extension-frequency"].quantile(quantile)

    # Created as float from the start: 0.5 is written below, and assigning a
    # fractional value into an integer column relies on implicit upcasting,
    # which pandas has deprecated.
    df["extension_strange"] = np.where(df["cs-uri-extension-frequency"] > value, 0.0, 1.0)
    df['len-extension'] = df['cs-uri-extension'].map(lambda ext: len(str(ext)))
    df.loc[df['len-extension'] > size, "extension_strange"] = 0.5
    df.loc[df['cs-uri-extension'].isin(tab_extension), "extension_strange"] = 1.0
    df["extension_strange"] = df["extension_strange"] * 2
    return df

def amount_people_by(df, columnname):
    """Add, for every row, the number of distinct clients sharing its value.

    The result is stored in a column named ``"people-by" + columnname``.

    NOTE(review): "people" is taken to mean distinct ``c-ip`` values, as in
    ``add_amount_people_by`` — confirm this is the intended measure. The
    previous body assigned the whole ``groupby(...).nunique()`` frame to a
    single column, which pandas cannot align with the rows of ``df``.

    Returns the same frame ``df`` (the function used to return nothing).
    """
    columnname_by = str("people-by") + str(columnname)
    # transform() broadcasts the per-group count back onto every row.
    df[columnname_by] = df.groupby(columnname)["c-ip"].transform("nunique")
    return df

def add_amount_people_by(df, columnname, quantile=0.9):
    """Flag rows whose ``columnname`` value is used by many distinct clients.

    For every value of ``columnname`` the number of distinct
    (``columnname``, ``c-ip``) pairs is counted. A row is flagged True in
    the column ``"people_by" + columnname`` when its count is at or above
    the ``quantile`` of the per-row counts.

    Rows whose ``columnname`` value is missing have no count and are flagged
    False.

    Returns the same frame ``df``.
    """
    columnname_by = str("people_by") + str(columnname)
    unique_pairs = df.drop_duplicates(subset=[columnname, "c-ip"])
    # size() counts rows per group; count().iloc[:, 0] would count only the
    # non-null cells of whichever column happens to come first.
    people_per_value = unique_pairs.groupby(columnname).size()

    people = df[columnname].map(people_per_value)
    value = people.quantile(quantile)
    # Vectorized comparison: a missing count compares as False instead of
    # raising on ``None >= value``.
    df[columnname_by] = people >= value

    return df

def add_frequency(df, columnname):
    """Add the number of rows sharing each row's ``columnname`` value.

    The count is stored in a column named ``columnname + "-frequency"``.
    Rows whose ``columnname`` value is missing get NaN.

    Returns the same frame ``df``.
    """
    columnnamefreq = str(columnname) + str("-frequency")
    # value_counts() counts rows per value directly. The earlier
    # groupby().count().iloc[:, 0] counted the non-null cells of the first
    # other column, which under-counts when that column has gaps and fails
    # when there is no other column at all.
    occurrences = df[columnname].value_counts()
    df[columnnamefreq] = df[columnname].map(occurrences)
    return df



In [47]:
def compute_useragent(data, poids):
    """Score user agents whose OS or browser could not be identified.

    ``httpagentparser.simple_detect`` is run on every ``cs(User-Agent)``
    value and its result kept in ``more_info``. ``fishy_os`` is ``poids``
    when that result contains 'Unknown OS' and 0 otherwise;
    ``fishy_browser`` does the same for 'Unknown Browser'.

    Returns the same frame ``data``.
    """
    data['more_info'] = data['cs(User-Agent)'].apply(httpagentparser.simple_detect)
    markers = (('fishy_os', 'Unknown OS'), ('fishy_browser', 'Unknown Browser'))
    for flag_column, marker in markers:
        # marker is bound as a default so each lambda keeps its own value.
        unknown = data['more_info'].apply(lambda info, marker=marker: marker in info)
        data[flag_column] = unknown.astype(int) * poids
    return data

## Loading data

In [38]:
# Column names applied to the log file, in order (the file is read without a
# header row of its own).
headerString = 'date time time-taken c-ip cs-username cs-auth-group x-exception-id sc-filter-result cs-categories cs(Referer) sc-status s-action cs-method rs(Content-Type) cs-uri-scheme cs-host cs-uri-port cs-uri-path cs-uri-query cs-uri-extension cs(User-Agent) s-ip sc-bytes cs-bytes x-virus-id'
header = headerString.split(" ")
# The delimiter is a regular expression, so it is written as a raw string:
# in a normal string literal "\s" is an invalid escape sequence, which Python
# reports with a warning. Lines starting with "#" are skipped as comments.
data = pd.read_csv('./Data/message.txt', delimiter=r"\s+", index_col=False, encoding="utf-8", comment="#", names=header)

# Columns not used by the scoring steps below.
data = data.drop(columns=['cs-username', 'cs-auth-group', 'x-virus-id', 'time-taken', 'date', 'time'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [39]:
# Ports treated as ordinary, listed both as numbers and as strings so the
# membership test matches either representation.
tab_port = [80, 443, "80", "443"]
# Request methods that raise the score of a request sent to an unlisted port.
tab_method = ["GET", "POST", "HEAD", "OPTIONS", "PUT", "CONNECT"]

# Clean the raw log, then drop exception rows and mostly-empty rows.
data = remove_rows(preprocessing(data))

# Split the referer into components, then score the port/method pair.
data = strange_port(parse_url(data, 'cs(Referer)'), tab_port, tab_method)

In [43]:
# Per-row indicators: host/referer mismatch, large request size, status not
# below 399, and URL length (weighted 2).
data = find_same_referer(data)
# Called once: is_big_cs_bytes rebuilds "bigcs" from "cs-bytes" every time,
# so the second, identical call that used to follow only repeated the work.
data = is_big_cs_bytes(data)
data = strange_status(data)
data = compute_url_size(data, 2)


In [44]:
# Extensions that always receive the full extension score.
tab_extension = ["dll", "zip", "rar", "bin", "exe"]

# Hosts contacted by many distinct clients, then extension frequency, which
# the extension score below reads from "cs-uri-extension-frequency".
data = add_amount_people_by(data, "cs-host")
data = add_frequency(data, "cs-uri-extension")
data = extension_superior_than(data, size=20, quantile=0.004, tab_extension=tab_extension)

In [None]:
# Unidentified OS / browser in the user agent, each weighted 2.
data = compute_useragent(data, poids=2)

## GETTING SCORE

In [None]:


# Indicator columns that make up the final score; boolean columns count as
# 0/1 in the sum.
score_cols = [
    'url_size',
    'bigcs',
    'strange_status',
    'strange_port',
    'people_bycs-host',
    'changed',
    'extension_strange',
    'fishy_os',
    'fishy_browser',
]
data["sum"] = data.loc[:, score_cols].sum(axis=1)
# Highest scores first.
data = data.sort_values("sum", ascending=False)
# The result is not assigned: this only produces a renumbered view as the
# cell's last expression, `data` keeps its original index labels.
data.reset_index()

## USER AGENT

## OLD VERSION

In [None]:
#### OLD PART ####
def make_dummies(df, string):
    """Replace column ``string`` by its one-hot indicator columns.

    The indicator columns are named ``<string>_<value>`` and appended after
    the existing columns. A new frame is returned; ``df`` is left unchanged.
    """
    indicators = pd.get_dummies(df[string], prefix=string)
    widened = pd.concat([df, indicators], axis=1)
    return widened.drop(columns=[string])

def compute_similarity(string1, stand_val):
    """Return the positions at which ``string1`` equals ``stand_val``.

    ``string1`` is compared element-wise with ``stand_val``; the indices
    along the first axis where the comparison holds are returned as an
    array.
    """
    hits = np.nonzero(string1 == stand_val)
    return hits[0]

def apply_sim(data, column_name, one_line_df, new_col_name):
    """Mark the rows of ``data`` that match a reference row on one column.

    ``data[new_col_name]`` is set to 1 where ``data[column_name]`` equals
    the first value of ``one_line_df[column_name]`` and to 0 elsewhere.

    Returns ``data``, or ``0`` (after printing a message) when
    ``column_name`` is not a column of ``data``.
    """
    if column_name not in data.columns:
        print("Wrong column name")
        return 0

    reference = one_line_df[column_name].values[0]
    # Compare row by row. The matches used to be collected as positions and
    # then looked up among the index labels, which only agrees with the
    # comparison when the index is the default 0..n-1 range.
    data[new_col_name] = (data[column_name] == reference).astype(int)
    return data

def compute_all_sims(data, y):
    """Accumulate, per row, on how many columns it matches reference ``y``.

    For each compared column ``apply_sim`` writes a 0/1 match flag into
    ``data['sim']``; the flags are added up in ``data['final_sim']``. If
    ``final_sim`` already exists, the new flags are added to it. ``sim`` is
    left holding the flags of the last column processed.

    Columns absent from ``data`` are skipped (``apply_sim`` prints its
    message for them). Nothing is returned; ``data`` is modified in place.
    """
    all_cols = ['x-exception-id', 'cs-categories',
                'sc-status', 's-action', 'cs-uri-scheme', 'cs-uri-port']
    for col in all_cols:
        outcome = apply_sim(data, col, y, 'sim')
        if not isinstance(outcome, pd.DataFrame):
            # apply_sim returned its 0 marker and left 'sim' untouched:
            # adding it now would count the previous column twice, or fail
            # if 'sim' does not exist yet.
            continue
        if 'final_sim' not in data.columns:
            data['final_sim'] = data['sim']
        else:
            data['final_sim'] = data['final_sim'] + data['sim']


### adding the columns ###
'''
y=a.tail(1)
a=make_dummies(a,'sc-filter-result')
a = add_url_size(a)
a = add_frequency(a,'cs-host')
compute_all_sims(a,y)
'''

def normalization_zero_one(df, list_column):
    """Rescale the given columns to the [0, 1] range (min-max scaling).

    Each column in ``list_column`` is replaced by
    ``(x - min) / (max - min)``. A column whose values are all equal has a
    zero range; it is set to 0.0 instead of becoming NaN through 0/0.

    Returns the same frame ``df``.
    """
    for column in list_column:
        lowest = df[column].min()
        span = df[column].max() - lowest
        shifted = df[column] - lowest
        # shifted * 0.0 keeps missing values missing while zeroing the rest.
        df[column] = shifted / span if span != 0 else shifted * 0.0
    return df

def normalization_gauss(df, list_column):
    """Standardize the given columns to zero mean and unit deviation.

    Each column in ``list_column`` is replaced by ``(x - mean) / std``,
    using pandas' default standard deviation. When the deviation is zero or
    undefined (constant column, single row) the column is set to 0.0
    instead of becoming NaN.

    Returns the same frame ``df``.
    """
    for column in list_column:
        centered = df[column] - df[column].mean()
        spread = df[column].std()
        if pd.isna(spread) or spread == 0:
            # centered * 0.0 keeps missing values missing while zeroing the
            # rest.
            df[column] = centered * 0.0
        else:
            df[column] = centered / spread
    return df