In [156]:
import numpy as np
import pandas as pd 

#### Read raw dataset

In [157]:
# Load the raw dataset; the source file is semicolon-delimited.
df = pd.read_csv(filepath_or_buffer='./HiddenFraudulentURLs.csv', sep=';')

#### Remove URL duplicates

In [158]:
# Keep only the first row seen for each distinct url; later repeats are dropped.
df = df.drop_duplicates(subset=['url'])

In [159]:
df.shape

(139495, 8)

#### Remove redundant http

In [64]:
def remove_http(url):
    """Strip a leading ``http://`` or ``https://`` scheme from ``url``.

    Only a scheme at the very start of the string is removed. Occurrences
    further inside the URL (for example a redirect target embedded in a
    query string) are left untouched so the rest of the URL is not altered.

    Parameters
    ----------
    url : str
        The URL to clean. Non-string values (e.g. NaN for a missing url)
        are returned unchanged instead of raising ``TypeError``.

    Returns
    -------
    The URL without its leading scheme, or the input unchanged when it does
    not start with ``http://`` / ``https://`` or is not a string.
    """
    # Missing values reach this function as float NaN when it is applied
    # before the NaN-filling step; pass them through untouched.
    if not isinstance(url, str):
        return url
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            return url[len(scheme):]
    return url

In [65]:
# Clean every url in place by passing the function directly (no lambda wrapper needed).
df['url'] = df['url'].apply(remove_http)

#### Treat NaN values

In [160]:
# Replace every missing value, in every column, with the string 'Unknown'.
df = df.fillna(value='Unknown')

#### Write CSV

In [161]:
# Persist the cleaned frame as a comma-separated file; the row index is not written.
df.to_csv(path_or_buf='./dataset_no_dupli.csv', sep=',', index=False)

### Create train, valid and test set

In [200]:
# Reload the cleaned dataset written by the previous section.
df = pd.read_csv(filepath_or_buffer='./dataset_no_dupli.csv', sep=',')

In [201]:
# Rows whose isHiddenFraudulent flag equals False.
df_negative = df[df['isHiddenFraudulent'].eq(False)]

In [202]:
df_negative.shape

(130845, 8)

In [203]:
# Rows whose isHiddenFraudulent flag equals True.
df_positive = df[df['isHiddenFraudulent'].eq(True)]

In [204]:
df_positive.shape

(8650, 8)

In [205]:
# Seed NumPy's global RNG so the random split below is reproducible.
np.random.seed(seed=2)
rand = np.random.rand(len(df))  # one uniform draw per row of df

# Each row goes to exactly one split depending on where its draw falls:
# below 0.7 -> train, from 0.7 up to (not including) 0.85 -> eval, 0.85 and above -> test.
msk_tr = rand < 0.7
msk_ev = (rand >= 0.7) & (rand < 0.85)
msk_ts = rand >= 0.85

train_df, eval_df, test_df = df[msk_tr], df[msk_ev], df[msk_ts]

In [206]:
def show_info(df):
    """Print the row count of ``df`` and how many rows have ``isHiddenFraudulent == True``.

    df: a DataFrame that has an ``isHiddenFraudulent`` column.
    Returns nothing; the summary is written to standard output.
    """
    template = "Size of dataset : {} with {} True examples"
    fraud_rows = df[df['isHiddenFraudulent'] == True]  # element-wise comparison, not identity
    print(template.format(len(df), len(fraud_rows)))

In [207]:
# Report size and positive count for each split, in train / eval / test order.
for split_frame in (train_df, eval_df, test_df):
    show_info(split_frame)

Size of dataset : 97902 with 6084 True examples
Size of dataset : 20627 with 1262 True examples
Size of dataset : 20966 with 1304 True examples


In [208]:
# Separate the training rows by label so the positive rows can be oversampled later.
train_df_neg = train_df[train_df['isHiddenFraudulent'].eq(False)]
train_df_pos = train_df[train_df['isHiddenFraudulent'].eq(True)]

In [209]:
train_df_neg.shape[0]

91818

In [210]:
train_df_pos.shape[0]

6084

In [211]:
# Evaluated left to right: floor-divide the negative count by the positive count,
# then multiply by 2 and true-divide by 3, so the result is a float.
nb_repeat = (len(train_df_neg) // len(train_df_pos)) * 2 / 3
nb_repeat

10.0

In [212]:
# Oversample the positive class by repeating the positive training rows.
# The repeat count is derived from the class sizes (two thirds of the floored
# negative/positive ratio) instead of a hard-coded 10, so it stays correct if the
# input data changes. It is truncated to an int because it is used as a repeat
# count, and kept at 1 or more so pd.concat never receives an empty list.
nb_repeat = max(1, int(len(train_df_neg) // len(train_df_pos) * 2 / 3))
train_df_pos_rep = pd.concat([train_df_pos] * nb_repeat)
# Negatives first, then the repeated positives.
train_df = pd.concat([train_df_neg, train_df_pos_rep])

In [213]:
train_df.shape

(152658, 8)

In [214]:
# Shuffle the oversampled training rows and renumber the index from 0.
# A fixed random_state makes the resulting row order reproducible even when this
# cell is re-run on its own; without it the order depends on the current state of
# NumPy's global RNG, i.e. on which cells were executed before.
train_df = train_df.sample(frac=1, random_state=2).reset_index(drop=True)

In [215]:
# Number of training urls per label value.
train_df.groupby('isHiddenFraudulent')['url'].count()

isHiddenFraudulent
False    91818
True     60840
Name: url, dtype: int64

In [216]:
# Number of evaluation urls per label value.
eval_df.groupby('isHiddenFraudulent')['url'].count()

isHiddenFraudulent
False    19365
True      1262
Name: url, dtype: int64

In [217]:
# Number of test urls per label value.
test_df.groupby('isHiddenFraudulent')['url'].count()

isHiddenFraudulent
False    19662
True      1304
Name: url, dtype: int64

### Write datasets

In [219]:
# Write each split to its own comma-separated file, without the row index.
for split_frame, out_path in (
    (train_df, './train.csv'),
    (eval_df, './eval.csv'),
    (test_df, './test.csv'),
):
    split_frame.to_csv(out_path, sep=',', index=False)

### Write datasets for RNN (URL only)