In [3]:
import pandas as pd 

#### Read raw dataset

In [4]:
df = pd.read_csv('./HiddenFraudulentURLs.csv', sep=';')

#### Remove url duplicates

In [5]:
df_dupliless = df.loc[~df.duplicated(subset=['url'])]

In [6]:
df_dupliless.shape

(139495, 8)

#### Remove redundant http(s) in the urls 

In [9]:
def remove_http(url):
    http = 'http://'
    https = 'https://'
    if http in url:
        url = url.replace(http, '')
    if https in url:
        url = url.replace(https, '')
    return url

In [10]:
df_dupliless['url'] = df_dupliless['url'].apply(lambda x: remove_http(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Sort url by length

In [None]:
df_dupliless.groupby(['contentType']).isHiddenFraudulent.sum().reset_index().sort_values(by='isHiddenFraudulent', ascending=False)

In [None]:
sorted(list(df_dupliless.loc[df_dupliless.isHiddenFraudulent].url), key=lambda x: len(x))

In [117]:
df_dupliless.to_csv('./dataset_no_dupli_no_http.csv', sep=';', index=False)

In [109]:
df_dupliless.loc[~df_dupliless.poweredBy.isnull()].groupby(['isHiddenFraudulent']).url.count().reset_index()

Unnamed: 0,isHiddenFraudulent,url
0,False,65461
1,True,3006


#### Treat serverType nan

In [12]:
df = pd.read_csv('./dataset_no_dupli_no_http.csv', sep=',')

In [19]:
df.serverType = df.serverType.fillna('Unknown')

In [27]:
df.serverType.isnull().unique()

array([False])

In [18]:
df.to_csv('./dataset_no_dupli_no_http_only_url.csv', sep=',', index=False)

In [15]:
df = df.drop(['compromissionType', 'contentLength','poweredBy','contentType','lastModified','serverType'], axis=1)

In [62]:
df_bru = pd.read_csv('./HiddenFraudulentURLs.csv', sep=';')

In [81]:
df_bru.loc[25918:25925]

Unnamed: 0,url,compromissionType,isHiddenFraudulent,contentLength,serverType,poweredBy,contentType,lastModified
25918,http://www.fridelab.com.br/noticias/transplant...,defacement,False,0,Apache,,text/html; charset=utf-8,"Wed, 30 Jan 2013 19:15:14 GMT"
25919,"http://www.fridelab.com.br/component/option,co...",defacement,False,0,Apache,,text/html; charset=utf-8,"Wed, 30 Jan 2013 19:15:15 GMT"
25920,http://www.fridelab.com.br/noticias/transplant...,defacement,False,0,Apache,,text/html; charset=iso-8859-1,
25921,http://www.fridelab.com.br/noticias/especialis...,defacement,False,0,Apache,,text/html; charset=utf-8,"Wed, 30 Jan 2013 19:15:17 GMT"
25922,"http://www.fridelab.com.br/component/option,co...",defacement,False,0,Apache,,text/html; charset=utf-8,"Wed, 30 Jan 2013 19:15:18 GMT"
25923,http://www.fridelab.com.br/noticias/especialis...,defacement,False,0,Apache,,text/html; charset=iso-8859-1,
25924,http://www.fridelab.com.br/noticias/antibiotic...,defacement,False,0,Apache,,text/html; charset=utf-8,"Wed, 30 Jan 2013 19:15:19 GMT"
25925,"http://www.fridelab.com.br/component/option,co...",defacement,False,0,Apache,,text/html; charset=utf-8,"Wed, 30 Jan 2013 19:15:20 GMT"


In [84]:
df_bru.loc[25921].url

'http://www.fridelab.com.br/noticias/especialistas-alertam-para-aumento-global-de-diabetes-infantil/imprimir'

In [83]:
df_bru.loc[25922].url

'http://www.fridelab.com.br/component/option,com_mailto/link,aHR0cDovL3d3dy5mcmlkZWxhYi5jb20uYnIvbm90aWNpYXMvZXNwZWNpYWxpc3Rhcy1hbGVydGFtLXBhcmEtYXVtZW50by1nbG9iYWwtZGUtZGlhYmV0ZXMtaW5mYW50aWw=/tmpl,component/index.html'

### Create train, valid and test set

In [170]:
df = pd.read_csv('./dataset_no_dupli_no_http.csv', sep=',')

In [149]:
df_negative = df.loc[df.isHiddenFraudulent==False]

In [150]:
df_negative.shape

(130845, 8)

In [151]:
df_positive = df.loc[df.isHiddenFraudulent==True]

In [152]:
df_positive.shape

(8650, 8)

In [153]:
df_positive_rep = pd.concat([df_positive for i in range(15)])

In [154]:
df_positive_rep.shape

(129750, 8)

In [155]:
df_full = pd.concat([df_negative, df_positive_rep])

In [177]:
df_full = df_full.sample(frac=1).reset_index(drop=True)

In [179]:
np.random.seed(seed=1) #makes split reproducible
rand = np.random.rand(len(df_full))
msk_Tr = rand < 0.7
msk_test = (0.7 <= rand) & (rand < 0.85)
msk_V = rand >= 0.85
train_df = df_full[msk_Tr]
eval_df = df_full[msk_V]
test_df = df_full[msk_test]

In [159]:
train_df.shape

(182332, 8)

In [120]:
eval_df.shape

(38967, 8)

In [121]:
test_df.shape

(39296, 8)

In [166]:
eval_df.groupby(['isHiddenFraudulent']).url.count()

isHiddenFraudulent
False    19755
True     19212
Name: url, dtype: int64

In [167]:
eval_df.groupby(['isHiddenFraudulent']).url.count()

isHiddenFraudulent
False    19755
True     19212
Name: url, dtype: int64

In [168]:
train_df.groupby(['isHiddenFraudulent']).url.count()

isHiddenFraudulent
False    91513
True     90819
Name: url, dtype: int64

In [180]:
train_df.to_csv('./train.csv', sep=',', index=False)
eval_df.to_csv('./eval.csv', sep=',', index=False)
test_df.to_csv('./test.csv', sep=',', index=False)

In [102]:
df_dupliless.loc[df_dupliless.url.str.contains('.txt')].groupby(['isHiddenFraudulent']).url.count()

isHiddenFraudulent
False      6
True     668
Name: url, dtype: int64