# Defense framework against adversarial attack

In [2]:
import os
import os.path
import pandas as pd
import numpy as np
import random
import scipy
from scipy.sparse import vstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Load datasets

In [81]:
# Reading data from csv file
data = pd.read_csv("FinalDataIX.csv")
data.tail()

Unnamed: 0,url,Label
99995,http://allgxrltogaparty.co.uk/gallery2/maxn.ph...,4
99996,http://3401.e-prxntphoto.co.uk/thxsxsessex/xnd...,4
99997,http://acard4u.co.uk/product_xnfo.php?cPath=10...,4
99998,http://aboutscotland.co.uk/ecosse/portessxe/xn...,4
99999,http://allxanceleague.co.uk/cgx-bxn/phpBB2/vxe...,4


Two colums are in the dataset. First column contains all urls and the second column is the label of these urls: 0 represents benign URLs, 1 represents defacement sites URLs, 2 represents malware URLs, 3 represents phishing URLs and 4 are spam URLs.

The first half of datasets are inital URL datasets. The remaining half are datasets after adversarial attack.

In [82]:
# Labels
y = data["Label"]

# Features
url_list = data["url"]

url_list.head()

0    http://1337x.to/torrent/1048648/American-Snipe...
1    http://1337x.to/torrent/1110018/Blackhat-2015-...
2    http://1337x.to/torrent/1122940/Blackhat-2015-...
3    http://1337x.to/torrent/1124395/Fast-and-Furio...
4    http://1337x.to/torrent/1145504/Avengers-Age-o...
Name: url, dtype: object

TF-IDF is used to tokenize our URLs. it is used by detecting the frequency of a specific word in the artcle. By applying TF-IDF to our URLs, we can extract information and feed the data we get into our model.

In [83]:
# Using Tokenizer
vectorizer = TfidfVectorizer()

# Store vectors into X variable as Our XFeatures
X = vectorizer.fit_transform(url_list)

We need to have the first half of our dataset to train our initial model and verify its correctness.So the first 50000 URLs and its labels are stored in X_initial and y_initial separately.

In [84]:
X_initial = X[0]
for i in range(49999):
    ele = X[i + 1]
    X_initial = vstack((X_initial, ele))

In [85]:
y_initial = []
for i in range(50000):
    y_initial.append(y[i])

Split test and train datasets.

In [86]:
# Split into training and testing dataset 80:20 ratio
X_initial_train, X_initial_test, y_initial_train, y_initial_test = train_test_split(X_initial, y_initial, test_size=0.1, random_state=42)

Train the initial model.

In [87]:
# Model Building using logistic regression
logit_initial = LogisticRegression()
logit_initial.fit(X_initial_train, y_initial_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Test the accuracy of initial model.

In [88]:
# Accuracy of Our Model
print("Accuracy of our model is: ",logit_initial.score(X_initial_test, y_initial_test))

Accuracy of our model is:  0.99


Get the dataset after applying adversarial attack and store its url in X_attack.

In [89]:
X_attack = X[50000]
for i in range(50000,99999):
    ele = X[i + 1]
    X_attack = vstack((X_attack, ele))

In [90]:
y_attack = y_initial

Show the accuracy of our model after attack. The confidence should be less than 90% and otherwise it is not a successful attack.

In [91]:
print("Accuracy of our model is: ",logit_initial.score(X_attack, y_attack))

Accuracy of our model is:  0.9127


## Reverse URL Method

Defines the method to reverse the URL.

In [92]:
def reverseString(s):
 if s == "":
     return s
 else:
     return reverseString(s[1:])+s[0]

Combins the reversed url with initial one.

In [93]:
url_list_reverse = []
for i in range(len(url_list)):
    url_list_reverse.append(url_list[i] + reverseString(url_list[i]))

Tokenize URLs, extract attacking datasets.

In [94]:
# Using Tokenizer
vectorizer = TfidfVectorizer()

# Store vectors into X variable as Our XFeatures
X_reverse = vectorizer.fit_transform(url_list_reverse)

In [95]:
X_reverse_ini = X_reverse[0]
for i in range(49999):
    ele = X_reverse[i + 1]
    X_reverse_ini = vstack((X_reverse_ini, ele))

In [100]:
y_reverse_ini = y_initial

In [97]:
X_reverse_attack = X_reverse[50000]
for i in range(50000,99999):
    ele = X_reverse[i + 1]
    X_reverse_attack = vstack((X_reverse_attack, ele))

In [101]:
y_reverse_attack = y_initial

Train the model.

In [103]:
# Model Building using logistic regression
logit_reverse = LogisticRegression()
logit_reverse.fit(X_reverse_ini, y_reverse_ini)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Show the accuracy under weak defense.

In [104]:
print("Accuracy of our model is: ",logit_reverse.score(X_reverse_attack, y_reverse_attack))

Accuracy of our model is:  0.93936


## Information extraction

Extract key information and trim irrelevant information.

In [105]:
url_list_key = []
for i in range(len(url_list)):
    url_list_key.append(url_list[i].replace("http://",'').replace("https://",''))

Tokenize the dataset

In [106]:
# Using Tokenizer
vectorizer = TfidfVectorizer()

# Store vectors into X variable as Our XFeatures
X_key = vectorizer.fit_transform(url_list_key)

In [107]:
X_key_ini = X_key[0]
for i in range(49999):
    ele = X_key[i + 1]
    X_key_ini = vstack((X_key_ini, ele))

In [108]:
y_key_ini = y_initial

In [109]:
X_key_attack = X_key[50000]
for i in range(50000,99999):
    ele = X_key[i + 1]
    X_key_attack = vstack((X_key_attack, ele))

In [110]:
y_key_attack = y_initial

Train the model.

In [112]:
# Model Building using logistic regression
logit_key = LogisticRegression()
logit_key.fit(X_key_ini, y_key_ini)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Show accuracy after extracting key information.

In [114]:
print("Accuracy of our model is: ",logit_key.score(X_key_attack, y_key_attack))

Accuracy of our model is:  0.91092


## Assemble weak defenses

In [93]:
print("InfoEx",logit.predict(X_key_a[100000]))
      
print("Reverse",logit.predict(X_key_a[100000]))

InfoEx [1]
Reverse [1]
