In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.tree import ExtraTreeClassifier, ExtraTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.datasets import fetch_mldata

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [322]:
df = pd.DataFrame.from_csv("datasets/malicious_websites/dataset.csv", index_col=None)
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,iso-8859-1,nginx,263.0,,,10/10/2015 18:21,,...,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,UTF-8,Apache/2.4.10,15087.0,,,,,...,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,324.0,,,,,...,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,ISO-8859-1,nginx,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,...,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,UTF-8,,124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,...,2,5,4278,61,62,129889,4586,61,4.0,0


In [323]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1781 entries, 0 to 1780
Data columns (total 21 columns):
URL                          1781 non-null object
URL_LENGTH                   1781 non-null int64
NUMBER_SPECIAL_CHARACTERS    1781 non-null int64
CHARSET                      1781 non-null object
SERVER                       1780 non-null object
CONTENT_LENGTH               969 non-null float64
WHOIS_COUNTRY                1781 non-null object
WHOIS_STATEPRO               1781 non-null object
WHOIS_REGDATE                1781 non-null object
WHOIS_UPDATED_DATE           1781 non-null object
TCP_CONVERSATION_EXCHANGE    1781 non-null int64
DIST_REMOTE_TCP_PORT         1781 non-null int64
REMOTE_IPS                   1781 non-null int64
APP_BYTES                    1781 non-null int64
SOURCE_APP_PACKETS           1781 non-null int64
REMOTE_APP_PACKETS           1781 non-null int64
SOURCE_APP_BYTES             1781 non-null int64
REMOTE_APP_BYTES             1781 non-null int64
APP

In [324]:
df.fillna(method='backfill', inplace=True)

In [325]:
for col in df.columns:
    df[col] = pd.Categorical(df[col], categories=df[col].unique()).codes

In [548]:
X = df.drop(columns=['Type'])
y = df['Type']
X, y = X.values, y.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=42, stratify=y, shuffle=True)

In [471]:
models = {'RandomForestClassifier' : 
          RandomForestClassifier(max_depth=5, max_features=10, n_estimators=10),
         'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=4, max_features=10),
         'ExtraTreeClassifier': ExtraTreeClassifier(max_depth=4, max_features=10)}

In [472]:
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(name, 'accuracy score:', accuracy_score(preds, y_test))

RandomForestClassifier accuracy score: 0.9495798319327731
DecisionTreeClassifier accuracy score: 0.9327731092436975
ExtraTreeClassifier accuracy score: 0.9019607843137255


In [473]:
mlp_clf = MLPClassifier(solver='lbfgs', alpha=1e-3, hidden_layer_sizes=(50), random_state=42)
mlp_clf.fit(X_train, y_train)
res = mlp_clf.predict(X_train)
print(accuracy_score(mlp_clf.predict(X_test), y_test))

0.8739495798319328


In [9]:
from sklearn.datasets import make_moons

In [20]:
X, y = make_moons(n_samples=2000, noise=0.2, shuffle=True, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=42, stratify=y, shuffle=True)

In [38]:
print(X)

[[ 1.64703896  0.3088347 ]
 [ 0.3510386   0.9650405 ]
 [ 0.49061588 -0.27061724]
 ...
 [-0.96701302  0.63271301]
 [ 0.80593336 -0.76011338]
 [ 0.22621047  0.07307511]]


In [37]:
print(X[:, 0])

[ 1.64703896  0.3510386   0.49061588 ... -0.96701302  0.80593336
  0.22621047]


In [71]:
import mlp_dt

In [82]:
dtc = mlp_dt.DecisionTreeClassifier(depth=5)

In [83]:
dtc.fit(X_train, y_train, threshold=0.5, split_ratio=0.95)

(<mlp_dt.DecisionTreeClassifier at 0x1a15583240>,
 <mlp_dt.DecisionTreeClassifier at 0x1a155832e8>)

In [84]:
#dt.print_tree()

In [85]:
print("Train accuracy:", accuracy_score(dtc.predict(X_train), y_train))
print("Test accuracy:", accuracy_score(dtc.predict(X_test), y_test))

Train accuracy: 0.9775
Test accuracy: 0.9625


In [86]:
import tensorflow as ts
from tensorflow.examples.tutorials.mnist import input_data

In [87]:
mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
X_train = np.vstack([img.reshape(-1,) for img in mnist.train.images])
y_train = mnist.train.labels
X_test = np.vstack([img.reshape(-1,) for img in mnist.test.images])
y_test = mnist.test.labels

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


In [None]:
mlp_clf = MLPClassifier(solver='adam', alpha=1e-3, hidden_layer_sizes=(100), random_state=42)
mlp_clf.fit(X_train, y_train)

In [23]:
print("Test accuracy:", accuracy_score(mlp_clf.predict(X_test), y_test))

0.9781


In [32]:
dtc = mlp_dt.DecisionTreeClassifier(depth=1)

In [33]:
dtc.fit(X_train, y_train, threshold=0.5, split_ratio=0.5)

(<mlp_dt.DecisionTreeClassifier at 0x1c3c8c7e10>,
 <mlp_dt.DecisionTreeClassifier at 0x1c3ccdf048>)

In [34]:
print(accuracy_score(dtc.predict(X_test), y_test))

0.9538
