# Data mining

Main notebook for the data mining assignment.

The first dataset used as a test in the buildign process is a reduced version of [Bitcoin Heist Ransomware](https://archive.ics.uci.edu/ml/datasets/BitcoinHeistRansomwareAddressDataset#) with the first 10k lines.

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import numpy as np
import collections
import random
# Visualization
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix



### Read file as dataset

In [123]:
INPUT_FILE = '/datasets/Nursery/nursery.csv'

In [124]:
col_names = [
    'parents',
    'has_nurs',
    'form',
    'children',
    'housing',
    'finance',
    'social',
    'health',
    'classification'
]
feature_names = col_names[:-1]

In [125]:
dataset = pd.read_csv(INPUT_FILE, names=col_names)
LENGTH = len(dataset)

In [126]:
dataset.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,classification
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [127]:
oh_feature_names = pd.get_dummies(dataset[feature_names]).columns.values

### Build a sparse matrix of one-hot encoded dataset

In [128]:
encoder = preprocessing.OneHotEncoder()
encoder.fit(dataset[feature_names])
FEATURES = encoder.transform(dataset[feature_names]).toarray()

In [129]:
label_encoder = preprocessing.LabelEncoder().fit(dataset['classification'])

In [130]:
LABELS = label_encoder.transform(dataset['classification'])

In [131]:
LABELS[1:100]

array([1, 0, 2, 1, 0, 1, 1, 0, 4, 1, 0, 4, 1, 0, 1, 1, 0, 4, 1, 0, 4, 1,
       0, 1, 1, 0, 4, 1, 0, 4, 1, 0, 1, 1, 0, 4, 1, 0, 4, 1, 0, 1, 1, 0,
       4, 1, 0, 4, 1, 0, 1, 1, 0, 4, 1, 0, 4, 1, 0, 1, 1, 0, 4, 1, 0, 4,
       1, 0, 1, 1, 0, 4, 1, 0, 4, 1, 0, 1, 1, 0, 4, 1, 0, 4, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1])

In [132]:
# Convert the Positive values 
# 'not_recom' => false
# everything else =>true
LABELS = np.array([0 if x == 1 else 1 for x in LABELS])

In [133]:
a = [0, 0]
for x in LABELS:
    a[x] += 1
a

[4266, 8694]

### Build and train example classifier

In [134]:
dt_classifier = DecisionTreeClassifier(
    criterion='entropy',
    max_depth = 8,
    min_samples_split = 10
)

In [135]:
training_set = np.array([])

for i in range(LENGTH):
    training_set = np.append(training_set, random.sample(range(0,LENGTH), 1))
    
training_set = [int(x) for x in np.unique([training_set])]

In [136]:
dt_classifier.fit(FEATURES[training_set], LABELS[training_set])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [137]:
validation_set = np.array([])
for i in range(0, LENGTH):
    if i not in training_set:
        validation_set = np.append(validation_set, i)

validation_set = [int(x) for x in validation_set]

In [138]:
predictions = np.array([])
for i in range(0, len(validation_set)):
    feature = [FEATURES[int(validation_set[i])]]
    predictions = np.append(predictions, dt_classifier.predict(feature))

In [139]:
a = [0, 0]
for x in predictions:
    a[int(x)] += 1
a

[1573, 3218]

In [140]:
a[0] / sum(a)

0.32832394072218746

### Print tree

In [141]:
def print_tree(dt):
    dot_data = StringIO()
    export_graphviz(dt, out_file=dot_data,  
                    filled=True, rounded=True,
                    special_characters=True, feature_names = oh_feature_names, class_names=["not recommended", "recommended"])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    graph.write_png('dt.png')
    Image(graph.create_png())

In [142]:
print_tree(dt_classifier)

![](dt.png)

In [143]:
### Confusion matrix

In [144]:
confusion_matrix(LABELS[[int(v) for v in validation_set]], predictions, normalize='true')

array([[0.89014438, 0.10985562],
       [0.04846779, 0.95153221]])