#### Load libraries

In [1]:
# Decision tree Python code adapted from: https://www.datacamp.com/community/tutorials/decision-tree-classification-python

import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import tree
from scipy.io import arff
import re

  _nan_object_mask = _nan_object_array != _nan_object_array


#### Functions for converting tree from graphviz format to Weka output format

In [2]:
def isLeaf ( src_str ):
    """Tell whether a DOT line describes a leaf node.

    A line counts as a leaf when it starts with ``<digits> [label="`` and
    contains no literal backslash-n sequence anywhere in its text.

    Args:
        src_str: One line of DOT text.

    Returns:
        True if the line is a node declaration without a backslash-n
        sequence, False otherwise.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # processing; the non-raw form triggers invalid-escape warnings.
    declares_node = re.match(r'[0-9]+ \[label="', src_str) is not None
    has_line_break = re.search(r"\\n", src_str) is not None
    return declares_node and not has_line_break

def printClassName (src_str):
    """Print the label text of a leaf-node DOT line, without a trailing newline.

    The text printed is whatever sits between the first ``<digits> [label="``
    and the first ``"] ;`` in the line. Nothing is printed when either marker
    is missing or when the line contains a literal backslash-n sequence.

    Args:
        src_str: One line of DOT text.
    """
    # Raw strings avoid the invalid-escape warnings of the non-raw patterns.
    label_open = re.search(r'[0-9]+ \[label="', src_str)
    label_close = re.search(r'"\] ;', src_str)
    if label_open is None or label_close is None:
        return
    if re.search(r"\\n", src_str) is not None:
        return
    print(src_str[label_open.end():label_close.start()], end = "")
    
def printCondition ( src_str ):
    """Print the first label segment of a node DOT line, without a newline.

    The text printed runs from the end of the first ``<digits> [label="``
    match to the first literal backslash-n sequence in the line. Nothing is
    printed when either marker is missing.

    Args:
        src_str: One line of DOT text.
    """
    # Raw strings avoid the invalid-escape warning of the non-raw pattern.
    label_open = re.search(r'[0-9]+ \[label="', src_str)
    line_break = re.search(r"\\n", src_str)
    if label_open is not None and line_break is not None:
        print(src_str[label_open.end():line_break.start()], end = "")

def getNodeNum (src_str):
    """Extract the integer that a DOT line starts with.

    Args:
        src_str: One line of DOT text.

    Returns:
        The leading run of digits as an int, or an empty string when the line
        does not begin with a digit.
    """
    leading_digits = re.match("[0-9]+", src_str)
    return "" if leading_digits is None else int(leading_digits.group(0))

def getNextLineIndex (src_list, node_num):
    """Find the last line in ``src_list`` that starts with ``node_num``.

    Args:
        src_list: List of DOT text lines.
        node_num: Node number to look for; anything ``int()`` accepts.

    Returns:
        The index of the last line whose leading number (as reported by
        ``getNodeNum``) equals ``int(node_num)``, or -1 when no line matches
        or the list is empty.
    """
    # An empty list never reaches the conversion below, so an unconvertible
    # node_num still yields -1 in that case.
    if not src_list:
        return -1

    target = int(node_num)  # converted once instead of on every iteration
    for index in range(len(src_list) - 1, -1, -1):
        if getNodeNum(src_list[index]) == target:
            return index
    return -1

def isNodeInfo (src_str):
    """Tell whether a DOT line is a bare edge line.

    Args:
        src_str: One line of DOT text.

    Returns:
        True when the line starts with ``<digits> -> <digits>`` followed by
        one or more spaces and a semicolon, False otherwise.
    """
    edge_pattern = '[0-9]+ -> [0-9]+[ ]+;'
    return re.match(edge_pattern, src_str) is not None
def oppOperator (src_str):
    """Swap comparison operators so a condition reads as its opposite.

    Every ``<=`` is replaced by ``>`` first; then every ``>=`` in the result
    is replaced by ``<``.

    Args:
        src_str: Text containing the condition.

    Returns:
        The text with both replacements applied, in that order.
    """
    return src_str.replace("<=", ">").replace(">=", "<")

def formatTree ( line, indent ):
    """Recursively print a (sub)tree given as DOT lines in indented text form.

    Updates the module-level counters ``leaf_size`` and ``tree_size`` as
    nodes are printed; both must exist before the first call.

    Args:
        line: List of DOT lines for the subtree. A leading edge line is
            dropped; the next line is treated as the subtree's root node.
        indent: Nesting depth, i.e. how many "|   " prefixes are printed
            before each condition at this level.
    """
    global leaf_size, tree_size

    # A leading edge line carries no node of its own, so drop it.
    if isNodeInfo(line[0]):
        line = line[1:]

    head = line[0]

    # A leaf finishes the current output line with ": <class name>".
    if isLeaf(head):
        print(": ", end = "")
        printClassName(head)
        print("")
        leaf_size += 1
        tree_size += 1
        return

    # An inner node only terminates the current output line.
    print("")
    tree_size += 1

    # The last later line that starts with this node's number marks the
    # boundary between the two branches.
    split_at = getNextLineIndex(line[1:], getNodeNum(head))
    prefix = "|   " * indent

    # First branch: the node's own condition, then everything before the split.
    first_branch = line[1:split_at]
    if first_branch:
        print(prefix, end = "")
        printCondition(head)
        formatTree(first_branch, indent + 1)

    # Second branch: the opposite condition. The slice starts one line before
    # the split; presumably that extra line is an edge line, which the
    # recursive call drops.
    second_branch = line[split_at - 1:]
    if second_branch:
        print(prefix, end = "")
        printCondition(oppOperator(head))
        formatTree(second_branch, indent + 1)
        
def printTree (dot_tree):
    """Print a DOT-format decision tree as indented text, followed by its size.

    Each line first has its ``<digits>`` / ``[n, n, ...]`` statistics segment
    and its ``[labeldistance=..., labelangle=..., headlabel="..."]`` attribute
    list removed. The cleaned lines are then handed to ``formatTree`` and the
    leaf and node counts are printed.

    The module-level counters ``leaf_size`` and ``tree_size`` are reset to
    zero on every call, so repeated calls report the counts of the tree just
    printed rather than a running total.

    Args:
        dot_tree: The tree as a single DOT-format string.
    """
    global leaf_size, tree_size

    # Raw strings keep the regex escapes out of Python's string-escape
    # processing; the non-raw originals trigger invalid-escape warnings.
    stats_pattern = re.compile(r"[0-9]+\\n\[([0-9]+[,]?[ ]?)+\]\\n")
    edge_attr_pattern = re.compile(
        r'\[labeldistance=[0-9]+\.?[0-9]*, labelangle=-?[0-9]+, headlabel="(False|True)"\]'
    )

    new_tree = []

    # Preprocess the tree
    for line in dot_tree.split("\n"):
        stats = stats_pattern.search(line)
        if stats is not None:
            line = line[:stats.start()] + line[stats.end():]
        # Search the already-trimmed line so the offsets stay valid even if
        # both patterns were to match the same line.
        edge_attr = edge_attr_pattern.search(line)
        if edge_attr is not None:
            line = line[:edge_attr.start()] + line[edge_attr.end():]
        new_tree.append(line)

    # Start from clean counters so a second call does not accumulate.
    leaf_size = 0
    tree_size = 0

    # Print in Weka format.
    # NOTE(review): the [2:-1] slice assumes exactly two header lines and one
    # closing line in the DOT text; confirm against the scikit-learn version
    # in use, since the exporter's header layout may differ between versions.
    formatTree ( new_tree[2:-1], 0 )

    print(f"\nNumber of Leaves  : \t{leaf_size}")
    print(f"\nSize of the tree : \t{tree_size}")

SyntaxError: invalid syntax (<ipython-input-2-ab7a29bf7fd8>, line 98)

#### Load dataset from arff file

In [3]:
# Decision tree python corpus taken from: https://www.datacamp.com/community/tutorials/decision-tree-classification-python

# Read the ARFF file; only the first element of the returned pair is used below.
data = arff.loadarff('test.arff')  # <- Write desired file here
data_set = pd.DataFrame(data[0])
# The 'class' column is decoded from bytes to str so labels print cleanly.
data_set['class'] = data_set['class'].str.decode('ASCII')
col_names = list(data_set)

# Every column except the last is a feature; the last column holds the label.
feature_cols = col_names[:-1]
# Sorted rather than a bare list(set(...)): set iteration order is arbitrary,
# while export_graphviz expects class names in ascending order.
class_name = sorted(set(data_set[col_names[-1]]))

#### Shuffle and split the data into training (70%) and test (30%) sets, train the decision tree, and print its accuracy

In [4]:
# Decision tree python corpus taken from: https://www.datacamp.com/community/tutorials/decision-tree-classification-python

X = data_set[feature_cols] # Features
y = data_set[col_names[-1]] # Target variable

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test

# Create Decision Tree classifier object.
# random_state is fixed so the fitted tree is reproducible across runs; without
# it, ties between equally good splits may be broken differently each time.
clf = tree.DecisionTreeClassifier(random_state=1) # You can specify the max depth by passing argument for example: max_depth=3

# Train Decision Tree classifier
clf = clf.fit(X_train,y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("\nAccuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 1.0


#### Export the tree to Graphviz (DOT) format and print it in Weka-style text format

In [5]:
# Counters updated by formatTree while the tree is printed.
leaf_size = 0
tree_size = 0

# Class names are taken from the fitted classifier so they line up with the
# classes it actually learned, in the order the exporter expects.
dot_tree = tree.export_graphviz(
    clf,
    out_file=None,
    class_names=[str(label) for label in clf.classes_],
    label="none",
    impurity=False,
    feature_names=feature_cols,
)

printTree (dot_tree)


F2_ABS(VAR)_on_ACC_V <= 0.008
|   F1_ABS(MEAN)_on_ACC_V <= 1.025
|   |   F1_ABS(MEAN)_on_ACC_V <= 1.016: moving
|   |   F1_ABS(MEAN)_on_ACC_V > 1.016
|   |   |   F2_ABS(VAR)_on_ACC_V <= 0.002
|   |   |   |   F1_ABS(MEAN)_on_ACC_V <= 1.022: still
|   |   |   |   F1_ABS(MEAN)_on_ACC_V > 1.022: moving
|   |   |   F2_ABS(VAR)_on_ACC_V > 0.002: still
|   F1_ABS(MEAN)_on_ACC_V > 1.025: moving
F2_ABS(VAR)_on_ACC_V > 0.008
|   F2_ABS(VAR)_on_ACC_V <= 0.009
|   |   F1_ABS(MEAN)_on_ACC_V <= 1.032
|   |   |   F1_ABS(MEAN)_on_ACC_V <= 0.963: moving
|   |   |   F1_ABS(MEAN)_on_ACC_V > 0.963: still
|   |   F1_ABS(MEAN)_on_ACC_V > 1.032: moving
|   F2_ABS(VAR)_on_ACC_V > 0.009: moving

Number of Leaves  : 	9

Size of the tree : 	17
