In [1]:
!pip install numpy
!pip install python-javabridge
!pip install python-weka-wrapper3



### Now we will download the datasets and check that they match their descriptions in the paper

In [9]:
# check the correctness of datasets 

def check_correctness(file, attributes,instances,classes):
  """Load an ARFF file and verify its dimensions against expected values.

  The last column is treated as the class attribute. The three measured
  values are printed first and then asserted, so the actual numbers are
  visible even when a check fails.

  Args:
    file: path (or file-like object) accepted by ``scipy.io.arff.loadarff``.
    attributes: expected number of attributes, excluding the class column.
    instances: expected number of rows.
    classes: expected number of distinct values observed in the class column.

  Raises:
    AssertionError: if any measured value differs from the expected one.
  """
  from scipy.io import arff
  import pandas as pd

  records, _meta = arff.loadarff(file)
  df = pd.DataFrame(records)

  # Measure once and reuse for both the report and the checks.
  n_instances, n_columns = df.shape
  n_attributes = n_columns - 1
  # Counts the class values that actually occur in the data, not the ones
  # declared in the ARFF header.
  n_classes = len(set(df.iloc[:, -1]))

  print('instances: ',n_instances)
  print('attributes: ',n_attributes)
  print('classes: ',n_classes)
  assert n_instances==instances,"instances number does not match"
  assert n_columns==attributes+1,"attributes number does not match"
  assert n_classes==classes,"number of classes does not match"

In [10]:
# glass dataset 
!wget https://storm.cis.fordham.edu/~gweiss/data-mining/weka-data/glass.arff
check_correctness('glass.arff',9,214,6)

--2021-12-23 17:09:05--  https://storm.cis.fordham.edu/~gweiss/data-mining/weka-data/glass.arff
Resolving storm.cis.fordham.edu (storm.cis.fordham.edu)... 150.108.68.26
Connecting to storm.cis.fordham.edu (storm.cis.fordham.edu)|150.108.68.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17823 (17K) [text/plain]
Saving to: ‘glass.arff.12’


2021-12-23 17:09:06 (92.3 KB/s) - ‘glass.arff.12’ saved [17823/17823]

instances:  214
attributes:  9
classes:  6


In [178]:
# diabetes dataset
!wget https://storm.cis.fordham.edu/~gweiss/data-mining/weka-data/diabetes.arff 
check_correctness('diabetes.arff',8,768,2)

--2021-12-23 16:34:54--  https://storm.cis.fordham.edu/~gweiss/data-mining/weka-data/diabetes.arff
Resolving storm.cis.fordham.edu (storm.cis.fordham.edu)... 150.108.68.26
Connecting to storm.cis.fordham.edu (storm.cis.fordham.edu)|150.108.68.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 37419 (37K) [text/plain]
Saving to: ‘diabetes.arff.5’


2021-12-23 16:34:55 (199 KB/s) - ‘diabetes.arff.5’ saved [37419/37419]

instances:  768
attributes:  8
classes:  2


### Starting JVM

In [3]:
# start JVM
import weka.core.jvm as jvm

# packages=True turns on Weka package support (the log output of this cell
# reports "Package support enabled"); presumably required by the
# package-installation cells that follow.
jvm.start(packages=True)

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/usr/local/lib/python3.7/dist-packages/javabridge/jars/rhino-1.7R4.jar', '/usr/local/lib/python3.7/dist-packages/javabridge/jars/runnablequeue.jar', '/usr/local/lib/python3.7/dist-packages/javabridge/jars/cpython.jar', '/usr/local/lib/python3.7/dist-packages/weka/lib/weka.jar', '/usr/local/lib/python3.7/dist-packages/weka/lib/python-weka-wrapper.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support enabled


In [16]:
import weka.core.packages as packages
#packages.refresh_cache()  # uncomment this to query for updates
# Install the hyperPipes package; the call's return value is shown as the cell output.
packages.install_package("hyperPipes")

True

### Installing needed packages

In [5]:
import sys
import weka.core.jvm as jvm
import weka.core.packages as packages
from weka.core.classes import complete_classname

# The JVM start call is disabled here; presumably the JVM is already running
# from the earlier "start JVM" cell.
#jvm.start(packages=True)

# Name of the Weka package this cell makes sure is installed.
pkg = "NNge"

# install package if necessary
if not packages.is_installed(pkg):
    print("Installing %s..." % pkg)
    packages.install_package(pkg)
    print("Installed %s, please re-run script!" % pkg)
    # After a fresh install the JVM is stopped and execution ends via SystemExit.
    # NOTE(review): in a notebook this presumably requires restarting the kernel
    # and re-running from the top before the package can be used -- confirm.
    jvm.stop()
    sys.exit(0)

# testing classname completion
print(complete_classname(".J48"))
# The lookup of the NNge class itself is left disabled.
#print(complete_classname(".rules.NNge"))

weka.classifiers.trees.J48


In [13]:
import sys
import weka.core.jvm as jvm
import weka.core.packages as packages
from weka.core.classes import complete_classname

# The JVM start call is disabled here; presumably the JVM is already running
# from the earlier "start JVM" cell.
#jvm.start(packages=True)

# Name of the Weka package this cell makes sure is installed.
pkg = "votingFeatureIntervals"

# install package if necessary
if not packages.is_installed(pkg):
    print("Installing %s..." % pkg)
    packages.install_package(pkg)
    print("Installed %s, please re-run script!" % pkg)
    # After a fresh install the JVM is stopped and execution ends via SystemExit.
    # NOTE(review): in a notebook this presumably requires restarting the kernel
    # and re-running from the top before the package can be used -- confirm.
    jvm.stop()
    sys.exit(0)

# testing classname completion
print(complete_classname(".J48"))
# Resolves the short name to the full class name of the package's classifier.
print(complete_classname(".VFI"))

weka.classifiers.trees.J48
weka.classifiers.misc.VFI


In [11]:
import sys
import weka.core.jvm as jvm
import weka.core.packages as packages
from weka.core.classes import complete_classname

# The JVM start call is disabled here; presumably the JVM is already running
# from the earlier "start JVM" cell.
#jvm.start(packages=True)

# Name of the Weka package this cell makes sure is installed.
pkg = "conjunctiveRule"

# install package if necessary
if not packages.is_installed(pkg):
    print("Installing %s..." % pkg)
    packages.install_package(pkg)
    print("Installed %s, please re-run script!" % pkg)
    # After a fresh install the JVM is stopped and execution ends via SystemExit.
    # NOTE(review): in a notebook this presumably requires restarting the kernel
    # and re-running from the top before the package can be used -- confirm.
    jvm.stop()
    sys.exit(0)

# testing classname completion
print(complete_classname(".J48"))
# Resolves the short name to the full class name of the package's classifier.
print(complete_classname(".ConjunctiveRule"))

weka.classifiers.trees.J48
weka.classifiers.rules.ConjunctiveRule


In [4]:
import sys
import weka.core.jvm as jvm
import weka.core.packages as packages
from weka.core.classes import complete_classname

# The JVM start call is disabled here; presumably the JVM is already running
# from the earlier "start JVM" cell.
#jvm.start(packages=True)

# Name of the Weka package this cell makes sure is installed.
pkg = "hyperPipes"

# install package if necessary
if not packages.is_installed(pkg):
    print("Installing %s..." % pkg)
    packages.install_package(pkg)
    print("Installed %s, please re-run script!" % pkg)
    # After a fresh install the JVM is stopped and execution ends via SystemExit.
    # NOTE(review): in a notebook this presumably requires restarting the kernel
    # and re-running from the top before the package can be used -- confirm.
    jvm.stop()
    sys.exit(0)

# testing classname completion
print(complete_classname(".J48"))
# Resolves the short name to the full class name of the package's classifier.
print(complete_classname(".HyperPipes"))


weka.classifiers.trees.J48
weka.classifiers.misc.HyperPipes


In [None]:
# stop JVM
#jvm.stop()

# Load Data

In [6]:
import weka.core.converters as converters
# Load the glass dataset from the working directory; `data` is the dataset
# used by the classifier cells in this notebook.
data = converters.load_any_file("glass.arff")
# Mark the last attribute as the class attribute.
data.class_is_last()

# Quick sanity check: print only rows 1-3 of the loaded data.
subset = data.subset(row_range='1-3')
print(subset)

@relation 'Glass-weka.filters.unsupervised.instance.RemoveRange-V-R1-3-weka.filters.MultiFilter-Fweka.filters.unsupervised.instance.RemoveRange -V -R 1-3'

@attribute RI numeric
@attribute Na numeric
@attribute Mg numeric
@attribute Al numeric
@attribute Si numeric
@attribute K numeric
@attribute Ca numeric
@attribute Ba numeric
@attribute Fe numeric
@attribute Type {'build wind float','build wind non-float','vehic wind float','vehic wind non-float',containers,tableware,headlamps}

@data
1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0,0,'build wind float'
1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0,0,'vehic wind float'
1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0,0,'build wind float'


In [7]:
# evaluation function 
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

def evaluat(classifier, data, fold, seed=42):
  """Cross-validate a classifier and print the evaluation summary.

  Args:
    classifier: the Weka classifier to cross-validate.
    data: the dataset to evaluate on; also used to initialise the Evaluation.
    fold: number of cross-validation folds.
    seed: seed for the random number generator passed to the
      cross-validation (defaults to 42, the value previously hard-coded).

  Returns:
    None. The summary is printed, so a cell ending in this call shows no
    additional output value.
  """
  evaluation = Evaluation(data)                     # initialize with priors
  # fold-fold cross-validation with a seeded RNG for repeatable runs
  evaluation.crossvalidate_model(classifier, data, fold, Random(seed))
  print(evaluation.summary())

In [None]:
# saving model function 
def save_model(classifier,name,path=""):
  """Serialize a classifier to ``<tempdir>/<path>/<name>.model``.

  Args:
    classifier: the object to serialize.
    name: file name without the ``.model`` extension.
    path: optional sub-directory below the system temp directory. It is
      joined as a directory (with or without a trailing separator) and is
      created if it does not exist yet.

  Returns:
    True once the file has been written.
  """
  import os
  import tempfile
  import weka.core.serialization as serialization

  # Leading separators are dropped so that os.path.join cannot discard the
  # temp-directory prefix; the model always ends up below the temp directory.
  separators = os.sep + (os.altsep or "")
  target_dir = os.path.join(tempfile.gettempdir(), path.lstrip(separators))
  os.makedirs(target_dir, exist_ok=True)
  outfile = os.path.join(target_dir, name + ".model")
  serialization.write(outfile, classifier)
  return True

# Build base classifiers on the dataset and output predictions

In [10]:
#3 Naive Bayes classifier
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         104               48.5981 %
Incorrectly Classified Instances       110               51.4019 %
Kappa statistic                          0.3172
Mean absolute error                      0.1556
Root mean squared error                  0.3395
Relative absolute error                 73.4622 %
Root relative squared error            104.6219 %
Total Number of Instances              214     



In [17]:
#1 C4.5 (Weka's J48 implementation, default options)
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.J48")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         138               64.486  %
Incorrectly Classified Instances        76               35.514  %
Kappa statistic                          0.5213
Mean absolute error                      0.1042
Root mean squared error                  0.295 
Relative absolute error                 49.1964 %
Root relative squared error             90.8886 %
Total Number of Instances              214     



In [18]:
#2 C4.5 using an unpruned tree

# J48 with the "-U" option, which (per the heading above) requests an unpruned tree.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         139               64.9533 %
Incorrectly Classified Instances        75               35.0467 %
Kappa statistic                          0.5268
Mean absolute error                      0.1032
Root mean squared error                  0.2944
Relative absolute error                 48.7418 %
Root relative squared error             90.7276 %
Total Number of Instances              214     



In [19]:
#4 PART classifier: builds a partial C4.5 decision tree in each iteration and
# turns the "best" leaf into a rule.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.rules.PART")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         147               68.6916 %
Incorrectly Classified Instances        67               31.3084 %
Kappa statistic                          0.5724
Mean absolute error                      0.0955
Root mean squared error                  0.2788
Relative absolute error                 45.118  %
Root relative squared error             85.9112 %
Total Number of Instances              214     



In [20]:
#5 Decision Stump: an algorithm that generates a one-level decision tree.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.DecisionStump")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances          95               44.3925 %
Incorrectly Classified Instances       119               55.6075 %
Kappa statistic                          0.2356
Mean absolute error                      0.1829
Root mean squared error                  0.3035
Relative absolute error                 86.3841 %
Root relative squared error             93.5141 %
Total Number of Instances              214     



In [21]:
#6 Decision table using the majority classifier (default options)
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.rules.DecisionTable")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         149               69.6262 %
Incorrectly Classified Instances        65               30.3738 %
Kappa statistic                          0.57  
Mean absolute error                      0.1719
Root mean squared error                  0.2766
Relative absolute error                 81.1587 %
Root relative squared error             85.2364 %
Total Number of Instances              214     



In [22]:
#7 Decision table using nearest neighbour instead of the global table majority
# (selected with the "-I" option).
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.rules.DecisionTable",options=["-I"])
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         149               69.6262 %
Incorrectly Classified Instances        65               30.3738 %
Kappa statistic                          0.5767
Mean absolute error                      0.168 
Root mean squared error                  0.2757
Relative absolute error                 79.3275 %
Root relative squared error             84.942  %
Total Number of Instances              214     



In [23]:
#8 ClassificationViaRegression: the M5 algorithm (Quinlan 1992) is meant to be
# used as the regression method.
# TODO: the original author marked this "(not yet)"; no options are passed, so
# the wrapper's default regression method is used -- confirm it is the intended one.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.meta.ClassificationViaRegression")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         148               69.1589 %
Incorrectly Classified Instances        66               30.8411 %
Kappa statistic                          0.572 
Mean absolute error                      0.13  
Root mean squared error                  0.2508
Relative absolute error                 61.3987 %
Root relative squared error             77.275  %
Total Number of Instances              214     



In [24]:
#9 Random Forest: constructs a forest by combining a large number of unpruned
# decision trees.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.RandomForest")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         169               78.972  %
Incorrectly Classified Instances        45               21.028  %
Kappa statistic                          0.71  
Mean absolute error                      0.0987
Root mean squared error                  0.2109
Relative absolute error                 46.6187 %
Root relative squared error             64.9862 %
Total Number of Instances              214     



In [25]:
#10 Random Tree
# Constructs a tree that considers K random attributes at each node. It does
# not carry out any pruning.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.RandomTree")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         145               67.757  %
Incorrectly Classified Instances        69               32.243  %
Kappa statistic                          0.5589
Mean absolute error                      0.0921
Root mean squared error                  0.3035
Relative absolute error                 43.504  %
Root relative squared error             93.523  %
Total Number of Instances              214     



In [15]:
#11 trees.LMT
# LMT builds logistic model trees: classification trees with logistic
# regression models at the leaves (per the Weka documentation).
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.LMT")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))

# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         152               71.028  %
Incorrectly Classified Instances        62               28.972  %
Kappa statistic                          0.6074
Mean absolute error                      0.0849
Root mean squared error                  0.2655
Relative absolute error                 40.07   %
Root relative squared error             81.8131 %
Total Number of Instances              214     



In [26]:
#12 VFI (class weka.classifiers.misc.VFI, provided by the
# votingFeatureIntervals package installed earlier in this notebook)
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.misc.VFI")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)



Correctly Classified Instances         119               55.6075 %
Incorrectly Classified Instances        95               44.3925 %
Kappa statistic                          0.4344
Mean absolute error                      0.2082
Root mean squared error                  0.3136
Relative absolute error                 98.3133 %
Root relative squared error             96.6223 %
Total Number of Instances              214     



In [12]:
#12 Conjunctive Rule: generates a simple conjunctive-rule classifier.
# NOTE(review): the number 12 is also used by the VFI cell; confirm the
# intended numbering of this and the following classifiers.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.rules.ConjunctiveRule")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances          96               44.8598 %
Incorrectly Classified Instances       118               55.1402 %
Kappa statistic                          0.2411
Mean absolute error                      0.1821
Root mean squared error                  0.3029
Relative absolute error                 85.9846 %
Root relative squared error             93.3187 %
Total Number of Instances              214     



In [6]:
#13 JRip
# An algorithm that generates propositional rules.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.rules.JRip")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         149               69.6262 %
Incorrectly Classified Instances        65               30.3738 %
Kappa statistic                          0.5764
Mean absolute error                      0.1139
Root mean squared error                  0.27  
Relative absolute error                 53.7685 %
Root relative squared error             83.21   %
Total Number of Instances              214     



In [13]:
#14 NNge (Martin 1995): a nearest-neighbour algorithm that uses non-nested
# generalized exemplars.
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.rules.NNge")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))

# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         145               67.757  %
Incorrectly Classified Instances        69               32.243  %
Kappa statistic                          0.558 
Mean absolute error                      0.0921
Root mean squared error                  0.3035
Relative absolute error                 43.504  %
Root relative squared error             93.523  %
Total Number of Instances              214     



In [8]:
#15 HyperPipes (Witten & Frank 2000): generates a classifier that constructs
# a HyperPipe for each category, which contains all the points of that
# category.
from weka.classifiers import Classifier

cls = Classifier(classname="weka.classifiers.misc.HyperPipes")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))

# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         109               50.9346 %
Incorrectly Classified Instances       105               49.0654 %
Kappa statistic                          0.2907
Mean absolute error                      0.2281
Root mean squared error                  0.332 
Relative absolute error                107.7397 %
Root relative squared error            102.3    %
Total Number of Instances              214     



### Adding StackingC classifiers 

In [27]:
#1 IBk algorithm
# K-nearest-neighbours classifier. Can select an appropriate value of K based
# on cross-validation and can also do distance weighting. No options are
# passed here, so the defaults are used.
from weka.classifiers import Classifier

cls = Classifier(classname="weka.classifiers.lazy.IBk")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))

# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         146               68.2243 %
Incorrectly Classified Instances        68               31.7757 %
Kappa statistic                          0.5702
Mean absolute error                      0.0962
Root mean squared error                  0.2963
Relative absolute error                 45.4183 %
Root relative squared error             91.2901 %
Total Number of Instances              214     



In [39]:
#2 K* classifier
# K* is an instance-based classifier: the class of a test instance is based on
# the class of the training instances similar to it, as determined by a
# similarity function. It differs from other instance-based learners in that
# it uses an entropy-based distance function.
# Note: this cell does not import Classifier itself; it relies on the import
# made by an earlier cell.
cls = Classifier(classname="weka.classifiers.lazy.KStar")
# Train on the full dataset.
cls.build_classifier(data)

# Predicted label and class distribution for every training instance; the
# values are only consumed by the commented-out print, so nothing is shown.
for index, inst in enumerate(data):
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    #print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))

# Print the 10-fold cross-validation summary.
evaluat(classifier=cls, data=data, fold=10)


Correctly Classified Instances         162               75.7009 %
Incorrectly Classified Instances        52               24.2991 %
Kappa statistic                          0.6651
Mean absolute error                      0.0731
Root mean squared error                  0.2348
Relative absolute error                 34.4985 %
Root relative squared error             72.3454 %
Total Number of Instances              214     

