In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import javabridge
import weka.core.jvm as jvm
from weka.classifiers import Classifier
from weka.core.converters import Loader
import weka.filters as filters
from weka.classifiers import Evaluation
from weka.filters import Filter
from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
import random
from weka.core.classes import Random

In [2]:
import weka.core.jvm as jvm
jvm.start()

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/usr/local/lib/python3.8/dist-packages/javabridge/jars/rhino-1.7R4.jar', '/usr/local/lib/python3.8/dist-packages/javabridge/jars/runnablequeue.jar', '/usr/local/lib/python3.8/dist-packages/javabridge/jars/cpython.jar', '/usr/local/lib/python3.8/dist-packages/weka/lib/weka.jar', '/usr/local/lib/python3.8/dist-packages/weka/lib/python-weka-wrapper.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


In [3]:
# create a list of dataset filenames
dataset_filenames = ['data1 Sampled Scenarios.csv.arff', 'data2 Sampled Scenarios.csv.arff', 'data3 Sampled Scenarios.csv.arff', 'data4 Sampled Scenarios.csv.arff', 'data5 Sampled Scenarios.csv.arff',
                     'data6 Sampled Scenarios.csv.arff', 'data7 Sampled Scenarios.csv.arff', 'data8 Sampled Scenarios.csv.arff', 'data9 Sampled Scenarios.csv.arff', 'data10 Sampled Scenarios.csv.arff',
                     'data11 Sampled Scenarios.csv.arff', 'data12 Sampled Scenarios.csv.arff', 'data13 Sampled Scenarios.csv.arff', 'data14 Sampled Scenarios.csv.arff', 'data15 Sampled Scenarios.csv.arff']
dataset_filenames = ["Class/multiclass/" + filename for filename in dataset_filenames]

In [8]:
# JRipper
jrp = []
jrp_eval = []
loader = Loader(classname="weka.core.converters.ArffLoader")
for filename in dataset_filenames:
  data = loader.load_file(filename)
  data.class_is_last()
  # Split the selected dataset into training and testing sets
  train, test = data.train_test_split(80)
  # Build the classifier on the training data
  #To optimize the accuracy more increase -f values decrease -n value and increase -0 values
  cls = Classifier(classname="weka.classifiers.rules.JRip", options=["-F", "3", "-N", "2.0", "-O", "2"])
  cls.build_classifier(train)
  eval = Evaluation(train)
  random.seed(1)
  eval.crossvalidate_model(cls, test, 10, Random(1))
  print(eval.percent_correct)
  jrp.append(cls)
  jrp_eval.append(eval)

81.1681772406848
82.9388560157791
84.02585410895661
86.53846153846153
82.84883720930233
83.88721047331319
81.75740210124164
86.92380056444027
80.52434456928839
86.71454219030521
81.52380952380952
88.13397129186603
82.44781783681215
85.5327468230694
83.88625592417061


In [9]:
# Print each evaluation confusion matrix and heatmap
for i in range(len(jrp_eval)):
    print(jrp_eval[i].confusion_matrix)
    print(jrp_eval[i].matrix())

[[ 14.   2.   5. ...   0.   0.   0.]
 [  0.  53.   9. ...   0.   0.   0.]
 [  2.   3. 146. ...   0.   0.   0.]
 ...
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]]
=== Confusion Matrix ===

   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   p   q   r   s   t   u   v   w   x   y   z  aa  ab  ac  ad  ae  af  ag  ah  ai  aj  ak   <-- classified as
  14   2   5   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |   a = 1
   0  53   9   0   0   1   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |   b = 2
   2   3 146   2   8   4   5   4  11   3   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 |   c = 3
   0   1   3 145   0   7   1   1   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0

In [10]:
# Save the model
import pickle
with open('jrp_model.pkl', 'wb') as f:
    pickle.dump(jrp, f)

In [11]:
# Jripper + Adaboost
jrp_adaboost = []
loader = Loader(classname="weka.core.converters.ArffLoader")
for filename in dataset_filenames:
  data = loader.load_file(filename)
  data.class_is_last()
  # Split the selected dataset into training and testing sets
  train, test = data.train_test_split(75)

  # Create a new Evaluation object for the selected attributes
  eval = Evaluation(train)

  # Build the classifier on the training datas
  base_cls = Classifier(classname="weka.classifiers.rules.JRip", options=["-F", "3", "-N", "2.0", "-O", "2"])
  cls = Classifier(classname="weka.classifiers.meta.AdaBoostM1", options=["-P", "100", "-S", "1", "-I", "10", "-W", base_cls.classname, "--"])
  cls.build_classifier(train)
  # Make predictions on the test data using the new Evaluation object
  random.seed(1)
  eval.crossvalidate_model(cls, test, 10, Random(1))
  print(eval.percent_correct)
  jrp_adaboost.append(cls)

88.01611278952669
90.63116370808679
91.5050784856879
92.40384615384616
91.66666666666667
92.14501510574019
90.83094555873926
92.38005644402634
91.01123595505618
93.6265709156194
89.14285714285714
94.92822966507177
90.60721062618596
91.49560117302053
91.75355450236967


In [12]:
# Save the model
with open('jrp_adaboost_model.pkl', 'wb') as f:
    pickle.dump(jrp_adaboost, f)