In [9]:
from yarc import CBA
from yarc.Structure import TransactionDB
from yarc.Mine_Classi_Alg.generating_CARS import ClassAssocationRule, Antecedent, Consequent, top_rules, CARlist
from yarc.Mine_Classi_Alg.m2classi import M2Classi
from yarc.Mine_Classi_Alg.predictor import Predictor
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from yarc.qcba.data_structures import *
from yarc.qcba import *
from yarc.qcba.transformation import *
import Orange
from sklearn.preprocessing import LabelEncoder
from Orange.data.pandas_compat import table_from_frame,table_to_frame

import warnings
warnings.filterwarnings('ignore')

In [26]:
def discretiseRule(X):
    temp = Orange.data.Table(X)
    disc = Orange.preprocess.Discretize()
    disc.method = Orange.preprocess.discretize.EqualFreq(n=3)
    d_temp = disc(temp)
    X= table_to_frame(d_temp)
    return X

def runQCBA(X,y,target):
    X = discretiseRule(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25, test_size=0.2, stratify = y)
    train = pd.concat([X_train, y_train], axis=1)
    test=pd.concat([X_test, y_test], axis=1)
    
    txns = TransactionDB.from_DataFrame(train, target=target)
    txnstest = TransactionDB.from_DataFrame(test, target=target)
    
    cba=CBA()
    cba.fit(txns)
    acc1=cba.rule_model_accuracy(txnstest) #Part 2 model
    # get the best association rules
    rules = top_rules(txns.string_representation)
    # convert them to class association rules
    cars = CARlist(rules)
    predictor = M2Classi(cars, txns).build()
    acc2= predictor.test_transactions(txnstest)
    ds = test.reset_index() #test set of undis
    quant_dataset = QuantitativeDataFrame(ds)
    
    Y = ds[target]
    rules=cba.pre.rules
    quant_rules = [ QuantitativeCAR(r) for r in rules ] #rules of undis
    
    rules2 = predictor.rules
    quant_rules2 = [ QuantitativeCAR(r) for r in rules2 ] #rules of undis
    
    qcba_transformation = QCBATransformation(quant_dataset)
    refitted_rules = qcba_transformation.refitter.transform(quant_rules)
    literal_pruned_rules = qcba_transformation.literal_pruner.transform(refitted_rules)
    trimmed_rules = qcba_transformation.trimmer.transform(literal_pruned_rules)
    pruned_rules, default_class = qcba_transformation.post_pruner.transform(trimmed_rules)
    refitted_rules2 = qcba_transformation.refitter.transform(quant_rules2)
    literal_pruned_rules2 = qcba_transformation.literal_pruner.transform(refitted_rules2)
    trimmed_rules2 = qcba_transformation.trimmer.transform(literal_pruned_rules2)
    pruned_rules2, default_class2 = qcba_transformation.post_pruner.transform(trimmed_rules2)
    q_clf2 = QuantitativeClassifier(pruned_rules, default_class)
    acc3=q_clf2.rule_model_accuracy(quant_dataset, Y) #Part 5 model
    q_clf1 = QuantitativeClassifier(pruned_rules2, default_class2)
    acc4=q_clf1.rule_model_accuracy(quant_dataset, Y) #Part 5 model(top rules)
    #Display part 2 model
    print("Accuracy of CBA:",acc1)
    print("Accuracy of CBA (Top k):",acc2)
    #Display part 5 model
    print("Accuracy of QCBA:",acc3)
    print("Accuracy of QCBA (Top k):",acc4)

# Iris

In [27]:
iris = pd.read_csv ('iris.csv')
X=iris[["sepallength","petalwidth","sepalwidth","petallength"]]
y=iris[["class"]]
runQCBA(X,y,"class")

Accuracy of CBA: 0.9
Accuracy of CBA (Top k): 0.9
Accuracy of QCBA: 0.9333333333333333
Accuracy of QCBA (Top k): 0.9333333333333333


# Wave

In [28]:
wave = pd.read_csv ('wave.csv')
wave = wave.dropna()
X=wave[['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20']]
y=wave[["class"]]
runQCBA(X,y,"class")

Accuracy of CBA: 0.764
Accuracy of CBA (Top k): 0.797
Accuracy of QCBA: 0.768
Accuracy of QCBA (Top k): 0.81


# Heart

In [None]:
heart = pd.read_csv ('heart.csv')
heart = heart.dropna()
X=heart[["age", "sex", "chest pain type", "resting blood pressure", "serum cholesterol (mg/dl)", "resting blood sugar >120mg/dl","resting electrocariographic results","maximum heart rate received","exercise induced angina", "oldpeak","slopePeak", "numMajorVessels","thal"]]
y=heart[["class"]]
runQCBA(X,y,"class")

# Breast Cancer

In [None]:
from sklearn.preprocessing import OrdinalEncoder
breastCancer = pd.read_csv ('breastCancer.csv')
breastCancer = breastCancer.dropna()
X=breastCancer[["menopause", "age", "tumor-size", "inv-nodes", "node-caps","deg-malig","breast","breast-quad","irradiat"]]
#encoding the normal data
for col in ["menopause","node-caps","breast","breast-quad","irradiat"]:
   X[col] = LabelEncoder().fit_transform(X[col])
#encoding the ordinal data #Ordinal Encoder not working so i just hardcoded first
ordinalData = ["age","tumor-size","inv-nodes"]
for i in ordinalData:
   columns = X[i].unique()
   columns.sort() #sorting the labels
   for j in range(len(columns)):
      X.loc[X[i]==columns[j],i] = j
y=breastCancer[["class"]]
runQCBA(X,y,"class")

# German

In [None]:
german = pd.read_csv ('german.csv')
german = german.dropna()
X=german[["checkAccStatus", "durationMth", "credHist", "purpose", "credAmt","savAccBond","emplySince","instRate","personalStatSex","otherDebtGuar","presResSince","prpty","age(years)","otherInstallPlans","housing","numExistCreds","job","numPplMaintain","telephone","frgnWorker"]]
for col in ['checkAccStatus','credHist','purpose','savAccBond','emplySince','personalStatSex','otherDebtGuar','prpty','otherInstallPlans','housing','job','telephone','frgnWorker']:
    X[col] = LabelEncoder().fit_transform(X[col])
y=german[["goodBad"]]
runQCBA(X,y,"goodBad")

# University Student Placement

In [None]:
campusPlacement = pd.read_csv("Placement_Data_Full_Class.csv")
campusPlacement.head()
campusPlacement = campusPlacement.drop(['salary'],1)
campusPlacement = campusPlacement.dropna()
X = campusPlacement[['sl_no', 'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'degree_p', 'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p']]
#creating a function to encode categorical features into numerical
for col in ['gender','ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation']:
    X[col] = LabelEncoder().fit_transform(X[col])
y=campusPlacement[['status']]
runQCBA(X,y,"status")

# Stroke

In [None]:
stroke = pd.read_csv('healthcare-dataset-stroke-data.csv')
stroke = stroke.drop(['id'],1)
stroke = stroke.dropna()
X=stroke[["gender", "age", "heart_disease", "ever_married", "work_type","Residence_type","avg_glucose_level","bmi","smoking_status"]]
#encoding the normal data
for col in ["gender","ever_married","work_type","Residence_type", "smoking_status"]:
   X[col] = LabelEncoder().fit_transform(X[col])
y=stroke[["stroke"]]
runQCBA(X,y,"stroke")