# For SVM Classifier

In [None]:
# Mount Google Drive inside the Colab VM at /content/gdrive so that files
# stored under MyDrive can be read by later cells.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
import csv
import numpy as np

!pip install liac-arff
import arff
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

import sys

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone
  Created wheel for liac-arff: filename=liac_arff-2.5.0-py3-none-any.whl size=11732 sha256=e7a19eb038cd99c5d16124f8b0fe4b28d9442221c2b5e0e369e287037c204784
  Stored in directory: /root/.cache/pip/wheels/08/82/8b/5c514221984e88c059b94e36a71d4722e590acaae04deab22e
Successfully built liac-arff
Installing collected packages: liac-arff
Successfully installed liac-arff-2.5.0


In [None]:
!unzip /content/gdrive/MyDrive/CODE/GPU-master.zip

Archive:  /content/gdrive/MyDrive/CODE/GPU-master.zip
  inflating: GPU-master/bn_k2.R      
   creating: GPU-master/data/
  inflating: GPU-master/data/audiology.tar.gz  
  inflating: GPU-master/data/breast-cancer.tar.gz  
  inflating: GPU-master/data/chess.tar.gz  
  inflating: GPU-master/data/dermatology.tar.gz  
  inflating: GPU-master/data/hepatitis.tar.gz  
  inflating: GPU-master/data/lymph.tar.gz  
  inflating: GPU-master/data/nursery.tar.gz  
  inflating: GPU-master/data/pima.tar.gz  
  inflating: GPU-master/data/soybean.tar.gz  
  inflating: GPU-master/data/vote.tar.gz  
  inflating: GPU-master/gpu.py       
  inflating: GPU-master/LICENSE      
  inflating: GPU-master/README.md    


In [None]:
# Print the current working directory of the notebook's shell.
!pwd

/content


In [None]:
cd GPU-master

/content/GPU-master


In [None]:
# Names of the datasets to process; each one is used to build file names
# under data/ (e.g. data/<name>.tar.gz).
datasets = [
    'audiology',
    'breast-cancer',
    'chess',
    'hepatitis',
    'nursery',
    'soybean',
    'vote',
]

In [None]:
import tarfile

# Unpack data/<name>.tar.gz for every selected dataset into data/.
# The context manager closes each archive even if extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
for dataset_name in datasets:
    with tarfile.open('data/' + dataset_name + '.tar.gz') as archive:
        archive.extractall('data/')

In [None]:
def write_csv(data, path):
    """Write ``data`` to ``path`` as CSV with every field quoted.

    Parameters
    ----------
    data : iterable of row iterables
        Each row is written as one CSV record.
    path : str
        Destination file; it is created or truncated.
    """
    # newline="" lets the csv writer control line endings itself, as the csv
    # module documentation requires; without it some platforms translate the
    # writer's "\r\n" terminator into "\r\r\n".
    with open(path, "w", newline="") as csv_file:
        writer_ = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        writer_.writerows(data)

In [None]:
# One class label per dataset, listed in the same order as `datasets`
# (the experiment loop pairs the lists positionally with zip()).
class_labels = [
    'cochlear_age',
    'no-recurrence-events',
    'won',
    'DIE',
    'not_recom',
    'brown-spot',
    'democrat',
]

In [None]:
# The experiment loop pairs datasets and labels with zip(), which silently
# stops at the shortest list; fail loudly here if a label is missing.
assert len(class_labels) == len(datasets), "need exactly one class label per dataset"
print(len(class_labels))

7


In [None]:
# One label per dataset for the opposite class, in the same order as
# `datasets`; the experiment loop uses it as pos_label when scoring.
neg_class_labels = [
    'cochlear_unknown',
    'recurrence-events',
    'nowin',
    'LIVE',
    'priority',
    'alternarialeaf-spot',
    'republican',
]

In [None]:
# The experiment loop pairs datasets and labels with zip(), which silently
# stops at the shortest list; fail loudly here if a label is missing.
assert len(neg_class_labels) == len(datasets), "need exactly one negative label per dataset"
print(len(neg_class_labels))

7


In [None]:
# Grid of SVM C candidates: powers of ten from 10**-8 up to 10**3.
C_values = [10 ** exponent for exponent in range(-8, 4)]

In [None]:
# Grid of RBF gamma candidates: powers of ten from 10**-6 up to 10**5.
gamma_values = [10 ** exponent for exponent in range(-6, 6)]

In [None]:
def _load_arff(path):
    """Parse the ARFF file at ``path`` and close the file handle afterwards."""
    with open(path, 'r') as arff_file:
        return arff.load(arff_file)


def _write_features_file(arff_obj, features_path):
    """Write the attribute description file that is passed to bn_k2.R.

    One line per attribute except the last one (the class column):
    ``"<name>":categorical:"v1","v2",...,"vn".``
    """
    with open(features_path, 'w') as features_file:
        for attr in arff_obj['attributes'][:-1]:
            features_file.write('"' + attr[0] + '":categorical:')
            for val in attr[1][:-1]:
                features_file.write('"' + val + '",')
            if attr[1][-1] != '':
                features_file.write('"' + attr[1][-1] + '".\n')
            else:
                features_file.write('".\n')


def _encode_ordinal(rows, attributes):
    """Replace every cell by its index in the attribute's declared value list.

    ``rows`` is a 2-D array of nominal values; column ``i`` is looked up in
    ``attributes[i][1]``. Raises ValueError if a value is not declared.
    """
    encoded = np.zeros((rows.shape[0], rows.shape[1]))
    for col in range(rows.shape[1]):
        values = attributes[col][1]
        for row in range(rows.shape[0]):
            encoded[row, col] = values.index(rows[row, col])
    return encoded


# For every dataset, split percentage and fold:
#   1. write the positive and unlabeled training rows to CSV and run bn_k2.R,
#   2. sort the unlabeled rows by the scores R wrote to `outfile` (ascending)
#      and take as many of the first ones as there are positive rows to act
#      as the second class,
#   3. tune and fit an RBF SVM on the one-hot encoded rows, then score it on
#      the test split with `neg_class_label` as pos_label.
# Per-fold metrics go to data/<dataset>.log, per-percentage means to `results`.
with open('results', 'w') as output:
    output.write('Dataset,perc,pos,ones,precision,recall,f1-score\n')

    for dataset, class_label, neg_class_label in zip(datasets, class_labels, neg_class_labels):
        # creating feature filename for bnlearn
        data_filename = 'data/' + dataset + '_train_pos_50_1.arff'
        data = _load_arff(data_filename)
        features_name = 'data/' + dataset + '.features'
        log_file = 'data/' + dataset + '.log'

        # The with-block guarantees the log is closed even when a fold raises
        # or the run is interrupted.
        with open(log_file, "w") as out_log_file:
            out_log_file.write('perc,fold,ones,gamma,c,precision,recall,f1-score\n')
            out_log_file.flush()

            _write_features_file(data, features_name)

            for perc in ['30', '40', '50']:

                precision_f = []
                recall_f = []
                f1_score_f = []
                ones_f = []

                for fold in range(1, 11):
                    print('Fold:', fold)

                    pos_name = 'data/' + dataset + '_train_pos_' + perc + '_' + str(fold) + '.arff'
                    unl_name = 'data/' + dataset + '_train_unl_' + perc + '_' + str(fold) + '.arff'
                    test_name = 'data/' + dataset + '_test_' + perc + '_' + str(fold) + '.arff'

                    train_pos = _load_arff(pos_name)
                    train_unl = _load_arff(unl_name)
                    test = _load_arff(test_name)

                    train_pos_data = np.array(train_pos['data'])
                    train_unl_data = np.array(train_unl['data'])
                    test_data = np.array(test['data'])
                    n_pos = train_pos_data.shape[0]

                    # Last column is the class label; it is not handed to R.
                    write_csv(train_pos_data[:, :-1], './data/pos.data')
                    write_csv(train_unl_data[:, :-1], './data/unl.data')

                    command = 'R --no-save --args ./data/' + dataset + '.features ./data/pos ./data/unl outfile < bn_k2.R > /dev/null'

                    # A failed R run would otherwise leave the previous fold's
                    # `outfile` in place and silently reuse its scores.
                    status = os.system(command)
                    if status != 0:
                        raise RuntimeError('bn_k2.R failed with status %s for %s' % (status, pos_name))

                    # One score per unlabeled row (presumably log-likelihoods,
                    # judging by the name -- confirm against bn_k2.R).
                    lls = np.loadtxt('outfile')

                    argsort = np.argsort(lls)
                    selected = argsort[:n_pos]

                    # Number of selected unlabeled rows whose true label is
                    # actually `class_label`.
                    ones = sum(1 for index in selected
                               if train_unl_data[index, -1] == class_label)

                    X_train_pos_neg = np.concatenate(
                        (train_pos_data[:, :-1], train_unl_data[selected, :-1]), axis=0)
                    y_train_pos_neg = np.array([class_label] * n_pos + [neg_class_label] * n_pos)

                    attributes = train_pos['attributes']
                    X_train_pos_neg_int = _encode_ordinal(X_train_pos_neg, attributes)
                    X_test_int = _encode_ordinal(test_data[:, :-1], attributes)

                    # Fit the encoder on train and test rows together so both
                    # are transformed with the same set of columns.
                    X_all_int = np.concatenate((X_train_pos_neg_int, X_test_int), axis=0)

                    encoder = OneHotEncoder()
                    encoder.fit(X_all_int)
                    A = encoder.transform(X_train_pos_neg_int).toarray()
                    B = encoder.transform(X_test_int).toarray()

                    param_grid = dict(gamma=gamma_values, C=C_values)
                    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=177)
                    f1_scorer = make_scorer(f1_score, pos_label=neg_class_label)
                    grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv, scoring=f1_scorer)
                    grid.fit(A, y_train_pos_neg)

                    print("The best parameters are %s with a score of %0.2f"
                          % (grid.best_params_, grid.best_score_))

                    gamma = grid.best_params_['gamma']
                    C = grid.best_params_['C']

                    clf = SVC(kernel='rbf', gamma=gamma, C=C)
                    clf.fit(A, y_train_pos_neg)

                    y_train_pred = clf.predict(A)

                    print('Train stats')
                    pr = precision_score(y_train_pos_neg, y_train_pred, pos_label=neg_class_label, average="binary")
                    re = recall_score(y_train_pos_neg, y_train_pred, pos_label=neg_class_label, average="binary")
                    f1 = f1_score(y_train_pos_neg, y_train_pred, pos_label=neg_class_label, average="binary")
                    print('Precision:', pr)
                    print('Recall:', re)
                    print('F1-score:', f1)

                    y_test_pred = clf.predict(B)

                    print('Test stats')
                    pr = precision_score(test_data[:, -1], y_test_pred, pos_label=neg_class_label, average="binary")
                    re = recall_score(test_data[:, -1], y_test_pred, pos_label=neg_class_label, average="binary")
                    f1 = f1_score(test_data[:, -1], y_test_pred, pos_label=neg_class_label, average="binary")
                    print('Precision:', pr)
                    print('Recall:', re)
                    print('F1-score:', f1)

                    # Only the test metrics are kept for the per-percentage means.
                    precision_f.append(pr)
                    recall_f.append(re)
                    f1_score_f.append(f1)
                    ones_f.append(ones)

                    out_log_file.write(str(perc) + ',' + str(fold) + ',' + str(ones) + ',' + str(gamma) + ',' +
                                       str(C) + ',' + str(pr) + ',' + str(re) + ',' + str(f1) + '\n')
                    out_log_file.flush()

                # The `pos` column reports the positive-row count of the last fold.
                output.write(dataset + ',' + perc + ',')
                output.write(str(train_pos_data.shape[0]) + ',' +
                             str(np.mean(ones_f)) + ',' +
                             str(np.mean(precision_f)) + ',' +
                             str(np.mean(recall_f)) + ',' +
                             str(np.mean(f1_score_f)) + '\n')

                output.flush()

Fold: 1
The best parameters are {'C': 1e-08, 'gamma': 0.1} with a score of 0.97
Train stats
Precision: 1.0
Recall: 1.0
F1-score: 1.0
Test stats
Precision: 1.0
Recall: 0.4
F1-score: 0.5714285714285715
Fold: 2
The best parameters are {'C': 1, 'gamma': 0.1} with a score of 1.00
Train stats
Precision: 1.0
Recall: 1.0
F1-score: 1.0
Test stats
Precision: 0.8333333333333334
Recall: 1.0
F1-score: 0.9090909090909091
Fold: 3
The best parameters are {'C': 1e-08, 'gamma': 0.1} with a score of 0.97
Train stats
Precision: 0.8823529411764706
Recall: 1.0
F1-score: 0.9375
Test stats
Precision: 0.625
Recall: 1.0
F1-score: 0.7692307692307693
Fold: 4
The best parameters are {'C': 1, 'gamma': 0.1} with a score of 0.97
Train stats
Precision: 1.0
Recall: 1.0
F1-score: 1.0
Test stats
Precision: 0.8333333333333334
Recall: 1.0
F1-score: 0.9090909090909091
Fold: 5
The best parameters are {'C': 1e-08, 'gamma': 0.1} with a score of 0.89
Train stats
Precision: 1.0
Recall: 1.0
F1-score: 1.0
Test stats
Precision: 0.6

KeyboardInterrupt: ignored