In [1]:
# !pip install liac-arff
# !pip install scikit-multilearn
# !pip uninstall scikit-learn -y
# !pip install scikit-learn==0.24.1

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
import tqdm
import time
import random
import numpy as np
from scipy import spatial
from scipy.stats import pearsonr
from skmultilearn.dataset import load_from_arff

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import BinaryRelevance

In [4]:
X, y = load_from_arff("Scene.arff", label_count = 6)
X = X.toarray()
y = y.toarray()

index = np.arange(len(X))
np.random.shuffle(index)
X, y = X[index], y[index]

print(X.shape, y.shape)

(2407, 294) (2407, 6)


In [5]:
print(X)
print(y)

[[0.716098 0.718348 0.714864 ... 0.013592 0.004946 0.004211]
 [0.885591 0.879806 0.860725 ... 0.009589 0.001896 0.002645]
 [0.631132 0.604911 0.497497 ... 0.050385 0.141724 0.169806]
 ...
 [0.484628 0.533303 0.577971 ... 0.080438 0.058844 0.043378]
 [0.698284 0.744366 0.774499 ... 0.003052 0.00363  0.00377 ]
 [0.537221 0.53136  0.484572 ... 0.007565 0.008    0.016748]]
[[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 0 0 0 1]
 ...
 [0 0 0 0 1 0]
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]]


In [6]:
train_samples = int(X.shape[0] * 0.6)

X_train = X[:train_samples, :]
y_train = y[:train_samples]
X_test = X[train_samples:, :]
y_test = y[train_samples:]

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

X_train: (1444, 294) y_train: (1444, 6)
X_test: (963, 294) y_test: (963, 6)


In [7]:
d = X_train.shape[1] 
l = y_train.shape[1]
print("d:", d, "l:", l)

d: 294 l: 6


In [8]:
def score(y, pred):
    pred = pred.toarray()
    score = 0
    for i in range(y.shape[0]):
        score += np.sum(y[i] * pred[i]) / (np.sum(y[i]) + np.sum(pred[i]) - np.sum(y[i] * pred[i]))
    score /= y.shape[0]
    return score

## Without FS, taking the whole dataset

In [9]:
X_train_subset = X_train[:, :]
X_test_subset = X_test[:, :]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

X_train_subset: (1444, 294) y_train: (1444, 6)
X_test_subset: (963, 294) y_test: (963, 6)
lr train accu: 0.7281
lr test_accu: 0.5692
nb train accu: 0.4916
nb test_accu: 0.4664
knn train accu: 0.7509
knn test_accu: 0.6506
avg train accu: 0.6569
avg test_accu: 0.5621


In [10]:
beg = time.time()
fcorr = np.corrcoef(X_train.T)
end = time.time()
fcorr = np.absolute(fcorr)
print(fcorr.shape)
print(fcorr)

(294, 294)
[[1.         0.90458739 0.80937588 ... 0.02353183 0.0133419  0.00882624]
 [0.90458739 1.         0.92395526 ... 0.03424692 0.02484997 0.00205051]
 [0.80937588 0.92395526 1.         ... 0.05210499 0.04313094 0.01876825]
 ...
 [0.02353183 0.03424692 0.05210499 ... 1.         0.74017764 0.54948212]
 [0.0133419  0.02484997 0.04313094 ... 0.74017764 1.         0.69639586]
 [0.00882624 0.00205051 0.01876825 ... 0.54948212 0.69639586 1.        ]]


In [11]:
flcorr = np.zeros((d, l))
for i in tqdm.tqdm(range(d)):
    for j in range(l):
        flcorr[i][j] = pearsonr(X_train[:, i], y_train[:, j])[0] + 0.001
flcorr = np.absolute(flcorr)
flcorr = np.max(flcorr, axis = 1)
print(flcorr.shape)
print(flcorr)

100%|██████████| 294/294 [00:00<00:00, 1254.03it/s]

(294,)
[0.28273421 0.26590678 0.25509439 0.25928995 0.27549983 0.29056431
 0.28699719 0.22803796 0.21976526 0.21204444 0.21241481 0.25181592
 0.25576506 0.24167818 0.16062914 0.15751375 0.13306506 0.11759181
 0.17233292 0.18768456 0.19209626 0.19461931 0.1702976  0.17727248
 0.16527699 0.19810162 0.19973014 0.19996536 0.28333268 0.31780083
 0.30706085 0.3048099  0.28412383 0.28118102 0.29941676 0.36576758
 0.38183213 0.39368493 0.3904136  0.38769194 0.38075362 0.36586549
 0.40078403 0.41180708 0.42084785 0.40756572 0.41440625 0.40847055
 0.39835166 0.23165973 0.23340933 0.22696055 0.22292932 0.19096343
 0.18530888 0.20247687 0.19352218 0.13315675 0.11467273 0.11914302
 0.09724758 0.12562534 0.15985459 0.17632667 0.1075549  0.08525056
 0.10154026 0.0706012  0.06457255 0.11526652 0.11515333 0.1021791
 0.1479756  0.12160149 0.096101   0.14774991 0.1153071  0.16059333
 0.17024368 0.21433376 0.21372298 0.19735482 0.20528545 0.18498176
 0.25317081 0.22364909 0.25461264 0.28993096 0.28770957 




## Base paper ka ACO, 10 features ka subset

In [12]:
tou = []
for i in tqdm.tqdm(range(d)):
    tou0 = 0
    for j in range(l):
        tou0 = max(tou0, 1 - spatial.distance.cosine(X_train[:, i], y_train[:, j]))
    tou.append(tou0)
tou = np.array(tou)
tou = (tou - np.min(tou)) / (np.max(tou) - np.min(tou))

q0 = 0.7
rho = 0.1
ants = 25
iterations = 40

for it in tqdm.tqdm(range(iterations)):
    fc = [0] * d
    for ant in range(ants):

        visited = set()
        unvisited = set(range(d))
        i = random.sample(list(unvisited), 1)[0]
        visited.add(i)
        unvisited.remove(i)

        while len(visited) < 10:
            ni = list(unvisited)
            p = []
            for j in ni:
                p.append(tou[j] * flcorr[j] / fcorr[i][j])
            p = np.array(p)
            p /= np.sum(p)
            q = random.random()
            j = -1
            if q >= q0:
                j = np.random.choice(ni, p = p)
            else:
                j = ni[np.argmax(p)]
            unvisited.remove(j)
            visited.add(j)
            i = j
        for i in visited:
            fc[i]+=1

    for i in range(d):
        tou[i] = (1 - rho) * tou[i] + fc[i] / sum(fc)

fs_ind = []
for i in range(d):
    fs_ind.append([tou[i], i])
fs_ind.sort(reverse = True)
fs_ind = np.array(fs_ind)
fs_ind = fs_ind[:10, 1].astype(int)

print(fs_ind)

X_train_subset = X_train[:, fs_ind]
X_test_subset = X_test[:, fs_ind]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

100%|██████████| 294/294 [00:00<00:00, 2410.20it/s]
100%|██████████| 40/40 [00:04<00:00,  8.48it/s]


[241 239 106 108  84  86 243 103 107 185]
X_train_subset: (1444, 10) y_train: (1444, 6)
X_test_subset: (963, 10) y_test: (963, 6)
lr train accu: 0.2425
lr test_accu: 0.2272
nb train accu: 0.3811
nb test_accu: 0.3702
knn train accu: 0.5819
knn test_accu: 0.4273
avg train accu: 0.4019
avg test_accu: 0.3416


## Base paper ke ACO ka solution further optimized with random restructure LS with KNN wrapper

In [13]:
tmp = fs_ind[:]

In [14]:
X_train_subset = X_train[:, fs_ind]

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)
best_acc = score(y_train, knn.predict(X_train_subset))

ants = 40

for ant in tqdm.tqdm(range(ants)):

    visited = set(random.sample(fs_ind.tolist(), 5))
    unvisited = set(range(d)) - visited
    i = random.sample(list(visited), 1)[0]

    while len(visited) < 10:
        ni = list(unvisited)
        p = []
        for j in ni:
            p.append(tou[j] * flcorr[j] / fcorr[i][j])
        p = np.array(p)
        p /= np.sum(p)
        j = ni[np.argmax(p)]
        unvisited.remove(j)
        visited.add(j)
        i = j

    ind = np.array(list(visited))
    X_train_subset = X_train[:, ind]

    knn = BinaryRelevance(classifier = KNeighborsClassifier())
    knn.fit(X_train_subset, y_train)
    acc = score(y_train, knn.predict(X_train_subset))

    if acc > best_acc:
        best_acc = acc
        fs_ind = ind[:]

print(fs_ind)

X_train_subset = X_train[:, fs_ind]
X_test_subset = X_test[:, fs_ind]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

100%|██████████| 40/40 [00:20<00:00,  1.91it/s]


[106 107 108  47 112 241 239 243  84 216]
X_train_subset: (1444, 10) y_train: (1444, 6)
X_test_subset: (963, 10) y_test: (963, 6)
lr train accu: 0.3401
lr test_accu: 0.3072
nb train accu: 0.4415
nb test_accu: 0.4249
knn train accu: 0.6615
knn test_accu: 0.5102
avg train accu: 0.4811
avg test_accu: 0.4141


## Base paper ke ACO ka solution further optimized with random restructure LS with KNN + NB + LR wrapper

In [15]:
fs_ind = tmp[:]

In [16]:
X_train_subset = X_train[:, fs_ind]

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)
lr_train_score = score(y_train, lr.predict(X_train_subset))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)
nb_train_score = score(y_train, nb.predict(X_train_subset))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)
knn_train_score = score(y_train, knn.predict(X_train_subset))

best_acc = (lr_train_score + nb_train_score + knn_train_score) / 3

ants = 40

for ant in tqdm.tqdm(range(ants)):

    visited = set(random.sample(fs_ind.tolist(), 5))
    unvisited = set(range(d)) - visited
    i = random.sample(list(visited), 1)[0]

    while len(visited) < 10:
        ni = list(unvisited)
        p = []
        for j in ni:
            p.append(tou[j] * flcorr[j] / fcorr[i][j])
        p = np.array(p)
        p /= np.sum(p)
        j = ni[np.argmax(p)]
        unvisited.remove(j)
        visited.add(j)
        i = j

    ind = np.array(list(visited))
    X_train_subset = X_train[:, ind]

    lr = BinaryRelevance(classifier = LogisticRegression())
    lr.fit(X_train_subset, y_train)
    lr_train_score = score(y_train, lr.predict(X_train_subset))

    nb = BinaryRelevance(classifier = GaussianNB())
    nb.fit(X_train_subset, y_train)
    nb_train_score = score(y_train, nb.predict(X_train_subset))

    knn = BinaryRelevance(classifier = KNeighborsClassifier())
    knn.fit(X_train_subset, y_train)
    knn_train_score = score(y_train, knn.predict(X_train_subset))

    acc = (lr_train_score + nb_train_score + knn_train_score) / 3

    if acc > best_acc:
        best_acc = acc
        fs_ind = ind[:]

print(fs_ind)

X_train_subset = X_train[:, fs_ind]
X_test_subset = X_test[:, fs_ind]

print("X_train_subset:", X_train_subset.shape, "y_train:", y_train.shape)
print("X_test_subset:", X_test_subset.shape, "y_test:", y_test.shape)

lr = BinaryRelevance(classifier = LogisticRegression())
lr.fit(X_train_subset, y_train)

lr_train_score = score(y_train, lr.predict(X_train_subset))
lr_test_score = score(y_test, lr.predict(X_test_subset))

print("lr train accu:", round(lr_train_score, 4))
print("lr test_accu:", round(lr_test_score, 4))

nb = BinaryRelevance(classifier = GaussianNB())
nb.fit(X_train_subset, y_train)

nb_train_score = score(y_train, nb.predict(X_train_subset))
nb_test_score = score(y_test, nb.predict(X_test_subset))

print("nb train accu:", round(nb_train_score, 4))
print("nb test_accu:", round(nb_test_score, 4))

knn = BinaryRelevance(classifier = KNeighborsClassifier())
knn.fit(X_train_subset, y_train)

knn_train_score = score(y_train, knn.predict(X_train_subset))
knn_test_score = score(y_test, knn.predict(X_test_subset))

print("knn train accu:", round(knn_train_score, 4))
print("knn test_accu:", round(knn_test_score, 4))

avg_train_score = (lr_train_score + nb_train_score + knn_train_score) / 3
avg_test_score = (lr_test_score + nb_test_score + knn_test_score) / 3

print("avg train accu:", round(avg_train_score, 4))
print("avg test_accu:", round(avg_test_score, 4))

100%|██████████| 40/40 [00:28<00:00,  1.41it/s]


[ 75 108  46 239  47 146  84  86 216 286]
X_train_subset: (1444, 10) y_train: (1444, 6)
X_test_subset: (963, 10) y_test: (963, 6)
lr train accu: 0.366
lr test_accu: 0.3191
nb train accu: 0.529
nb test_accu: 0.4826
knn train accu: 0.6377
knn test_accu: 0.4834
avg train accu: 0.5109
avg test_accu: 0.4284
