In [1]:
import joblib
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn import preprocessing

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor, ClassifierChain
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter
import re

Read files

In [2]:
labels = {}
with open("labels.txt", 'r') as file:
 for line in file:
 
    (value, key) = line.split()
    labels[int(key)] = value[:-1]

In [3]:
# vocabs = {}
# with open("vocabs.txt", 'r') as file:
#  for line in file:
 
#     (value, key) = line.split()
#     vocabs[int(key)] = value[:-1]

take all the line labels

In [4]:
train_label = np.loadtxt('train-label.dat', unpack = True)
y_train = pd.DataFrame(train_label.transpose(1, 0), columns=list(labels.values()))

test_label = np.loadtxt('test-label.dat', unpack = True)
y_test = pd.DataFrame(test_label.transpose(1, 0), columns=list(labels.values()))

get all the words from each sentence in a line

In [5]:
tr_lines = []
with open("train-data.dat", 'r') as file:
 for line in file:
 
    l = "".join(re.split("\<|\>", line)[::2])
  
    tr_lines.append(re.sub("\s\s+" , " ", l.rstrip()[2:]))

te_lines = []
with open("test-data.dat", 'r') as file:
 for line in file:
 
    l = "".join(re.split("\<|\>", line)[::2])
    te_lines.append(re.sub("\s\s+" , " ", l.rstrip()[2:]))

In [6]:
vectorizer = CountVectorizer()
X = vectorizer.fit(tr_lines+te_lines)

X_train = vectorizer.transform(tr_lines)
X_test = vectorizer.transform(te_lines)

In [7]:
X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)

Binary relevance

In [10]:
model = MultiOutputRegressor(LinearRegression())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [11]:
print(classification_report(np.argmax(y_test.to_numpy(), axis=1),np.argmax(y_pred, axis=1),zero_division='warn'))


              precision    recall  f1-score   support

           0       0.31      0.10      0.15      1203
           1       0.09      0.07      0.07       197
           2       0.27      0.10      0.15      1029
           3       0.05      0.07      0.06        70
           4       0.11      0.09      0.10       316
           5       0.04      0.06      0.05       104
           6       0.09      0.10      0.09       302
           7       0.02      0.07      0.03       123
           8       0.07      0.12      0.09       182
           9       0.01      0.04      0.02        53
          10       0.04      0.19      0.06        73
          11       0.00      0.00      0.00        32
          12       0.03      0.05      0.04        58
          13       0.00      0.00      0.00         8
          14       0.06      0.06      0.06       100
          15       0.02      0.06      0.03        33
          16       0.00      0.00      0.00        20
          17       0.00    

In [8]:
model = MultiOutputRegressor(LogisticRegression(random_state=12))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
print(classification_report(np.argmax(y_test.to_numpy(), axis=1),np.argmax(y_pred, axis=1),zero_division='warn'))

              precision    recall  f1-score   support

           0       0.42      0.76      0.54      1203
           1       0.83      0.15      0.25       197
           2       0.49      0.34      0.40      1029
           3       0.33      0.04      0.08        70
           4       0.28      0.21      0.24       316
           5       0.28      0.07      0.11       104
           6       0.31      0.15      0.20       302
           7       0.18      0.22      0.19       123
           8       0.35      0.33      0.34       182
           9       0.18      0.08      0.11        53
          10       0.21      0.21      0.21        73
          11       0.12      0.12      0.12        32
          12       0.23      0.12      0.16        58
          13       0.00      0.00      0.00         8
          14       0.42      0.28      0.34       100
          15       0.20      0.12      0.15        33
          16       0.05      0.05      0.05        20
          17       0.19    

In [12]:
model = MultiOutputRegressor(SVC(kernel='linear'))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [13]:
print(classification_report(np.argmax(y_test.to_numpy(), axis=1),np.argmax(y_pred, axis=1),zero_division='warn'))

              precision    recall  f1-score   support

           0       0.46      0.76      0.57      1203
           1       0.83      0.25      0.39       197
           2       0.50      0.36      0.42      1029
           3       0.28      0.07      0.11        70
           4       0.24      0.20      0.22       316
           5       0.29      0.06      0.10       104
           6       0.37      0.15      0.22       302
           7       0.18      0.20      0.19       123
           8       0.33      0.36      0.34       182
           9       0.20      0.15      0.17        53
          10       0.21      0.25      0.23        73
          11       0.11      0.16      0.13        32
          12       0.22      0.16      0.18        58
          13       0.00      0.00      0.00         8
          14       0.39      0.36      0.37       100
          15       0.18      0.15      0.16        33
          16       0.09      0.10      0.10        20
          17       0.21    

Classifier Chain

In [19]:
model =  ClassifierChain(LogisticRegression(random_state=12, max_iter=150))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [16]:
print(classification_report(np.argmax(y_test.to_numpy(), axis=1),np.argmax(y_pred, axis=1),zero_division='warn'))

              precision    recall  f1-score   support

           0       0.39      0.81      0.53      1203
           1       0.83      0.15      0.26       197
           2       0.50      0.31      0.38      1029
           3       0.60      0.04      0.08        70
           4       0.31      0.17      0.22       316
           5       0.33      0.01      0.02       104
           6       0.31      0.17      0.22       302
           7       0.17      0.09      0.12       123
           8       0.33      0.31      0.32       182
           9       0.22      0.08      0.11        53
          10       0.20      0.16      0.18        73
          11       0.14      0.09      0.11        32
          12       0.32      0.12      0.17        58
          13       0.00      0.00      0.00         8
          14       0.41      0.29      0.34       100
          15       0.27      0.09      0.14        33
          16       0.09      0.05      0.06        20
          17       0.21    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
model = MultiOutputRegressor(SVC(kernel='linear'))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [18]:
print(classification_report(np.argmax(y_test.to_numpy(), axis=1),np.argmax(y_pred, axis=1),zero_division='warn'))

              precision    recall  f1-score   support

           0       0.46      0.76      0.57      1203
           1       0.83      0.25      0.39       197
           2       0.50      0.36      0.42      1029
           3       0.28      0.07      0.11        70
           4       0.24      0.20      0.22       316
           5       0.29      0.06      0.10       104
           6       0.37      0.15      0.22       302
           7       0.18      0.20      0.19       123
           8       0.33      0.36      0.34       182
           9       0.20      0.15      0.17        53
          10       0.21      0.25      0.23        73
          11       0.11      0.16      0.13        32
          12       0.22      0.16      0.18        58
          13       0.00      0.00      0.00         8
          14       0.39      0.36      0.37       100
          15       0.18      0.15      0.16        33
          16       0.09      0.10      0.10        20
          17       0.21    

Apart from the linear regression which was only used in binary relevance (precision in class 1 was 0.09 and all the others has 0.83), the results are about the same with some classes having a little better results than others in each case and some worse. Both methods could not find classes with few examples which could be fixed with imbalance methods which are out of the scope of this project.

It is difficult to describe which one is better just from this.
One thing we can say is that because we already have a lot of columns in this dataset it would be better to pick a method that does not add more, also if the classifier  chain method yields wrong results in the first classes, it might have a higher chance to predict the rest of the classes wrong, so for example the class 18 has a better accuracy in the binary relevance method.
For these reasons we think that its better to pick the binary relevance method for this dataset and the little preprossesing that has been done.

!!! Could not use the mill library to chech how to use the files

In [57]:
!pip install mil -U



In [58]:
# # importing all the datasets modules
# from mil.data.datasets import musk1, musk2, protein, elephant, corel_dogs, \
#                               ucsb_breast_cancer, web_recommendation_1, birds_brown_creeper, \
#                               mnist_bags
# from mil.bag_representation import MILESMapping, DiscriminativeMapping, ArithmeticMeanBagRepresentation, \
#                                    MedianBagRepresentation, GeometricMeanBagRepresentation, MinBagRepresentation, \
#                                    MaxBagRepresentation, MeanMinMaxBagRepresentation

In [59]:
# (bags_train, y_train), (bags_test, y_test) = ucsb_breast_cancer.load()

FileNotFoundError: ignored