# Sub-task 1 with Word2Vec

## Imports

In [1]:
%run import_data.ipynb
import pandas as pd
from sklearn.metrics import multilabel_confusion_matrix,recall_score, accuracy_score, hamming_loss, precision_score, f1_score
from utils.evaluation import evaluate_pipeline_x_validation
from embeddings.word2vec import Word2VecMean
from sklearn.feature_extraction.text import TfidfVectorizer

## Creating a pipeline

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid

word2vec = Word2VecMean()

# creating a pipeline to run on each classifier.
pipeline = Pipeline([
    ('w2v', word2vec),
    ('clf', ClfSwitcher())
])

## Training Experiences

### Native Algorithms

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

#### Decision Tree

In [8]:
class_weigth = [None,'balanced']

estimators = [DecisionTreeClassifier(class_weight=weigth) for weigth in class_weigth]

grid = ParameterGrid({
    'clf__estimator': estimators,
})

models = ['unbalanced1', 'unbalanced2', 'balanced1', 'balanced2']

evaluate_pipeline_x_validation(pipeline, grid, models, X_train, y_train)

 50%|█████     | 2/4 [02:04<02:04, 62.18s/it]


Unnamed: 0,model,f1-micro
0,unbalanced1,0.327165
1,unbalanced2,0.294734


In [17]:
decision_tree_best = grid[0]

pipeline.set_params(**decision_tree_best)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f"F1-micro score for DecisionTree is {f1_score(y_test, y_pred, average='micro', zero_division=0)}")
print(f"F1-macro score for DecisionTree is {f1_score(y_test, y_pred, average='macro', zero_division=0)}")

F1-micro score for DecisionTree is 0.29752066115702475
F1-macro score for DecisionTree is 0.10214419814585282


#### Extra trees

In [18]:
max_features = [None,5,10,20]

estimators = [ExtraTreeClassifier(max_features=feat) for feat in max_features]

grid = ParameterGrid({
    'clf__estimator': estimators
})

models = ['ExtraMaxNone', 'ExtraMax5', 'ExtraMax10', 'ExtraMax20']

evaluate_pipeline_x_validation(pipeline, grid, models, X_train, y_train)

100%|██████████| 4/4 [03:29<00:00, 52.26s/it]


Unnamed: 0,model,f1-micro
0,ExtraMaxNone,0.307583
1,ExtraMax5,0.301104
2,ExtraMax10,0.302966
3,ExtraMax20,0.300559


In [26]:
extra_tree_best = grid[0]

pipeline.set_params(**extra_tree_best)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f"F1-micro score for Extra Trees is {f1_score(y_test, y_pred, average='micro', zero_division=0)}")
print(f"F1-macro score for Extra Trees is {f1_score(y_test, y_pred, average='macro', zero_division=0)}")

F1-micro score for Extra Trees is 0.2581560283687943
F1-macro score for Extra Trees is 0.07578959353720192


#### KNN


In [27]:
n_neighbors = [5,10,20,50]

estimators = [KNeighborsClassifier(n_neighbors=n) for n in n_neighbors]

grid = ParameterGrid({
    'clf__estimator': estimators
})

models = ['Knn5_', 'Knn10_', 'Knn20_', 'Knn50_']

evaluate_pipeline_x_validation(pipeline, grid, models, X_train, y_train)

100%|██████████| 4/4 [03:45<00:00, 56.36s/it]


Unnamed: 0,model,f1-micro
0,Knn5_,0.447417
1,Knn10_,0.443302
2,Knn20_,0.454261
3,Knn50_,0.457072


In [29]:
knn_best = grid[2]

pipeline.set_params(**knn_best)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f"F1-micro score for KNN is {f1_score(y_test, y_pred, average='micro', zero_division=0)}")
print(f"F1-macro score for KNN is {f1_score(y_test, y_pred, average='macro', zero_division=0)}")

F1-micro score for KNN is 0.4451718494271686
F1-macro score for KNN is 0.07417863362450243


### Binary Relevance

In [4]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

classifier = [BinaryRelevance(classifier = DecisionTreeClassifier()),
              BinaryRelevance(classifier = ExtraTreeClassifier()),
              BinaryRelevance(classifier = KNeighborsClassifier(n_neighbors=20)),
              BinaryRelevance(classifier = GaussianNB())]

grid = ParameterGrid({
    'clf__estimator': classifier
})

models = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8']

evaluate_pipeline_x_validation(pipeline, grid, models, X_train, y_train)

 50%|█████     | 4/8 [04:23<04:23, 65.76s/it]


Unnamed: 0,model,f1-micro
0,m1,0.316847
1,m2,0.307348
2,m3,0.454261
3,m4,0.379484


In [5]:
from sklearn.metrics import precision_score

for index, params in enumerate(grid, start=1):
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"F1-micro score for BinaryRelevance with model M{index} is {f1_score(y_test, y_pred, average='micro', zero_division=0)}")
    print(f"F1-macro score for BinaryRelevance with model M{index} is {f1_score(y_test, y_pred, average='macro', zero_division=0)}")
    class_recall = precision_score(y_true=y_test, y_pred=y_pred, average=None, zero_division=0)
    print(class_recall)

F1-micro score for BinaryRelevance with model M1 is 0.2949547218628719
F1-macro score for BinaryRelevance with model M1 is 0.09874350751654899
[0.         0.0625     0.         0.         0.         0.0952381
 0.16       0.06666667 0.         0.56481481 0.         0.34375
 0.         0.         0.         0.         0.125      0.27536232
 0.         0.0952381 ]
F1-micro score for BinaryRelevance with model M2 is 0.32697547683923706
F1-macro score for BinaryRelevance with model M2 is 0.09604054198755371
[0.         0.         0.         0.         0.         0.15
 0.13043478 0.         0.         0.60576923 0.         0.34090909
 0.         0.         0.         0.         0.11764706 0.32692308
 0.         0.11764706]
F1-micro score for BinaryRelevance with model M3 is 0.4451718494271686
F1-macro score for BinaryRelevance with model M3 is 0.07417863362450243
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.59615385 0.         0.40625
 0.    

### Label Powerset

In [6]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

classifier = [LabelPowerset(classifier = DecisionTreeClassifier()),
              LabelPowerset(classifier = KNeighborsClassifier(n_neighbors=20)),
              LabelPowerset(classifier = GaussianNB())]

grid = ParameterGrid({
    'clf__estimator': classifier
})


models = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6']

evaluate_pipeline_x_validation(pipeline, grid, models, X_train, y_train)

 50%|█████     | 3/6 [02:47<02:47, 55.82s/it]


Unnamed: 0,model,f1-micro
0,m1,0.32741
1,m2,0.450436
2,m3,0.424347


In [8]:
for index, params in enumerate(grid, start=1):
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(f"F1-micro score for Label Powerset with model M{index} is {f1_score(y_test, y_pred, average='micro', zero_division=0)}")
    print(f"F1-macro score for Label Powerset with model M{index} is {f1_score(y_test, y_pred, average='macro', zero_division=0)}")
    class_recall = precision_score(y_true=y_test, y_pred=y_pred, average=None, zero_division=0)
    print(class_recall)

F1-micro score for Label Powerset with model M1 is 0.3046153846153846
F1-macro score for Label Powerset with model M1 is 0.08990277050464493
[0.         0.16666667 0.         0.         0.         0.07142857
 0.06666667 0.         0.         0.6344086  0.         0.24242424
 0.         0.         0.         0.         0.16666667 0.25757576
 0.         0.3       ]
F1-micro score for Label Powerset with model M2 is 0.41486068111455116
F1-macro score for Label Powerset with model M2 is 0.08276372750056961
[0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.60305344 0.         0.34482759
 0.         0.         0.         0.         0.         0.27272727
 0.         0.        ]
F1-micro score for Label Powerset with model M3 is 0.3999999999999999
F1-macro score for Label Powerset with model M3 is 0.09062047932232711
[0.         0.         0.         0.         0.         0.42857143
 0.5        0.         0.         0.65625    0.         0.375
 0.  