# CV for multilabel classifier
## IML task 2, subtask 1

### Import libraries

In [1]:
import pandas as pd
import numpy as np
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import time

In [2]:
## Start timer: record the wall-clock start time (seconds since the epoch);
## the last cell subtracts it from the current time to report total runtime.
start_time = time.time()

### Read data

In [3]:
## Locations of the feature table (already normalized and imputed) and the label table
train_feat_path = "../data/train_features_imp.csv"
train_lab_path = "../data/train_labels.csv"

## Load each table and sort it by `pid`, renumbering the index 0..n-1,
## so that rows in X and Y follow the same order
train_feat = pd.read_csv(train_feat_path).sort_values(by=['pid'], ignore_index=True)
train_lab = pd.read_csv(train_lab_path).sort_values(by=['pid'], ignore_index=True)

## Left-join the features onto the labels by `pid`: every label row is kept,
## and the label columns come before the feature columns in the result
train_Y_X = train_lab.merge(train_feat, how='left', on='pid')

In [4]:
## Keep a reproducible random 10% of the rows (fixed seed), then renumber them from 0.
## NOTE: this cell overwrites `train_Y_X` with its own subsample, so running it a
## second time without re-running the loading cell shrinks the data again.
sampled_rows = train_Y_X.sample(frac=0.10, random_state=123)
train_Y_X = sampled_rows.reset_index(drop=True)

In [5]:
## Split the joined table by column position into a label matrix and a feature matrix.
## Column 0 is skipped (presumably `pid`, the join key); columns 1-9 become Y and
## columns 11-286 become X, both as plain NumPy arrays.
## NOTE(review): column 10 ends up in neither Y nor X, and X starts at column 11 --
## if the label table has more than 10 label columns, X would contain label columns.
## Confirm these positions against `train_lab.columns` / `train_feat.columns`.
label_cols = slice(1, 10)
feature_cols = slice(11, 287)
Y_train = train_Y_X.iloc[:, label_cols].values
X_train = train_Y_X.iloc[:, feature_cols].values

### Define classifiers for CV search

In [6]:
## Search grid: a class-weight-balanced SVC as the base classifier,
## tried with 2 kernels x 8 values of C (16 candidates in total)
parameters = {
    'classifier': [SVC(class_weight='balanced')],
    'classifier__kernel': ('linear', 'rbf'),
    'classifier__C': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
}

## Binary-relevance wrapper around the base classifier (dense y, sparse-or-dense X allowed)
svc_relevance = BinaryRelevance(require_dense=[False, True])

## 5-fold grid search scored by micro-averaged F1; refit=True retrains the
## best candidate on all of the training data after the search
clf = GridSearchCV(
    estimator=svc_relevance,
    param_grid=parameters,
    scoring='f1_micro',
    refit=True,
    cv=5,
)

In [None]:
## Run the SVC grid search (16 candidates x 5 folds, plus one final refit) -- this is the slow cell
clf.fit(X=X_train, y=Y_train)

### Check out Random Forest classification

In [None]:
## Baseline comparison: cross-validate a random forest with default hyper-parameters.
## The grid holds a single candidate, so GridSearchCV is used here only to get
## 5-fold micro-F1 scores under the same protocol as the SVC search.
## random_state is fixed (same seed as the row sampling) because an unseeded
## forest gives different CV scores on every run.
parameters = {'classifier': [RandomForestClassifier(random_state=123)]}

## refit=False: only the CV results are kept, no final model is trained
clf_2 = GridSearchCV(BinaryRelevance(require_dense = [False, True]), 
                     parameters, 
                     scoring='f1_micro',
                     refit = False, 
                     cv = 5)

In [None]:
## Run the 5-fold cross-validation of the random forest baseline (no refit afterwards)
clf_2.fit(X=X_train, y=Y_train)

In [None]:
## Report the total wall-clock time since the timer cell, in minutes
elapsed_minutes = (time.time() - start_time) / 60
print("--- %s minutes ---" % elapsed_minutes)