In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append('../')

from sklearn.externals import joblib
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score

import pandas as pd
import numpy as np


import paths

In [None]:
# Tag vocabulary. A tag's position in this list is the integer it is mapped to
# when the label matrix is built from train.csv.
# NOTE(review): 'artisinal_mine' is presumably spelled the way the tag appears
# in train.csv -- confirm against the data before "correcting" it.
labels = [
    'agriculture',
    'artisinal_mine',
    'bare_ground',
    'blooming',
    'blow_down',
    'clear',
    'cloudy',
    'conventional_mine',
    'cultivation',
    'habitation',
    'haze',
    'partly_cloudy',
    'primary',
    'road',
    'selective_logging',
    'slash_burn',
    'water',
]

In [None]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only point this at feature dumps you produced yourself or otherwise trust.
# Presumably pre-extracted VGG global-average-pool features for the training
# images (going by the file name) -- confirm against whatever wrote the file.
features_path = os.path.join(paths.DATA_FOLDER, 'vgg_glob_avg_pool_features_train.pkl')
train_data = joblib.load(features_path)

In [None]:
train_csv = pd.read_csv(os.path.join(paths.DATA_FOLDER, 'train.csv'))
# Turn each row's space-separated tag string into a list of indices into `labels`.
# A tag that is not in `labels` makes list.index raise ValueError.
# NOTE(review): assumes the rows of train.csv are in the same order as the rows
# of train_data -- confirm against the feature-extraction step.
train_label_indices = train_csv.tags.str.split(' ').apply(lambda l: [labels.index(e) for e in l]).values
# Pin the class list instead of letting the binarizer infer it from the data:
# otherwise a label that never occurs in train.csv silently drops its column and
# column i stops corresponding to labels[i]. With every label present the output
# is the same as with inferred classes.
train_labels = MultiLabelBinarizer(classes=list(range(len(labels)))).fit_transform(train_label_indices)

In [None]:
# Hold out 20% of the samples for validation. The seed is fixed so the split,
# and every score computed from it, is reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
    train_data, train_labels, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# One-vs-rest: an independent binary logistic regression is fitted per label column.
# n_jobs=-2 runs the per-label fits in parallel on all CPUs but one.
base_estimator = LogisticRegression()
clf = OneVsRestClassifier(base_estimator, n_jobs=-2)
clf.fit(X_train, y_train)

In [None]:
# Per-label probabilities for the validation split; they are turned into binary
# tag predictions by thresholding in the F2 sweep further down.
preds = clf.predict_proba(X_val)

In [None]:
# Per-label probabilities for the training split itself; scored against y_train
# further down (presumably to compare with the validation scores).
train_preds = clf.predict_proba(X_train)

In [None]:
# https://www.kaggle.com/anokas/fixed-f2-score-in-python/comments/code
def f2_score(y_true, y_pred):
    """Return the sample-averaged F2 score of ``y_pred`` against ``y_true``.

    Args:
        y_true: ground-truth label indicators; anything ``np.array`` accepts.
        y_pred: predicted label indicators, laid out like ``y_true``.

    Returns:
        The result of ``fbeta_score`` with ``beta=2`` and ``average='samples'``.
    """
    # Convert up front: fbeta_score raises a confusing error on non-array inputs.
    truth = np.array(y_true)
    predicted = np.array(y_pred)
    # average='samples' is required here; any other averaging mode gives bogus results.
    return fbeta_score(truth, predicted, average='samples', beta=2)

In [None]:
# Sweep decision thresholds on the validation split: a label counts as predicted
# when its probability is >= threshold.
for threshold in [0.08, 0.1, 0.2, 0.3, 0.4, 0.5]:
    val_f2 = f2_score(y_val, preds >= threshold)
    # One pre-formatted string prints "threshold score" identically whether this
    # line is parsed as a Python 2 print statement or a Python 3 print() call.
    print('%s %s' % (threshold, val_f2))

In [None]:
# Same threshold sweep on the training split (scores the model on the data it
# was fitted on). Note this list has no 0.08 entry, unlike the validation sweep.
for threshold in [0.1, 0.2, 0.3, 0.4, 0.5]:
    train_f2 = f2_score(y_train, train_preds >= threshold)
    # One pre-formatted string prints "threshold score" identically whether this
    # line is parsed as a Python 2 print statement or a Python 3 print() call.
    print('%s %s' % (threshold, train_f2))