# Notebook initialization

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np

In [None]:
def load_features(directory, base_dir='../data/output/features', max_length=300):
    """Load and merge the audio and image feature tables of one data split.

    Reads ``audio_features.csv`` and ``image_features.csv`` from
    ``<base_dir>/<directory>/`` (the first CSV column becomes the index),
    drops the image table's ``label`` column so it is not duplicated,
    concatenates both tables column-wise and keeps only the rows whose
    ``length`` is strictly below ``max_length``.

    Parameters
    ----------
    directory : str
        Sub-directory of ``base_dir`` that holds the two CSV files.
    base_dir : str, optional
        Root folder of the feature files. Defaults to the location the
        notebook has always used.
    max_length : float, optional
        Exclusive upper bound on the ``length`` column. Defaults to 300.

    Returns
    -------
    pandas.DataFrame
        Audio columns followed by the remaining image columns.
    """
    folder = '{}/{}'.format(base_dir, directory)
    au_features = pd.read_csv('{}/audio_features.csv'.format(folder), index_col=0)
    im_features = pd.read_csv('{}/image_features.csv'.format(folder), index_col=0)

    # The label is already carried by the audio table
    im_features = im_features.drop(['label'], axis=1)

    # Column-wise merge; rows are aligned on the index read from the CSVs
    features = pd.concat([au_features, im_features], axis=1)

    # Bracket access cannot be shadowed by a DataFrame attribute named like the column
    return features[features['length'] < max_length]

# Labeled Audios

## Import data

In [None]:
# Load the merged audio/image feature table stored under the 'train' folder
features = load_features('train')
# Show the first rows as a quick sanity check
features.head()

## Data Preparation

In [None]:
# Split training and test set
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# the same helpers live in sklearn.model_selection. The old name stays bound so that
# other cells calling cross_validation.* keep working on either version.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn < 0.18 has no model_selection module
    from sklearn import cross_validation
# 'label' is the target; the remaining entries are the predictor columns
columns = ['label', 'length', 'last_ring_to_end', 'percent_silence', 'ring_count', 'white_proportion']
# 70% of the rows go to training; the fixed seed makes the split reproducible
train, test = cross_validation.train_test_split(features[columns], train_size=0.7, random_state=1000)
y_train = train['label']
X_train = train.drop('label', axis=1)
y_test = test['label']
X_test = test.drop('label', axis=1)

In [None]:
# Standardize the predictors; the scaler is fitted on the training split only
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

## Fit Models

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the scaled training data
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
# Pair each coefficient with the column the model was actually trained on.
# X_train.columns is used instead of the global `columns` list because later
# cells rebind that list with a different order and without 'label'.
list(zip(X_train.columns, lr.coef_[0]))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 15-tree random forest; n_jobs=-1 is passed to parallelize across cores
rf = RandomForestClassifier(n_estimators=15, n_jobs=-1)
rf.fit(X_train_scaled, y_train)
# Pair each importance with the column the model was actually trained on.
# X_train.columns is used instead of the global `columns` list because later
# cells rebind that list with a different order and without 'label'.
list(zip(X_train.columns, rf.feature_importances_))

### Support Vector Classifier

In [None]:
from sklearn.svm import SVC
# RBF-kernel support vector classifier; probability=True requests probability estimates
svm = SVC(kernel='rbf', probability=True) # available kernels: linear, poly, rbf, sigmoid
# Fit on the same scaled training data as the other models
svm.fit(X_train_scaled, y_train)

## Model Predictions on test set

In [None]:
## Evaluate each fitted model on the held-out test set
from sklearn import metrics
# The scaled test matrix is identical for every model, so compute it once.
# The scaler fitted on the training split is reused to avoid leaking test statistics.
X_test_scaled = scaler.transform(X_test)
for m in [lr, rf, svm]:
    # ROC AUC ranks samples by a continuous score; feeding it hard 0/1 predictions
    # collapses the curve to a single operating point. Use the predicted probability
    # of the second class instead (assumes binary labels whose larger value is the
    # positive class).
    y_score = m.predict_proba(X_test_scaled)[:, 1]
    # Area under the curve
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
    print(m.__class__.__name__, metrics.roc_auc_score(y_test, y_score))

### Test each model with different metrics

In [None]:
from sklearn.pipeline import make_pipeline
# Predictor columns used for the cross-validated comparison
columns = ['length', 'ring_count', 'last_ring_to_end', 'percent_silence', 'white_proportion']

X, y = features[columns], features['label']

# options for scoring: http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
scorers = ['roc_auc', 'average_precision', 'recall', 'f1']
candidates = [
    LogisticRegression(),
    RandomForestClassifier(n_estimators=20, n_jobs=-1),
    SVC(kernel='rbf'),
]

for m in candidates:
    # Scaling is part of the pipeline, so it is re-fitted inside every CV fold
    clf = make_pipeline(preprocessing.StandardScaler(), m)
    print(m.__class__.__name__)

    for scorer in scorers:
        # 10-fold cross-validation of the whole pipeline for this metric
        scores = cross_validation.cross_val_score(clf, X, y, cv=10, scoring=scorer, n_jobs=-1)
        mean_score = scores.mean()
        two_std = scores.std() * 2
        print("\t{}: {:.2f} (+/- {:.2f})".format(scorer, mean_score, two_std))

# Unlabeled Audios

## Load data

In [None]:
# Load the merged audio/image feature table stored under the 'test' folder
unl_features = load_features('test')
# Show the first rows as a quick sanity check
unl_features.head()

## Clean data

In [None]:
# Remove the 'label' column from the unlabeled table; labels are predicted below
unl_features = unl_features.drop('label', axis=1)

## Check data

In [None]:
# Select the rows of the unlabeled table that contain at least one NaN
rows_with_nan = unl_features[unl_features.isnull().any(axis=1)]
# How many such rows there are, followed by a preview of them
print(len(rows_with_nan))
rows_with_nan.head()

In [None]:
# Just drop the remaining rows that contain NaN values
unl_features = unl_features.dropna()

## Reset index after cleanup

In [None]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded) so the
# rows line up positionally with the predicted labels concatenated later
unl_features = unl_features.reset_index(drop=True)
unl_features.head()

In [None]:
# Models whose predictions are combined by majority vote further down.
# SVC gets probability=True because predict_proba is called on every model.
models = [
    LogisticRegression(),
    RandomForestClassifier(n_estimators=20, n_jobs=-1),
    SVC(kernel='rbf', probability=True),
]

## Data preparation

In [None]:
# Predictor columns shared by the labeled and the unlabeled tables
columns = ['length', 'ring_count', 'last_ring_to_end', 'percent_silence', 'white_proportion']

# Train on every labeled row this time (no hold-out split)
X_train_all = features[columns]
y_train_all = features['label']

# Fit the scaler on the labeled data only, then apply the same transformation
# to both the labeled and the unlabeled predictors
scaler = preprocessing.StandardScaler().fit(X_train_all)
X_train_all_scaled = scaler.transform(X_train_all)
unl_features_scaled = scaler.transform(unl_features[columns])

## Model predictions

In [None]:
# labels_pred: hard class labels from each model's own predict() (the "balanced" labels)
# proba_pred: per-class probabilities from each model, thresholded later to limit
#             false positives
# Both lists hold one entry per model, in the order of `models`.
labels_pred, proba_pred = [], []

# Fit every model on the full labeled set and score the unlabeled rows
for m in models:
    print('Training', m.__class__.__name__)
    m.fit(X_train_all_scaled, y_train_all)
    hard_labels = m.predict(unl_features_scaled)
    class_probas = m.predict_proba(unl_features_scaled)
    labels_pred.append(hard_labels)
    proba_pred.append(class_probas)

## Balanced labels analysis

In [None]:
# Per model: the distinct predicted labels and how often each one occurs
for labels in labels_pred:
    print(np.unique(labels, return_counts=True))

### Combine predictions from models to assign the final labels

In [None]:
# Stack the per-model label vectors into a matrix (one row per predictor)
agg_labels = np.vstack(labels_pred)
# Number of predictors voting "1" for each sample
al = agg_labels.sum(axis=0)

# Majority vote: "1" when at least two predictors say "1", otherwise "0"
# (the vote count's dtype is kept for the resulting label vector)
al = (al > 1).astype(al.dtype)
final_labels = al

In [None]:
# Distribution of the majority-vote labels (distinct values and their counts)
np.unique(final_labels, return_counts=True)

## Unbalanced labels analysis

In [None]:
# Minimum probability in the second predict_proba column required to assign "1";
# a high value trades recall for fewer false positives
threshold = 0.9
# One list of 0/1 labels per model; each probability row is a (p0, p1) pair
labels_pred_unbalanced = [
    [1 if p1 >= threshold else 0 for _, p1 in proba]
    for proba in proba_pred
]

In [None]:
# Per model: the distinct thresholded labels and how often each one occurs
for labels in labels_pred_unbalanced:
    print(np.unique(labels, return_counts=True))

### Combine predictions from models to assign the final labels

In [None]:
# Stack the per-model thresholded label vectors into a matrix (one row per predictor)
agg_labels = np.vstack(labels_pred_unbalanced)
# Number of predictors voting "1" for each sample
al = agg_labels.sum(axis=0)

# Majority vote: "1" when at least two predictors say "1", otherwise "0"
# (the vote count's dtype is kept for the resulting label vector)
al = (al > 1).astype(al.dtype)
final_labels_unb = al

In [None]:
# Distribution of the thresholded majority-vote labels (distinct values and their counts)
np.unique(final_labels_unb, return_counts=True)

In [None]:
# Create a DataFrame for the predicted labels:
#   pred_label   - majority vote over each model's predict() output
#   pred_label90 - majority vote over the labels thresholded on predicted probability
pred_labels = pd.DataFrame(final_labels, columns=['pred_label'])
pred_labels_unb = pd.DataFrame(final_labels_unb, columns=['pred_label90'])
# Place both label columns side by side
predicted_labels = pd.concat([pred_labels, pred_labels_unb], axis=1)
predicted_labels.head()

In [None]:
# Append the predicted labels to the unlabeled features. Rows are matched on the
# index, which is why unl_features was given a fresh 0..n-1 index earlier.
unlabeled_final = pd.concat([unl_features, predicted_labels], axis=1)
unlabeled_final.head()

# Export data

In [None]:
# Write features plus predicted labels to CSV; the index column is named 'index'
unlabeled_final.to_csv('../data/output/predicted/unlabeled_predicted.csv', index_label='index')