# Notebook initialization

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

In [2]:
def load_features(directory):
    au_features = pd.read_csv('{}/{}/audio_features.csv'.format('../data/output/features',directory), index_col=0)
    im_features = pd.read_csv('{}/{}/image_features.csv'.format('../data/output/features',directory), index_col=0)
    
    # Drop redundant columns
    im_features = im_features.drop(['label'], axis=1)

    # Merge audio and image features
    features = pd.concat([au_features, im_features], axis=1)

    # Only look at clips less than 300s long
    features = features[features.length < 300]
    
    return features

# Labeled audios

## Import data

In [3]:
features = load_features('train')
features.head()

Unnamed: 0_level_0,audio_file,image_file,length,label,percent_silence,ring_count,last_ring_to_end,white_proportion
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,52.74,0,0.342474,0,52.704,0.8092
1,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,27.0,1,0.95818,0,26.928,0.977541
2,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,56.088,0,0.208273,0,56.016,0.843548
3,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,215.64,0,0.122387,0,215.568,0.810502
4,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,117.216,0,0.40766,2,107.154625,0.866311


## Data Preparation

In [4]:
# Split training and test set
from sklearn import cross_validation
columns = ['label', 'length', 'last_ring_to_end', 'percent_silence', 'ring_count', 'white_proportion']
train, test = cross_validation.train_test_split(features[columns], train_size=0.7, random_state=1000)
y_train = train['label']
X_train = train.drop('label', axis=1)
y_test = test['label']
X_test = test.drop('label', axis=1)

In [5]:
# Scale features
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

## Fit Models

### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)
list(zip(columns[1:], lr.coef_[0]))

[('length', -0.26393985742271975),
 ('last_ring_to_end', 0.031673473618075362),
 ('percent_silence', -0.16316215300369694),
 ('ring_count', 0.56038523425746611),
 ('white_proportion', 3.860985646925108)]

### Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=15, n_jobs=-1)
rf.fit(X_train_scaled, y_train)
list(zip(columns[1:], rf.feature_importances_))

[('length', 0.20264792225402536),
 ('last_ring_to_end', 0.2615507810794967),
 ('percent_silence', 0.15750871922069676),
 ('ring_count', 0.013155674370576721),
 ('white_proportion', 0.36513690307520447)]

### Support Vector classifier

In [8]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf', probability=True) # available kernels: linear, poly, rbf, sigmoid
svm.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## Model Predictions on test set

In [9]:
## Predict on the test set
from sklearn import metrics
for m in [lr, rf, svm]:
    X_test_scaled = scaler.transform(X_test)
    y_pred = m.predict(X_test_scaled)
    # Area under the curve
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
    print(metrics.roc_auc_score(y_test, y_pred))

0.90581544491
0.922379976842
0.918518693774


### Test each model with different metrics

In [10]:
from sklearn.pipeline import make_pipeline
# Features to use
columns = ['length', 'ring_count', 'last_ring_to_end', 'percent_silence', 'white_proportion']

X = features[columns]
y = features['label']

for m in [LogisticRegression(),RandomForestClassifier(n_estimators=20, n_jobs=-1), SVC(kernel='rbf')]:
    # First scale and then apply model
    clf = make_pipeline(preprocessing.StandardScaler(), m)
    print(m.__class__.__name__)
    
    # options for scoring: http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    for scorer in ['roc_auc', 'average_precision', 'recall', 'f1']:
        scores = cross_validation.cross_val_score(clf, X, y, cv=10, scoring=scorer, n_jobs=-1)
        print("\t{}: {:.2f} (+/- {:.2f})".format(scorer, scores.mean(), scores.std() * 2))

LogisticRegression
	roc_auc: 0.97 (+/- 0.02)
	average_precision: 0.96 (+/- 0.02)
	recall: 0.91 (+/- 0.03)
	f1: 0.90 (+/- 0.03)
RandomForestClassifier
	roc_auc: 0.97 (+/- 0.02)
	average_precision: 0.96 (+/- 0.02)
	recall: 0.90 (+/- 0.03)
	f1: 0.91 (+/- 0.02)
SVC
	roc_auc: 0.96 (+/- 0.02)
	average_precision: 0.95 (+/- 0.02)
	recall: 0.91 (+/- 0.03)
	f1: 0.91 (+/- 0.02)


# Unlabeled Audios

## Load data

In [11]:
unl_features = load_features('test')
unl_features.head()

Unnamed: 0_level_0,audio_file,image_file,length,label,percent_silence,ring_count,last_ring_to_end,white_proportion
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,55.8,,0.376077,2,43.251875,0.869002
1,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,65.88,,0.357495,0,65.808,0.819014
2,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,9.576,,0.07222,0,9.504,0.95542
3,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,5.256,,0.421754,0,5.184,0.970955
4,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,5.76,,0.471123,0,5.688,0.97103


## Clean data

In [12]:
# Get rid of label column
unl_features = unl_features.drop('label', axis=1)

## Check data

In [13]:
# Take a look at where nan values are
print(len(unl_features[unl_features.isnull().any(axis=1)]))
unl_features[unl_features.isnull().any(axis=1)].head()

2


Unnamed: 0_level_0,audio_file,image_file,length,percent_silence,ring_count,last_ring_to_end,white_proportion
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9746,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,29.664,0.014239,0,29.592,
12398,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,88.56,0.020017,2,49.359125,


In [14]:
# just drop the remaning rows with nan values
unl_features = unl_features.dropna()

## Reset index after cleanup

In [15]:
# reset index for concatenating predicted labels
unl_features = unl_features.reset_index(drop=True)
unl_features.head()

Unnamed: 0,audio_file,image_file,length,percent_silence,ring_count,last_ring_to_end,white_proportion
0,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,55.8,0.376077,2,43.251875,0.869002
1,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,65.88,0.357495,0,65.808,0.819014
2,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,9.576,0.07222,0,9.504,0.95542
3,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,5.256,0.421754,0,5.184,0.970955
4,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,5.76,0.471123,0,5.688,0.97103


In [16]:
# Selected models 
models = [
    LogisticRegression(),
    RandomForestClassifier(n_estimators=20, n_jobs=-1),
    SVC(kernel='rbf', probability=True),
]

## Data preparation

In [17]:
# Features to use
columns = ['length', 'ring_count', 'last_ring_to_end', 'percent_silence', 'white_proportion']

y_train_all = features['label']
X_train_all = features[columns]

# Scale features
scaler = preprocessing.StandardScaler()
X_train_all_scaled = scaler.fit_transform(X_train_all)
unl_features_scaled = scaler.transform(unl_features[columns])

## Model predictions

In [18]:
# Balanced predicted labels 
labels_pred = []
# Probability distribution to minimize false positives (discarded)
proba_pred = []
# Fit model and predict for unlabeled data
for m in models:
    print('Training', m.__class__.__name__)
    m.fit(X_train_all_scaled, y_train_all)
    labels_pred.append(m.predict(unl_features_scaled))
    proba_pred.append(m.predict_proba(unl_features_scaled))

Training LogisticRegression
Training RandomForestClassifier
Training SVC


## Balanced labels analysis

In [19]:
for labels in labels_pred:
    print(np.unique(labels, return_counts=True))

(array([0, 1]), array([10579,  8391]))
(array([0, 1]), array([11127,  7843]))
(array([0, 1]), array([10995,  7975]))


### Combine predictions from models to assign the final labels

In [20]:
# Combine labels from each predictor into a matrix (one row per predictor)
agg_labels = np.vstack(labels_pred)
# Sum each column of labels
al = np.sum(agg_labels, axis=0)

# If at least two predictors predict "1", then "1", else "0"
al[np.where(al <= 1)] = 0
al[np.where(al > 1)] = 1
final_labels = al

In [21]:
np.unique(final_labels, return_counts=True)

(array([0, 1]), array([10901,  8069]))

## Unbalanced labels analysis

In [22]:
# False positive threshold
threshold = 0.9
labels_pred_unbalanced = []
for proba in proba_pred:
    labels_model = []
    for p0, p1 in proba:
        label = 1 if p1 >= threshold else 0
        labels_model.append(label)
    labels_pred_unbalanced.append(labels_model)

In [23]:
for labels in labels_pred_unbalanced:
    print(np.unique(labels, return_counts=True))

(array([0, 1]), array([14233,  4737]))
(array([0, 1]), array([12910,  6060]))
(array([0, 1]), array([12728,  6242]))


### Combine predictions from models to assign the final labels

In [24]:
# Combine labels from each predictor into a matrix (one row per predictor)
agg_labels = np.vstack(labels_pred_unbalanced)
# Sum each column of labels
al = np.sum(agg_labels, axis=0)

# If at least two predictors predict "1", then "1", else "0"
al[np.where(al <= 1)] = 0
al[np.where(al > 1)] = 1
final_labels_unb = al

In [26]:
np.unique(final_labels_unb, return_counts=True)

(array([0, 1]), array([13175,  5795]))

In [27]:
# Create a dataFrame for the predicted labels
pred_labels = pd.DataFrame(final_labels, columns=['pred_label'])
pred_labels_unb = pd.DataFrame(final_labels_unb, columns=['pred_label90'])
predicted_labels = pd.concat([pred_labels, pred_labels_unb], axis=1)
predicted_labels.head()

Unnamed: 0,pred_label,pred_label90
0,0,0
1,0,0
2,1,1
3,1,1
4,1,1


In [28]:
unlabeled_final = pd.concat([unl_features, predicted_labels], axis=1)
unlabeled_final.head()

Unnamed: 0,audio_file,image_file,length,percent_silence,ring_count,last_ring_to_end,white_proportion,pred_label,pred_label90
0,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,55.8,0.376077,2,43.251875,0.869002,0,0
1,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,65.88,0.357495,0,65.808,0.819014,0,0
2,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,9.576,0.07222,0,9.504,0.95542,1,1
3,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,5.256,0.421754,0,5.184,0.970955,1,1
4,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,/Users/jjelosua/Developer/lanacion/ML_nisman/d...,5.76,0.471123,0,5.688,0.97103,1,1


# Export data

In [29]:
unlabeled_final.to_csv('../data/output/predicted/unlabeled_predicted.csv', index_label='index')