# Naïve baseline (based on curated data only)

Estimate probabilities of labels based on their overall frequency in the dataset only.

In [107]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from lwlwrap import calculate_overall_lwlrap_sklearn

from sklearn.preprocessing import MultiLabelBinarizer

## Import data

In [108]:
# Every competition file is read from the same input directory, so the path is
# kept in one place instead of being repeated in each read_csv call.
DATA_DIR = "../input/freesound-audio-tagging-2019"

# Training clips used for this baseline.
curated_df = pd.read_csv(DATA_DIR + "/train_curated.csv")
# The noisy training set is not used here. To include it, load it and add it to
# the list passed to pd.concat below:
# noisy_df = pd.read_csv(DATA_DIR + "/train_noisy.csv")

# Read for its 'fname' column and row count when the submission is built.
sample_df = pd.read_csv(DATA_DIR + "/sample_submission.csv")

# Training frame used by the rest of the notebook; currently only the curated data.
df = pd.concat([curated_df])

## Encode labels

In [109]:
# Each row's 'labels' value is a comma-separated string; split it into a list of tags.
label_lists = df["labels"].str.split(",")

# Fit the binarizer on those lists and encode them as a 0/1 indicator matrix
# with one row per clip and one column per class.
mlb = MultiLabelBinarizer()
true_labels = mlb.fit_transform(label_lists)

# Class names in the same order as the columns of true_labels.
all_classes = mlb.classes_

## Sanity checking of the lwlrap metric

The overall lwlrap score for a perfect submission should be 1:

In [110]:
# Score the ground truth against itself; the result is expected to be 1.
perfect_score = calculate_overall_lwlrap_sklearn(true_labels, true_labels)
perfect_score

1.0

The overall lwlrap for a submission of all zeros ought to be low:

In [111]:
# Score a submission that assigns 0 to every label of every clip.
all_zero_predictions = np.zeros(true_labels.shape, dtype=true_labels.dtype)
all_zero_score = calculate_overall_lwlrap_sklearn(true_labels, all_zero_predictions)
all_zero_score

0.016294332406120632

## Calculate probabilities

In [112]:
# Column-wise mean of the indicator matrix: the fraction of clips carrying each label.
label_means = true_labels.mean(axis=0)

# Give every clip in df the same prediction row: the per-label frequencies.
predicted_labels = np.tile(label_means, (len(df), 1))

## Estimate baseline score

In [113]:
# Score the constant label-frequency predictions against the true labels.
baseline_score = calculate_overall_lwlrap_sklearn(true_labels, predicted_labels)
baseline_score

0.019273728461877405

A slight improvement over the all-zeros submission.

## Generate baseline submission

In [114]:
# One identical row of label frequencies for every file listed in the sample submission.
submission_labels = np.tile(label_means, (len(sample_df), 1))

# Name the columns after the classes, in the binarizer's column order.
submission = pd.DataFrame(submission_labels, columns=mlb.classes_)

# Put the file names in front as the first column.
submission.insert(0, "fname", sample_df["fname"])

In [115]:
# Write the baseline predictions to disk; index=False keeps the row index out of the file.
output_path = "submission.csv"
submission.to_csv(output_path, index=False)