In [81]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [82]:
import sys
# Add the parent directory to the module search path so the `config` import
# below can be resolved (presumably config.py lives one level above this
# notebook — confirm against the repository layout).
sys.path.append("../")
# DATASETS_PATH is the base directory that later cells join with the labels
# file name and the training image folder name.
from config import DATASETS_PATH

In [83]:
# Location of the CSV that pairs every image name with its space-separated tags.
LABELS_PATH = os.path.join(DATASETS_PATH, 'labels.csv')

# Read the annotations into a DataFrame (default integer index, header from file).
labels_df = pd.read_csv(filepath_or_buffer=LABELS_PATH)

In [84]:
# Peek at the first five rows to see the column layout.
labels_df.head(n=5)

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


### Identify and fix errors

In [85]:
# Normalise blank cells first: any value that is empty or whitespace-only
# becomes NaN, so the null-based checks in the following cells also catch them.
# The replacement must be applied to `labels_df` itself; `labels` is only
# defined later (the vocabulary list) and is not a DataFrame, so calling
# `.replace(..., regex=True)` on it fails on a fresh kernel.
labels_df = labels_df.replace(r'^\s*$', np.nan, regex=True)

In [86]:
# Sanity check: the number of entries in the training image directory must
# equal the number of rows that carry a non-null image name.
TRAIN_IMGS_PATH = os.path.join(DATASETS_PATH, 'train')

train_dir_entries = os.listdir(TRAIN_IMGS_PATH)
n_images = len(train_dir_entries)

has_image_name = pd.notnull(labels_df["image_name"])
n_image_names = has_image_name.sum()

assert n_images == n_image_names

In [87]:
# Sanity check: no image name may occur more than once in the labels table.
# `duplicated()` flags every repeat after the first occurrence.
duplicate_mask = labels_df["image_name"].duplicated()
n_duplicated = duplicate_mask.sum()
assert n_duplicated == 0

In [88]:
# Sanity check: every image must have a (non-null) tag string assigned.
missing_tags_mask = pd.isnull(labels_df["tags"])
n_empty = missing_tags_mask.sum()
assert n_empty == 0

### Labels

In [94]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokenises each space-separated tag string into its individual tags.
# binary=True clips every non-zero count to 1, so the transformed matrix holds
# true 0/1 indicator values even if a tag is accidentally repeated in a row.
vectorizer = CountVectorizer(binary=True)

In [96]:
# Learn the vocabulary (the set of distinct labels) from the tag strings.
vectorizer.fit(labels_df['tags'].to_list())

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older installations.
# The result is converted to a plain list so `labels` keeps its original type.
if hasattr(vectorizer, "get_feature_names_out"):
    labels = list(vectorizer.get_feature_names_out())
else:
    labels = vectorizer.get_feature_names()
print(labels)

['agriculture', 'artisinal_mine', 'bare_ground', 'blooming', 'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation', 'habitation', 'haze', 'partly_cloudy', 'primary', 'road', 'selective_logging', 'slash_burn', 'water']


### Reformat dataset
Replace the single `tags` column with multiple indicator columns, one for each tag (label).

In [125]:
# Encode every tag string as a row of the sparse document-term matrix
# (one column per entry of `labels`).
indicator_values = vectorizer.transform(labels_df['tags'])

# Densify the matrix, name its columns after the labels, and place the image
# names in front. Both pieces carry the default integer index, so the
# column-wise concat lines the rows up one-to-one.
indicator_df = pd.concat(
    [
        labels_df["image_name"],
        pd.DataFrame(data=indicator_values.todense(), columns=labels),
    ],
    axis="columns",
)
indicator_df.head()

Unnamed: 0,image_name,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,train_0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,train_1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,train_2,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,train_3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,train_4,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0


### Label distribution

In [126]:
# TODO: sum the indicator rows to get per-label totals, then plot a histogram

### Co-occurrence matrix

In [127]:
# TODO: look up how to compute a label co-occurrence matrix