In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

In [None]:
import sys
sys.path.append("../")
from config import DATASETS_PATH, ASSETS_PATH, COLOURS

In [None]:
# Folder under ASSETS_PATH where this notebook saves its figures
MY_ASSETS_PATH = os.path.join(ASSETS_PATH, 'data_exploration_and_analysis')
# Raw labels CSV; its 'image_name' and 'tags' columns are used throughout the notebook
LABELS_PATH = os.path.join(DATASETS_PATH, 'raw_train_validation_labels.csv')
labels_df = pd.read_csv(LABELS_PATH)

In [None]:
# Preview the first rows to check the column layout of the labels CSV
labels_df.head()

### Identify and fix errors

In [None]:
# Normalise blanks: any cell that is empty or whitespace-only becomes NaN,
# so the null checks in the following cells treat it as missing
labels_df = labels_df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [None]:
# Sanity check: the number of directory entries in train/ and validation/
# must equal the number of non-null image names in the labels file
TRAIN_IMGS_PATH = os.path.join(DATASETS_PATH, 'train')
VALIDATION_IMGS_PATH = os.path.join(DATASETS_PATH, 'validation')
n_images = sum(len(os.listdir(folder)) for folder in (TRAIN_IMGS_PATH, VALIDATION_IMGS_PATH))
n_image_names = labels_df["image_name"].notna().sum()
assert n_images == n_image_names

In [None]:
# No image name may appear in more than one row of the labels file
n_duplicated = labels_df.duplicated(subset="image_name").sum()
assert n_duplicated == 0

In [None]:
# Every row must have a non-missing 'tags' value
n_empty = labels_df["tags"].isna().sum()
assert n_empty == 0

### Labels

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer with default settings; it is fitted on the 'tags' strings further down
# to obtain the label vocabulary and, later, the per-image indicator columns
vectorizer = CountVectorizer()

In [None]:
# labels in the dataset
vectorizer.fit(labels_df['tags'].to_list())
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out` and fall back only on older installs.
# `.tolist()` keeps `labels` a plain list of str, as the rest of the notebook expects.
if hasattr(vectorizer, 'get_feature_names_out'):
    labels = vectorizer.get_feature_names_out().tolist()
else:
    labels = vectorizer.get_feature_names()
print(labels)

### Reformat dataset
Replace the single column of tags with multiple columns of indicator variables, one for each tag (label).

In [None]:
# first remove any possible duplicates
def remove_string_duplicates(s):
    """Return `s` with repeated whitespace-separated tokens removed.

    The first occurrence of every token is kept and the original token order
    is preserved, so the result is the same on every run (joining a `set`
    would yield an arbitrary, hash-dependent order).

    Parameters
    ----------
    s : str
        Whitespace-separated tokens, e.g. ``"road water road"``.

    Returns
    -------
    str
        The unique tokens joined by single spaces, e.g. ``"road water"``.
    """
    # dict keys are unique and keep insertion order
    return ' '.join(dict.fromkeys(s.split()))

# Clean every row's tag string so that each tag occurs at most once per image
labels_df['tags'] = labels_df['tags'].map(remove_string_duplicates)

In [None]:
# One row per image, one column per label (columns follow the order of `labels`)
indicator_values = vectorizer.transform(labels_df['tags'])
indicator_df = pd.DataFrame(
    # toarray() yields a plain ndarray instead of the legacy np.matrix from todense()
    data=indicator_values.toarray(),
    # reuse labels_df's index so the column-wise concat below lines rows up
    # even if labels_df does not have a default 0..n-1 index
    index=labels_df.index,
    columns=labels,
)
# Original columns first, then the indicator columns
indicator_df = pd.concat([labels_df, indicator_df], axis="columns")

In [None]:
# Preview the reformatted frame: original columns followed by one indicator column per label
indicator_df.head()

### Label distribution

In [None]:
# Column sums of the indicator columns, ordered from largest to smallest
label_count = indicator_df[labels].sum(axis=0).sort_values(ascending=False)
# Label names in that same descending order, reused for plotting and column ordering
sorted_labels = label_count.index.tolist()

In [None]:
# Bar chart of label frequencies, ordered from the most to the least common label
fig_label_distribution, ax_label_distribution = plt.subplots(figsize=(20, 12))
ax_label_distribution.bar(sorted_labels, label_count, color=COLOURS['blue'])
# tilt the label names so they do not overlap
ax_label_distribution.tick_params(axis='x', labelrotation=45)
ax_label_distribution.set_xlabel('Labels')
ax_label_distribution.set_ylabel('Frequency')

In [None]:
# Write the bar chart to label_distribution.png under MY_ASSETS_PATH at 300 dpi
label_distribution_path = os.path.join(MY_ASSETS_PATH, 'label_distribution.png')
fig_label_distribution.savefig(label_distribution_path, dpi=300)

Identify the percentage of images that carry at least one of the common labels, and at least one of the rare labels.

In [None]:
# Hand-picked groups of label names used for the percentage summary below
common_labels = [
    'agriculture',
    'road',
    'water',
    'cultivation',
    'habitation',
    'bare_ground',
]
rare_labels = [
    'selective_logging',
    'artisinal_mine',
    'blooming',
    'slash_burn',
    'blow_down',
    'conventional_mine',
]

def count_proportion_with_label(indicator_df, labels):
    """Percentage of rows that have at least one of `labels` set.

    Parameters
    ----------
    indicator_df : pandas.DataFrame
        Frame containing a numeric indicator column for every name in `labels`.
    labels : list of str
        Column names to consider.

    Returns
    -------
    float
        Share of rows whose values across `labels` sum to more than zero,
        expressed as a percentage and rounded to two decimals.
    """
    # a row counts as soon as the sum over the selected columns is positive
    has_any_label = indicator_df[labels].sum(axis=1) > 0
    n_rows = len(indicator_df.index)
    n_matching = has_any_label.sum()
    percentage = n_matching / n_rows * 100
    return round(percentage, 2)

In [None]:
# Use indicator_df here: indicator_vals_df is only created in a later cell, so
# referencing it at this point raises NameError on a fresh Restart & Run All.
# The helper selects the label columns itself, so the result is the same.
percent_common_labels = count_proportion_with_label(indicator_df, common_labels)
print(f'{percent_common_labels} percent of the dataset has one of the following labels: {", ".join(common_labels)}')

percent_rare_labels = count_proportion_with_label(indicator_df, rare_labels)
print(f'{percent_rare_labels} percent of the dataset has one of the following labels: {", ".join(rare_labels)}')

Export the indicator DataFrame, with label columns sorted by frequency, to CSV for future use.

In [None]:
# Column order: image name, raw tag string, then the indicator columns
# from the most to the least frequent label
sorted_columns = ['image_name', 'tags', *sorted_labels]
df_indicator_sorted = indicator_df.loc[:, sorted_columns]

In [None]:
# Export the sorted frame to train_validation_labels.csv under DATASETS_PATH;
# index=False leaves the row index out of the file
indicator_labels_path = os.path.join(DATASETS_PATH, 'train_validation_labels.csv')
df_indicator_sorted.to_csv(indicator_labels_path, index=False)

### Label relations

In [None]:
# Indicator columns only, ordered by label frequency
indicator_vals_df = indicator_df.loc[:, sorted_labels]
# (labels x images) @ (images x labels): entry [i, j] is the sum over images of
# indicator_i * indicator_j, and the diagonal is the sum of squares of each column
label_coocurrance = indicator_vals_df.T @ indicator_vals_df

In [None]:
# Display the label-by-label co-occurrence matrix
label_coocurrance

In [None]:
# Ten-step light-to-blue palette, reused by the heatmaps in this section
blue_palette = sns.light_palette(COLOURS['blue'], n_colors=10)
# Applies to figures created from here on
sns.set(rc={'figure.figsize': (16, 12)})
# NOTE: despite its name this holds the object returned by sns.heatmap;
# the figure itself is obtained later through .get_figure()
fig_coocurrance = sns.heatmap(data=label_coocurrance, cmap=blue_palette)

In [None]:
# Write the co-occurrence heatmap to label_coocurrance.png under MY_ASSETS_PATH at 300 dpi
label_coocurrance_path = os.path.join(MY_ASSETS_PATH, 'label_coocurrance.png')
fig_coocurrance.get_figure().savefig(label_coocurrance_path, dpi=300)

Because the primary and clear labels are so dominant over the rest, the regular co-occurrence matrix does not give us much information about the relations between the labels. We can expose more information by creating a new matrix which shows **what proportion of the images with the label on the Y axis also have the label on the X axis**.

In [None]:
# Row-normalise the co-occurrence matrix: each row is divided by the count of
# that row's label, so entry [i, j] = co-occurrence(i, j) / count(i)
label_relations = label_coocurrance.div(label_count, axis='index')

In [None]:
# Heatmap of the row-normalised matrix, drawn with the same blue palette.
# NOTE: the name says "fig", but .get_figure() is called on this object before saving.
fig_relations = sns.heatmap(label_relations, cmap=blue_palette)

In [None]:
# Write the label-relations heatmap to label_relations.png under MY_ASSETS_PATH at 300 dpi
label_relations_path = os.path.join(MY_ASSETS_PATH, 'label_relations.png')
fig_relations.get_figure().savefig(label_relations_path, dpi=300)