
# Multi-label Classification

This example shows how to format the targets for a multilabel classification
problem. Details on multilabel classification can be found
[here](https://scikit-learn.org/stable/modules/multiclass.html).


In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.datasets
import sklearn.metrics
from sklearn.utils.multiclass import type_of_target
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import shap

# import autosklearn.classification

## Data Loading



In [2]:
# Alternative data source (not used below): reuters multilabel dataset -- https://www.openml.org/d/40594
# X, y = sklearn.datasets.fetch_openml(data_id=40594, return_X_y=True, as_frame=False)

# Feature table, keyed by `building_id`.
df = pd.read_csv(
    'https://gis-bucket-aswinvk28.s3.eu-west-2.amazonaws.com/adp/dataset/input_features.csv'
)

# Target table, keyed by the same `building_id`
# (presumably the source of the `damage_grade` column used later -- confirm).
target = pd.read_csv(
    'https://gis-bucket-aswinvk28.s3.eu-west-2.amazonaws.com/adp/dataset/target_values.csv'
)

# Attach the target columns to the features. A left join keeps every feature
# row, even one without a matching `building_id` in the target table.
join_df = df.merge(target, on='building_id', how='left')

# Note: fetch_openml downloads a numpy array with TRUE/FALSE strings, which has
# to be re-mapped to an integer dtype with ones and zeros to comply with the
# Scikit-learn requirement:
# "Positive classes are indicated with 1 and negative classes with 0 or -1."
# No such re-mapping is performed on the CSV data loaded in this cell.
# More information on: https://scikit-learn.org/stable/modules/multiclass.html

# Column groups used to assign dtypes to `join_df`.

# Geographical identifiers
geographical_attributes = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

# Numerical measures
numerical_measures = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

# Main categorical building / land characteristics
main_building_land_attributes = [
    'ground_floor_type',
    'other_floor_type',
    'legal_ownership_status',
    'plan_configuration',
]

# Secondary categorical building / land characteristics
sub_building_land_attributes = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'position',
]

# Superstructure construction flags
superstructure_attributes = [
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
]

# Secondary usage flags
secondary_usage_attributes = [
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
]

# Build one column -> dtype map and cast everything in a single astype call:
#   * categorical building/land attributes and the target -> 'category'
#   * numerical measures and geo level ids                 -> 'int32'
dtype_map = {
    **dict.fromkeys(main_building_land_attributes, 'category'),
    **dict.fromkeys(sub_building_land_attributes, 'category'),
    'damage_grade': 'category',
    **dict.fromkeys(numerical_measures, 'int32'),
    **dict.fromkeys(geographical_attributes, 'int32'),
}
join_df = join_df.astype(dtype_map)

# sklearn.utils.multiclass.type_of_target is a good way to make sure the
# target is properly formatted, e.g.:
# print(f"type_of_target={type_of_target(y)}")

# One-hot encode every non-target column; the target stays a single column.
# NOTE(review): `building_id` (the merge key) remains in the feature matrix --
# confirm that using it as a predictor is intended.
features = pd.get_dummies(join_df.drop(columns=['damage_grade']))
labels = join_df['damage_grade']

# Fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=1)

## Building the classifier



In [3]:
# Random forest with default hyper-parameters. The seed is fixed (same value as
# the train/test split) so the reported score and the SHAP plots are
# reproducible after Restart Kernel -> Run All.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

RandomForestClassifier()

## Evaluate the classifier on the test set



In [4]:
# Mean accuracy of the fitted forest on the held-out split.
test_accuracy = rf.score(X_test, y_test)
print(test_accuracy)

0.7186382403953891


## Compute SHAP values for the random forest



In [None]:
# Tree-specific SHAP explainer built from the fitted random forest.
explainer = shap.TreeExplainer(rf)

# Explain the held-out rows only. The plotting cells pass X_test as the
# feature matrix, so the SHAP values must cover exactly those rows; explaining
# the full re-encoded dataset would give a row-count mismatch and take far
# longer to compute.
shap_values = explainer.shap_values(X_test)

## Plot global feature importance (SHAP summary, bar)



In [None]:
# Bar-type SHAP summary plot, using X_test for the feature values and names.
shap.summary_plot(shap_values, features=X_test, plot_type='bar')

## SHAP dependence plot for `age`



In [None]:
# dependence_plot needs a single (n_samples, n_features) SHAP matrix, but a
# multiclass tree model yields one matrix per class: a list of 2-D arrays in
# older shap releases, a 3-D (samples, features, classes) array in newer ones.
# Select one class explicitly; index 0 is the first model output
# (presumably rf.classes_[0] -- confirm).
class_index = 0
if isinstance(shap_values, list):
    class_shap_values = shap_values[class_index]
elif np.ndim(shap_values) == 3:
    class_shap_values = shap_values[:, :, class_index]
else:
    class_shap_values = shap_values

# The SHAP matrix must have been computed for the same rows as X_test.
shap.dependence_plot("age", class_shap_values, X_test)