# Data exploration and visualisation

In [None]:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import train_test_split

In [117]:
# Load the artworks table with pandas; in this variant the categories are
# kept as character strings. The "Column1" column becomes the row index and
# its name is removed from the index afterwards.
artworks_character = pd.read_table(
    "beautified_boxes.txt",
    index_col="Column1",
).rename_axis(None)

# last expression of the cell: show the loaded table
artworks_character


Unnamed: 0,Artwork-Id,type,district,environment,countArtists,experience,replaced,content,userRating,approval
0,32048698,painting,Treptow-Kopenick,main street,2,first time,nothing,animals or plants,4.0,1
1,39800694,painting,Treptow-Kopenick,park,3,beginner,poster,political,2.0,0
2,80318972,graffiti,Treptow-Kopenick,public spot,3,beginner,stickers and tags,scenery,4.0,0
3,74478002,graffiti,Treptow-Kopenick,main street,4,first time,recent graffiti,cartoon or comical,4.0,0
4,71449602,painting,Steglitz-Zehlendorf,side street,3,beginner,poster,cartoon or comical,,0
...,...,...,...,...,...,...,...,...,...,...
745,68311402,painting,Tempelhof-Schoneberg,side street,2,first time,weathered graffiti,political,3.0,1
746,22390811,painting,Mitte,park,2,beginner,weathered graffiti,cartoon or comical,5.0,1
747,19230282,painting,Friedrichshain-Kreuzberg,park,1,advanced,weathered graffiti,people,5.0,1
748,29644044,painting,Treptow-Kopenick,side street,2,advanced,stickers and tags,scenery,3.0,1


In [None]:
# Load the numerically encoded variant of the table: the categories are
# replaced by numeric values, which some of the plots need.
# The file is tab-separated; the "Unnamed: 0" column is used as the row index.
artworks_numeric = pd.read_csv(
    "beautified_boxes_numeric.txt",
    sep="\t",
    index_col="Unnamed: 0",
)

In [None]:
# SUMMARY STATISTICS - NUMERIC VALUES ("countArtists", "userRating", "approval")

# use the describe() function to get a set of preselected summary statistics
# only applied to the naturally numeric values (like user rating), but not to
# the categories which needed to be transformed to numeric categories (like type)
numeric_columns = ["countArtists", "userRating", "approval"]
summary = artworks_character.loc[:, numeric_columns].describe()
print(summary, "\n")

# figures used in the interpretation below
total_applications = artworks_character["Artwork-Id"].count()
total_artists = artworks_character["countArtists"].sum()

# describe() reports the number of non-missing values as "count", so
# count / total is the share of applications that carry a user rating.
# Both shares are multiplied by 100 because they are printed as percentages
# (the rated share was previously printed as a fraction next to a "%" sign).
rated_percent = summary.loc["count", "userRating"] / total_applications * 100
# approval is printed as mean * 100, i.e. the mean of the column read as a share
approved_percent = summary.loc["mean", "approval"] * 100

# print some additional summary statistics and interpretations
print("In total,", total_artists, "people have applied for", total_applications, "beautifications.")
print("Of all applications,", rated_percent, "% had a user rating and", approved_percent, "% have been approved by the authorities.")

In [118]:
# SUMMARY STATISTICS - NON-NUMERIC VALUES ("type", "district", "environment", "experience", "replaced", "content")
# TYPE
# TODO: show the distribution of "type" as a pie or bar chart (not implemented yet)



Index(['Artwork-Id', 'type', 'district', 'environment', 'countArtists',
       'experience', 'replaced', 'content', 'userRating', 'approval'],
      dtype='object')

In [None]:
# SCATTER PLOT MATRIX
# Set features and label, do a train-test split, then plot every training
# feature against every other one, coloured by the approval label.

# set features and label
# features (X): "type", "district", "environment", "countArtists", "experience", "replaced", "content", "userRating"
# label (y): "approval"
# do not use for classification: the artwork id column
feature_columns = [
    "type", "district", "environment", "countArtists",
    "experience", "replaced", "content", "userRating",
]
X = artworks_numeric[feature_columns]
y = artworks_numeric["approval"]

# train-test split
# random_state is fixed so that the split - and therefore the plot below -
# is identical on every "Restart & Run All"
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# get color map
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which was
# deprecated in Matplotlib 3.7 and removed in 3.9
cmap = plt.get_cmap("bwr")

# plot the scatter plot matrix of the training features; each point is
# coloured by its label in y_train through the blue-white-red colour map
scatter = pd.plotting.scatter_matrix(
    X_train,
    marker=".",
    s=130,
    hist_kwds={"bins": 15},
    figsize=(30, 30),
    cmap=cmap,
    c=y_train,
)

# LEGEND:
# data points are colored according to the probability/ratio of approval
# the color scale runs continuously from blue (not approved) to red (approved)
# the "bluer" a data point is, the less likely the approval was, and vice versa
# NOTE: "approval" itself is binary (0/1); the in-between shades presumably arise where
# semi-transparent markers of both classes overlap at the same position - TODO confirm

# data points are distinctly separated from one another
# this is due to the nature of the categories: they are numerical and equidistant, taking only natural numbers
# most data points are either blue or red. Those are the cases in which approval was consistent and a classifier is likely to predict correctly
# some data points, however, are in between. Those are the cases in which approval was not consistent; they are likely to account for a major part of the classifier's inaccuracy

# all in all, the data seems to allow the use of a kNN classifier