PREPARE THE WORKSPACE

In [10]:
import numpy as np
import pandas as pd


READ THE DATA

In [11]:
# Load the synthetic "Beautify Berlin" data set (generated by the author in order to have data to work with).
# Each row describes one artwork created by the community, including the type of artwork, the district it can be found in and further attributes.
# "Column1" contained the index in the *.txt file, so it is dropped as redundant.

artworks = pd.read_table("beautified_boxes.txt").drop(columns="Column1")
artworks

Unnamed: 0,Artwork-Id,type,district,environment,countArtists,experience,replaced,content,userRating,approval
0,32048698,painting,Treptow-Kopenick,main street,2,first time,nothing,animals or plants,4.0,1
1,39800694,painting,Treptow-Kopenick,park,3,beginner,poster,political,2.0,0
2,80318972,graffiti,Treptow-Kopenick,public spot,3,beginner,stickers and tags,scenery,4.0,0
3,74478002,graffiti,Treptow-Kopenick,main street,4,first time,recent graffiti,cartoon or comical,4.0,0
4,71449602,painting,Steglitz-Zehlendorf,side street,3,beginner,poster,cartoon or comical,,0
...,...,...,...,...,...,...,...,...,...,...
745,68311402,painting,Tempelhof-Schoneberg,side street,2,first time,weathered graffiti,political,3.0,1
746,22390811,painting,Mitte,park,2,beginner,weathered graffiti,cartoon or comical,5.0,1
747,19230282,painting,Friedrichshain-Kreuzberg,park,1,advanced,weathered graffiti,people,5.0,1
748,29644044,painting,Treptow-Kopenick,side street,2,advanced,stickers and tags,scenery,3.0,1


PROCESS THE DATA

In [33]:
# some artworks have no user rating, thus the column "userRating" contains NaN in these cases
# dropping rows containing missing values would be possible, but less meaningful than filling with 0
# this is because during labeling, a missing user rating was a disadvantage in the process of approval
# and thus influenced the decision rather than just being missing without any effect
# thus, fill the missing ratings with the numerical value 0
# 0 is even lower in the continuous numerical ranking than 1, which is the lowest ranking possible
#
# only "userRating" is filled: a missing value in any other column stays NaN and therefore stays visible
# instead of silently being turned into 0

artworks_processed = artworks.fillna({"userRating": 0})
artworks_processed

Unnamed: 0,Artwork-Id,type,district,environment,countArtists,experience,replaced,content,userRating,approval
0,32048698,painting,Treptow-Kopenick,main street,2,first time,nothing,animals or plants,4.0,1
1,39800694,painting,Treptow-Kopenick,park,3,beginner,poster,political,2.0,0
2,80318972,graffiti,Treptow-Kopenick,public spot,3,beginner,stickers and tags,scenery,4.0,0
3,74478002,graffiti,Treptow-Kopenick,main street,4,first time,recent graffiti,cartoon or comical,4.0,0
4,71449602,painting,Steglitz-Zehlendorf,side street,3,beginner,poster,cartoon or comical,0.0,0
...,...,...,...,...,...,...,...,...,...,...
745,68311402,painting,Tempelhof-Schoneberg,side street,2,first time,weathered graffiti,political,3.0,1
746,22390811,painting,Mitte,park,2,beginner,weathered graffiti,cartoon or comical,5.0,1
747,19230282,painting,Friedrichshain-Kreuzberg,park,1,advanced,weathered graffiti,people,5.0,1
748,29644044,painting,Treptow-Kopenick,side street,2,advanced,stickers and tags,scenery,3.0,1


In [34]:
# DISCLAIMER
# This is synthetic data. I (Fabian Janosch Krueger) generated it myself, and for the algorithm to be able to learn,
# I needed to introduce several biases.
# These biases (for example, in which of the districts it is easiest to get approval of authorities for street art)
# are not necessarily true and do not necessarily reflect the opinions of TechLabs or Beautify Berlin.
# It is just an exemplary data set enabling usage for machine learning.

# currently, the categories are not numerical and not ordered
# transform the categories to continuous numerical by replacing values
# least likely to be approved gets 1
# most likely to be approved gets the highest value
# remaining ones are ranked in between them according to probability of approval
# options that share a rank get the same value, so the highest value can be lower than the number of options

# one rank table per categorical column: {column name: {category: rank}}
category_ranks = {
    "type": {
        "painting": 5,
        "graffiti": 4,
        "poster": 3,
        "stencil": 2,
        "text": 1,
    },
    "district": {
        # districts in which approval is easier to get
        "Friedrichshain-Kreuzberg": 5,
        "Neukolln": 5,
        # neutral to easier districts
        "Lichtenberg": 4,
        "Marzahn-Hellersdorf": 4,
        "Tempelhof-Schoneberg": 4,
        # truly neutral
        "Pankow": 3,
        "Reinickendorf": 3,
        "Treptow-Kopenick": 3,
        # neutral to harder districts
        "Spandau": 2,
        "Steglitz-Zehlendorf": 2,
        # districts in which approval is harder to get
        "Charlottenburg-Wilmersdorf": 1,
        "Mitte": 1,
    },
    "environment": {
        "side street": 4,
        "main street": 3,
        "park": 3,
        "public spot": 2,
        "playground": 1,
    },
    "experience": {
        "professional": 4,
        "advanced": 3,
        "beginner": 2,
        "first time": 1,
    },
    "replaced": {
        "stickers and tags": 8,
        "nothing": 7,
        "weathered graffiti": 6,
        "text": 5,
        "poster": 4,
        "recent graffiti": 3,
        "weathered painting": 2,
        "recent painting": 1,
    },
    "content": {
        "animals or plants": 2,
        "cartoon or comical": 2,
        "conceptual": 2,
        "people": 2,
        "scenery": 2,
        "political": 1,
    },
}

# Series.map turns every value without an entry in the table into NaN without any warning
# (this includes all values if this cell is run a second time on the already mapped data frame).
# Therefore check all columns before changing any of them, so a failure leaves the data frame untouched.
unexpected_values = {}
for column, ranks in category_ranks.items():
    unknown = set(artworks_processed[column].unique()) - set(ranks)
    if unknown:
        unexpected_values[column] = sorted(unknown, key=str)
if unexpected_values:
    raise ValueError(
        "values without a rank (has this cell already been run on this data frame?): "
        f"{unexpected_values}"
    )

# replace the categories by their ranks, column by column
for column, ranks in category_ranks.items():
    artworks_processed[column] = artworks_processed[column].map(ranks)

# have a look at current state of data frame
artworks_processed

Unnamed: 0,Artwork-Id,type,district,environment,countArtists,experience,replaced,content,userRating,approval
0,32048698,5,3,3,2,1,7,2,4.0,1
1,39800694,5,3,3,3,2,4,1,2.0,0
2,80318972,4,3,2,3,2,8,2,4.0,0
3,74478002,4,3,3,4,1,3,2,4.0,0
4,71449602,5,2,4,3,2,4,2,0.0,0
...,...,...,...,...,...,...,...,...,...,...
745,68311402,5,4,4,2,1,6,1,3.0,1
746,22390811,5,1,3,2,2,6,2,5.0,1
747,19230282,5,5,3,1,3,6,2,5.0,1
748,29644044,5,3,4,2,3,8,2,3.0,1


In [56]:
# checking data types
# DataFrame.items() yields (column name, column Series) pairs;
# the former DataFrame.iteritems() was removed in pandas 2.0
for name, values in artworks_processed.items():
    print(name, type(values.values.dtype))

# result: all are numerical, all except one are int64, one is float64.
# this should work fine, but I want all of the columns to have the same data type.

Artwork-Id <class 'numpy.dtype[int64]'>
type <class 'numpy.dtype[int64]'>
district <class 'numpy.dtype[int64]'>
environment <class 'numpy.dtype[int64]'>
countArtists <class 'numpy.dtype[int64]'>
experience <class 'numpy.dtype[int64]'>
replaced <class 'numpy.dtype[int64]'>
content <class 'numpy.dtype[int64]'>
userRating <class 'numpy.dtype[float64]'>
approval <class 'numpy.dtype[int64]'>


In [64]:
# change data type of "userRating" from float64 to int64
artworks_processed["userRating"] = artworks_processed["userRating"].astype(np.int64)

# repeat checking data types to see if changing worked
# DataFrame.items() yields (column name, column Series) pairs;
# the former DataFrame.iteritems() was removed in pandas 2.0
for name, values in artworks_processed.items():
    print(name, type(values.values.dtype))

# changing worked, all columns now have same data type: int64
# processing of data frame is completed
# it can now be used as input for a machine learning algorithm

# have a final look at the processed data frame
artworks_processed

Artwork-Id <class 'numpy.dtype[int64]'>
type <class 'numpy.dtype[int64]'>
district <class 'numpy.dtype[int64]'>
environment <class 'numpy.dtype[int64]'>
countArtists <class 'numpy.dtype[int64]'>
experience <class 'numpy.dtype[int64]'>
replaced <class 'numpy.dtype[int64]'>
content <class 'numpy.dtype[int64]'>
userRating <class 'numpy.dtype[int64]'>
approval <class 'numpy.dtype[int64]'>


Unnamed: 0,Artwork-Id,type,district,environment,countArtists,experience,replaced,content,userRating,approval
0,32048698,5,3,3,2,1,7,2,4,1
1,39800694,5,3,3,3,2,4,1,2,0
2,80318972,4,3,2,3,2,8,2,4,0
3,74478002,4,3,3,4,1,3,2,4,0
4,71449602,5,2,4,3,2,4,2,0,0
...,...,...,...,...,...,...,...,...,...,...
745,68311402,5,4,4,2,1,6,1,3,1
746,22390811,5,1,3,2,2,6,2,5,1
747,19230282,5,5,3,1,3,6,2,5,1
748,29644044,5,3,4,2,3,8,2,3,1


EXPORT THE PROCESSED DATA FRAME TO TSV

In [65]:
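# save the data as a tab-separated (TSV) file; note that the file name below uses the ".txt" extension
# to enable, remove the leading "#" from the line below
# change the file name first to make sure no previous file is overwritten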
#artworks_processed.to_csv('beautified_boxes_numeric.txt', sep="\t")