# Generating mock data for artwork-beautified electrical boxes

In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
# function to generate mock data of artworks 
# each row/case is an artwork having attributes like type, district, content
# attributes are generated as a non-uniform random sample from a list
def mock_data(count, seed=None):
    """Generate a pandas data frame of randomly drawn mock artworks.

    Each row/case is one artwork with the attributes ``Artwork-Id``, ``type``,
    ``district``, ``environment``, ``countArtists``, ``experience``,
    ``replaced``, ``content`` and ``userRating``. Most attributes are drawn as
    a non-uniform random sample from a fixed list; ``district`` and
    ``content`` are drawn uniformly.

    Parameters
    ----------
    count : int
        Number of artworks (rows) to generate. ``0`` or a negative number
        yields an empty data frame that still has all columns.
    seed : int, optional
        If given, a private ``numpy.random.RandomState(seed)`` is used so the
        same seed always produces the same data. If omitted, the global
        ``numpy.random`` state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per artwork, indexed ``0 .. count - 1``. ``userRating`` is an
        object column holding the integers 1-5 or ``None`` (not rated yet).

    Notes
    -----
    ``Artwork-Id`` values are drawn independently, so duplicates are unlikely
    but not impossible.
    """
    # attribute lists to take a (mostly non-uniform) random sample from
    type_list = ["painting", "graffiti", "stencil", "poster", "text"]
    district_list = ["Mitte", "Friedrichshain-Kreuzberg", "Pankow", "Charlottenburg-Wilmersdorf", "Spandau", "Steglitz-Zehlendorf", "Tempelhof-Schöneberg", "Neukölln", "Treptow-Köpenick", "Marzahn-Hellersdorf", "Lichtenberg", "Reinickendorf"]
    environment_list = ["park", "main street", "side street", "public spot", "playground"]
    countArtists_list = [1, 2, 3, 4, 5]
    experience_list = ["first time", "beginner", "advanced", "professional"]
    replaced_list = ["nothing", "stickers and tags", "poster", "weathered graffiti", "recent graffiti", "weathered painting", "recent painting", "text"]
    content_list = ["political", "conceptual", "cartoon or comical", "scenery", "people", "animals or plants"]
    userRating_list = [1, 2, 3, 4, 5, None]

    # simulated user ratings: the more experience the artist has, the higher
    # the probability of a high community rating
    # every distribution keeps a 10% chance of a missing value (None), standing
    # for artwork that has not been rated yet, to allow missing value handling
    rating_probabilities = {
        "first time": (0.1, 0.2, 0.35, 0.2, 0.05, 0.1),
        "beginner": (0.1, 0.1, 0.3, 0.3, 0.1, 0.1),
        "advanced": (0.05, 0.15, 0.2, 0.25, 0.25, 0.1),
        "professional": (0.05, 0.1, 0.15, 0.25, 0.35, 0.1),
    }

    # column order of the resulting data frame (also used when count is 0)
    columns = ["Artwork-Id", "type", "district", "environment", "countArtists",
               "experience", "replaced", "content", "userRating"]

    # without a seed, fall back to the global numpy random state
    rng = np.random if seed is None else np.random.RandomState(seed)

    # draw the attributes of each case and collect the cases in a list
    # p=(...) specifies probabilities
    rows = []
    for _ in range(count):
        row = {
            "Artwork-Id": rng.randint(10000000, 99999999),
            "type": rng.choice(type_list, p=(0.5, 0.3, 0.025, 0.15, 0.025)),
            "district": rng.choice(district_list),
            "environment": rng.choice(environment_list, p=(0.15, 0.3, 0.35, 0.15, 0.05)),
            "countArtists": rng.choice(countArtists_list, p=(0.2, 0.3, 0.25, 0.2, 0.05)),
            "experience": rng.choice(experience_list, p=(0.3, 0.35, 0.25, 0.1)),
            "replaced": rng.choice(replaced_list, p=(0.13, 0.2, 0.125, 0.22, 0.125, 0.1, 0.05, 0.05)),
            "content": rng.choice(content_list),
        }
        row["userRating"] = rng.choice(
            userRating_list, p=rating_probabilities[row["experience"]]
        )
        rows.append(row)

    # build the complete data frame once instead of concatenating row by row
    complete_df = pd.DataFrame(rows, columns=columns)

    # keep ratings as integers / None; pandas would otherwise turn the column
    # into floats with NaN, which changes how the ratings are written to a file
    complete_df["userRating"] = pd.Series(
        [row["userRating"] for row in rows], dtype=object
    )

    # return complete data frame
    return complete_df

In [3]:
# call the function to create mock data
# specify the number of cases by passing it as the argument
# mock_data already returns a pandas data frame, so no extra wrapping is needed
artworks_df = mock_data(1000)

In [4]:
# have a look at the first 100 cases
artworks_df.head(100)

Unnamed: 0,Artwork-Id,type,district,environment,countArtists,experience,replaced,content,userRating
0,45638987,poster,Lichtenberg,main street,3,beginner,weathered graffiti,scenery,3
1,75445782,painting,Reinickendorf,park,1,beginner,weathered painting,political,1
2,52882023,stencil,Reinickendorf,side street,1,beginner,weathered painting,political,4
3,16431944,poster,Spandau,side street,1,advanced,text,cartoon or comical,5
4,56813974,poster,Tempelhof-Schöneberg,side street,3,beginner,weathered graffiti,political,4
...,...,...,...,...,...,...,...,...,...
95,56672694,painting,Spandau,side street,5,first time,weathered painting,people,5
96,29740620,graffiti,Steglitz-Zehlendorf,side street,4,first time,weathered graffiti,scenery,
97,18145756,painting,Reinickendorf,playground,4,first time,weathered graffiti,cartoon or comical,3
98,69089036,graffiti,Tempelhof-Schöneberg,side street,1,first time,nothing,political,4


In [7]:
# save the data as a *.tsv file
# make sure the data is right, then remove the # to enable saving
# also make sure not to overwrite any previous files by changing the name
#artworks_df.to_csv('beautified_boxes.tsv', sep="\t")