# Creating a MOCK datasets
Author: Karina Condeixa



The process to create a fake dataset started by making dataframes about `users` and `item`, and further a complete dataframe that could be used in the `model`.

`numpy random` and `faker` were used to ccreate randomized series. Latitude_longitude points were picked using a polygons method, consideing four points choosen manually in Google Maps.



In [242]:
# import packages
import pandas as pd
from faker import Faker, providers
from faker.providers.address.de_DE import Provider as DeDeAddressProvider
from faker.generator import random
from faker.providers import BaseProvider
# import random

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# %matplotlib notebook
# %matplotlib 

import folium
# from geopy.geocoders import Nominatim

### Postcodes

In [243]:
# immport and clean original dataset removing poostcodes from Germanay, that are not in Berlin
postcodes_de = pd.read_excel(r'data/German-Zip-Codes.xlsx', sheet_name='Berlin')
df = pd.DataFrame(postcodes_de)
df.set_axis(["postcodes_berlin"], axis=1,inplace=True)
df = (df["postcodes_berlin"].str[8:-11])
df.to_csv('data/postcodes_berlin.csv', index=False)

  df.set_axis(["postcodes_berlin"], axis=1,inplace=True)


### Creating the postcode series

In [244]:
# import postcodes from Berlin and create a dataframe removing indexes and headers
postcodes_berlin = pd.read_csv('data/postcodes_berlin.csv')
print(postcodes_berlin)
postcodes_berlin_series = postcodes_berlin[:][1:].squeeze()

     postcodes_berlin
0               10117
1               10119
2               10178
3               10179
4               10243
..                ...
184             14169
185             14193
186             14195
187             14197
188             14199

[189 rows x 1 columns]


### Creating lists

In [245]:
# Importing Modules
import numpy as np
import random
# Use 'conda install shapely' to import the shapely library.
from shapely.geometry import Polygon, Point

num_records = 10

### Creating the datasets

In [246]:
# Note: # multi_locale_generator = Faker(['it_IT', 'en_US', 'de-DE', 'pt_BR', 'es-ES', 'fr-FR', 'ru-RU', 'tr-TR'])

# Instantiate Faker with multiple locales
german_locale_generator = Faker(['de_DE'])
fake = Faker()
Faker.seed(0)


### List of latitude and longitude co-ordinates 
Based on a sample of co-ordinates from [free data Berlin](https://daten.berlin.de/datensaetze/stra%C3%9Fenverkehrsunf%C3%A4lle-nach-unfallort-berlin-2021) for Road traffic accidents by accident location in Berlin 2021.


In [247]:
lat_lng_samples = pd.read_csv("data/lat_lng_samples.csv")
lat_lng_samples.head(2)

Unnamed: 0,lat,lng
0,5253393955,1342689483
1,5243995086,1339209027


In [248]:
# replace comma to dot
lat_lng_samples = lat_lng_samples.apply(lambda x: x.str.replace(',','.'))

In [249]:
list_lat = lat_lng_samples.lat.values.tolist()
list_lng = lat_lng_samples.lng.values.tolist()
lat_lng_samples.head(2)


Unnamed: 0,lat,lng
0,5.253.393.955,1.342.689.483
1,5.243.995.086,1.339.209.027


In [250]:
lat_lng_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11197 entries, 0 to 11196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lat     11192 non-null  object
 1   lng     11192 non-null  object
dtypes: object(2)
memory usage: 175.1+ KB


### Item dataset

In [251]:
# An initial classification that is not being used to normalize with UXs categories. But it is saved for future updates.
# category_item = ['furniture-sofa',
#                  'furniture-armchair',
#                  'furniture-chair',
#                  'furniture-table',
#                  'furniture-desk',
#                  'furniture-bed',
#                  'furniture-bookcase',
#                  'furniture-bedside_table',
#                  'furniture-cabinet',
#                  'furniture-wardrobe',
#                  'furniture-shelf',
#                  'furniture-cupboard',
#                  'furniture-rollcontainers',
#                  'furniture-shoe_rack',
#                  'furniture-mirror',
#                  'furniture-cot',
#                  'furniture-trolley',
#                  'appliance-washing_machine',
#                  'appliance-dish_washer',
#                  'appliance-drying_rack',
#                  'appliance-refrigerator',
#                  'appliance-blender',
#                  'appliance-extractor_hood',
#                  'appliance-clothes_iron',
#                  'appliance-vacuum_cleaner',
#                  'appliance-sandwich_maker',
#                  'appliance-kettle',
#                  'appliance-air_conditioner',
#                  'appliance-heater',
#                  'appliance-pan',
#                  'appliance-popcorn_maker',
#                  'appliance-coffee_machine',
#                  'appliance-stove',
#                  'lighting-lighting',
#                  'lighting-chandelier',
#                  'lighting-lightbulb',
#                  'musical_equipment-guitar',
#                  'musical_equipment-sound_amplifier',
#                  'musical_equipment-contrabass',
#                  'musical_equipment-battery',
#                  'musical_equipment-piano',
#                  'tech-desktop',
#                  'tech-laptop',
#                  'tech-phone',
#                  'tech-keyboard',
#                  'clothes-woman_jacket',
#                  'clothes-man_jacket',
#                  'clothes-child_jacket',
#                  'clothes-woman_clothes',
#                  'clothes-man_clothes',
#                  'clothes-child_clothes',
#                  'shoes-woman_shoes',
#                  'shoes-man_shoes',
#                  'shoes-child_shoes',
#                  'miscelaneaous-ironing_board',
#                  'miscelaneaous-picture_frame',
#                  'miscelaneaous-bicycle',
#                  'miscelaneaous-plant',
#                  'miscelaneaous-carpet',
#                  'miscelaneaous-roller_skates',
#                  'miscelaneaous-ski_skates',
#                  'miscelaneaous-books',
#                  'miscelaneaous-purse',
#                  'miscelaneaous-suitcase',
#                  'miscelaneaous-shopping_venture',
#                  'miscelaneaous-board',
#                  'miscelaneaous-frame',
#                  'home-mattress', 
#                  'home-carpet',
#                  'kids-stroller',
#                  'kids-baby_carriage']

In [252]:
item_status = [1,0]  # ['avaliable', 'not_available'] # CONDITION = 1
category = ['all', 'furniture',  'clothes',  'plants', 'kids'] # All – Furniture – Clothes – Plants - Kids
condition = ['3','2', '1'] # Poor – Good – Like new
# postcodes = postcodes_berlin_series
    
n_views = []
for i in range(201):  # until 200 views
    n_views.append(i)

pct_likes = []
for p in range(20):  # until 20% of the n_views
    pct_likes.append(p/100)
 
# post_date: something between the last 30 days - limit = '-30d'
# timer = (date.today()) - post_date


In [253]:
from datetime import date

# define a function to create item data
limit = '-30d'  # limit of 30 days of item in the app

random.seed(1000)

def create_item_data(num_records): 
  
    # dictionary 
    item ={} 
    for i in range(0, num_records): 
        item[i] = {}
#         item[i]['item_status'] = 1
        item[i]['item_category'] = np.random.choice(category)
        item[i]['item_condition'] = np.random.choice(condition)
        item[i]['item_postcode'] = np.random.choice(postcodes_berlin_series)
        item[i]['item_timer'] = (date.today()) - fake.date_between_dates(limit,'now')
        
#         item[i]['item_lat'] = np.random.choice(list_lat)
#         item[i]['item_lng'] = np.random.choice(list_lng)
        
#         datetime_iteration1 = fake.date_between_dates(limit,'now')
#         datetime_iteration2 = fake.date_between_dates(limit,'now')
#         if datetime_iteration1 <= datetime_iteration2:
#             item[i]['item_timer'] = datetime_iteration1
#             item[i]['item_timer'] = datetime_iteration2
#         else:
#             item[i]['item_timer'] = datetime_iteration2
#             item[i]['item_timer'] = datetime_iteration1  
 # This date shold be later than the post
        item[i]['n_views'] = np.random.choice(n_views)
        item[i]['n_likes'] = int(item[i]['n_views'] * (np.random.choice(pct_likes)))
             
    return item

In [254]:
item_df = pd.DataFrame(create_item_data(1500)).transpose()
item_df.head()

Unnamed: 0,item_category,item_condition,item_postcode,item_timer,n_views,n_likes
0,plants,3,10319,11 days,183,3
1,all,1,10587,10 days,19,0
2,clothes,2,10245,28 days,120,10
3,clothes,3,10551,17 days,59,0
4,kids,3,13469,5 days,8,0


In [255]:
item_df['item_id'] = item_df.index +1  # add item_id
item_id_series = item_df['item_id']  # storage in a variable to use later

item_df.head(10)

Unnamed: 0,item_category,item_condition,item_postcode,item_timer,n_views,n_likes,item_id
0,plants,3,10319,11 days,183,3,1
1,all,1,10587,10 days,19,0,2
2,clothes,2,10245,28 days,120,10,3
3,clothes,3,10551,17 days,59,0,4
4,kids,3,13469,5 days,8,0,5
5,all,3,10997,6 days,80,12,6
6,clothes,1,14167,10 days,148,7,7
7,clothes,2,12555,15 days,32,6,8
8,plants,1,12487,7 days,114,15,9
9,clothes,1,12207,13 days,23,0,10


### User

In [256]:
user_df = item_df.copy()

In [257]:
user_df = user_df.rename(columns={"item_id": "user_id"})

In [258]:
user_df.head(10)

Unnamed: 0,item_category,item_condition,item_postcode,item_timer,n_views,n_likes,user_id
0,plants,3,10319,11 days,183,3,1
1,all,1,10587,10 days,19,0,2
2,clothes,2,10245,28 days,120,10,3
3,clothes,3,10551,17 days,59,0,4
4,kids,3,13469,5 days,8,0,5
5,all,3,10997,6 days,80,12,6
6,clothes,1,14167,10 days,148,7,7
7,clothes,2,12555,15 days,32,6,8
8,plants,1,12487,7 days,114,15,9
9,clothes,1,12207,13 days,23,0,10


In [259]:
# item_df.get('item_timer')

item_df['item_timer'] = item_df['item_timer'].astype(str).str[:2].str.strip()
item_df['item_timer'] = item_df['item_timer'].astype(int)

item_df = item_df.rename(columns={'item_timer': 'item_timer_days'})

# item_df.drop('item_timer', axis=1)
item_df.info()

item_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   item_category    1500 non-null   object
 1   item_condition   1500 non-null   object
 2   item_postcode    1500 non-null   object
 3   item_timer_days  1500 non-null   int64 
 4   n_views          1500 non-null   object
 5   n_likes          1500 non-null   object
 6   item_id          1500 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 93.8+ KB


Unnamed: 0,item_category,item_condition,item_postcode,item_timer_days,n_views,n_likes,item_id
0,plants,3,10319,11,183,3,1
1,all,1,10587,10,19,0,2
2,clothes,2,10245,28,120,10,3
3,clothes,3,10551,17,59,0,4
4,kids,3,13469,5,8,0,5


In [260]:
# saves csv files
item_df.to_csv('data/item_df.csv', index=False)
# user_df.to_csv('data/user_df.csv', index=False)