Library Import

In [30]:
from fastavro import parse_schema
import pandas as pd
import uuid
from datetime import datetime, timedelta, timezone
import random
from faker import Faker
from timezonefinder import TimezoneFinder
import pytz

In [6]:
fake = Faker()

In [14]:
songs_spotify = pd.read_csv('tracks.csv')

Schema

In [35]:
schema_ind = {
    'doc': 'Spotify Wrapped Data',
    'name': 'song_played',
    'namespace': 'spotify.schema.user.simulation',
    'type': 'record',
    'fields': [
        {'name': 'user_id', 'type': 'int'},
        {'name': 'id_song_played', 'type': 'string'},
        {'name': 'song_played', 'type': 'string'},
        {'name': 'artist_song_played', 'type': 'string'},
        {'name': 'timezone', 'type': 'string'},
        {'name': 'latitude', 'type': 'float' },
        {'name': 'longitude', 'type': 'float' },
        {'name': 'saved', 'type': 'boolean'},
        {'name': 'time_started', 'type': 'int'},
        {'name': 'time_ended', 'type': 'int'}
    ],
}

parsed_schema_ind = parse_schema(schema_ind)

parsed_schema_ind

{'type': 'record',
 'doc': 'Spotify Wrapped Data',
 'name': 'spotify.schema.user.simulation.song_played',
 'fields': [{'name': 'user_id', 'type': 'int'},
  {'name': 'id_song_played', 'type': 'string'},
  {'name': 'song_played', 'type': 'string'},
  {'name': 'artist_song_played', 'type': 'string'},
  {'name': 'timezone', 'type': 'string'},
  {'name': 'latitude', 'type': 'float'},
  {'name': 'longitude', 'type': 'float'},
  {'name': 'saved', 'type': 'boolean'},
  {'name': 'time_started', 'type': 'int'},
  {'name': 'time_ended', 'type': 'int'}],
 '__fastavro_parsed': True,
 '__named_schemas': {'spotify.schema.user.simulation.song_played': {'type': 'record',
   'doc': 'Spotify Wrapped Data',
   'name': 'spotify.schema.user.simulation.song_played',
   'fields': [{'name': 'user_id', 'type': 'int'},
    {'name': 'id_song_played', 'type': 'string'},
    {'name': 'song_played', 'type': 'string'},
    {'name': 'artist_song_played', 'type': 'string'},
    {'name': 'timezone', 'type': 'string'},
 

In [37]:
def location_record(user_id):
    
    index_song_data = fake.pyint(min_value=0, max_value=len(songs_spotify)-1, step=1)

    latitude = float(fake.latitude())
    longitude = float(fake.longitude())
    time_zone_str = TimezoneFinder().timezone_at(lat=latitude, lng=longitude)
    localTimeZone = pytz.timezone(time_zone_str)
    format = '%Y:%m:%dT%H:%M:%S%z'
    

    length = songs_spotify["duration_ms"].loc[index_song_data]
    length = int(length)
    
    record_ind = {
        u'user_id': user_id,
        u'id_song_played': songs_spotify["id"].loc[index_song_data],
        u'song_played': songs_spotify["name"].loc[index_song_data],
        u'artist_song_played': songs_spotify["artists"].loc[index_song_data],
        u'timezone': time_zone_str,
        u'latitude': latitude,
        u'longitude': longitude,
        u'saved':  random.choice([True, False]),
        u'time_started': datetime.now().astimezone(localTimeZone).strftime(format),
        u'time_ended': (datetime.now() + timedelta(milliseconds=length)).astimezone(localTimeZone).strftime(format)
    }
    
    return record_ind

# Assuming location_record takes a person ID as an argument
person_id = fake.pyint(min_value=12904, max_value=78672, step=1)
records_ind = [location_record(person_id) for _ in range(20, 100)]

records_ind = []
for i in range(100):
    person_id = fake.pyint(min_value=12904, max_value=78672, step=1)
    records_ind.append(location_record(person_id))


len(records_ind)

100

In [38]:
top_3_records_generated = records_ind[0:3]
top_3_records_generated

[{'user_id': 20375,
  'id_song_played': '1gAISR5OU9facKUrDBNEjR',
  'song_played': 'Pramínek Vlasů',
  'artist_song_played': "['Miroslav Donutil']",
  'timezone': 'Etc/GMT+7',
  'latitude': -67.5513365,
  'longitude': -99.134842,
  'saved': False,
  'time_started': '2024:02:06T14:17:23-0700',
  'time_ended': '2024:02:06T14:20:37-0700'},
 {'user_id': 56679,
  'id_song_played': '7rZuTJW3MbyDRn7IehOgrQ',
  'song_played': 'הדייגים',
  'artist_song_played': "['Erez Halevi']",
  'timezone': 'Etc/GMT-11',
  'latitude': -7.857203,
  'longitude': 170.612544,
  'saved': False,
  'time_started': '2024:02:07T08:17:24+1100',
  'time_ended': '2024:02:07T08:20:58+1100'},
 {'user_id': 54275,
  'id_song_played': '5fuLxRzNzweWQJU0MhpxJI',
  'song_played': 'Bana Bir Gül Ver',
  'artist_song_played': "['Onur Akın']",
  'timezone': 'Etc/GMT+8',
  'latitude': -78.6789775,
  'longitude': -126.675599,
  'saved': True,
  'time_started': '2024:02:06T13:17:24-0800',
  'time_ended': '2024:02:06T13:21:33-0800'}]