In [3]:
from fastavro import parse_schema


schema_ind = {
    'doc': 'Spotify Wrapped Data',
    'name': 'song_played',
    'namespace': 'spotify.schema.user.simulation',
    'type': 'record',
    'fields': [
        {'name': 'id_song_played', 'type': 'int'},
        {'name': 'artist_song_played', 'type': 'string'},
        {'name': 'song_played', 'type': 'string'},
        {'name': 'datetime_utc', 'type': 'int'},
        {'name': 'latitude', 'type': 'string' },
        {'name': 'longitude', 'type': 'string' },
        {'name': 'times_stopped', 'type': 'float'},
        {'name': 'skipped', 'type': 'boolean'},
        {'name': 'num_replay', 'type': 'int'},
        {'name': 'saved', 'type': 'boolean'},
        {'name': 'time_started', 'type': 'boolean'},
        {'name': 'time_ended', 'type': 'boolean'}
    ],
}

parsed_schema_ind = parse_schema(schema_ind)

parsed_schema_ind

{'type': 'record',
 'doc': 'Spotify Wrapped Data',
 'name': 'spotify.schema.user.simulation.location',
 'fields': [{'name': 'id_song_played', 'type': 'int'},
  {'name': 'artist_song_played', 'type': 'string'},
  {'name': 'song_played', 'type': 'string'},
  {'name': 'datetime_utc', 'type': 'int'},
  {'name': 'latitude', 'type': 'string'},
  {'name': 'longitude', 'type': 'string'},
  {'name': 'times_stopped', 'type': 'float'},
  {'name': 'skipped', 'type': 'boolean'},
  {'name': 'num_replay', 'type': 'int'},
  {'name': 'saved', 'type': 'boolean'}],
 '__fastavro_parsed': True,
 '__named_schemas': {'spotify.schema.user.simulation.location': {'type': 'record',
   'doc': 'Spotify Wrapped Data',
   'name': 'spotify.schema.user.simulation.location',
   'fields': [{'name': 'id_song_played', 'type': 'int'},
    {'name': 'artist_song_played', 'type': 'string'},
    {'name': 'song_played', 'type': 'string'},
    {'name': 'datetime_utc', 'type': 'int'},
    {'name': 'latitude', 'type': 'string'},
 

getting the tracks data 

In [5]:
import pandas as pd
csv_file_path = 'tracks.csv'

# Read the CSV file into a DataFrame
songs_spotify = pd.read_csv(csv_file_path)

In [6]:
len(songs_spotify)

586672

In [7]:
songs_spotify.head(5)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


simulating a single user 

In [9]:
%%time
import uuid
import datetime
import random
from faker import Faker
fake = Faker()
# Faker.seed(2000) # Use only to generate the same data set

def location_record(listen_songs):
    
    index_song_data = fake.pyint(min_value=0, max_value=len(songs_spotify)-1, step=1)
    
    record_ind = {
        u'id_song_played': songs_spotify["id"].loc[index_song_data],
        u'song_played': songs_spotify["name"].loc[index_song_data],
        u'artist_song_played': songs_spotify["artists"].loc[index_song_data],
        u'datetime_utc': datetime.datetime.utcnow().timestamp(),
        u'latitude': float(fake.latitude()),
        u'longitude': float(fake.longitude()),
        u'times_stopped': fake.pyint(min_value=0, max_value=10, step=1),
        u'skipped': random.choice([True, False]),
        u'num_replay': fake.pyint(min_value=0, max_value=10, step=1),
        u'saved':  random.choice([True, False]),
    }
    
    return record_ind

# Assuming location_record takes a person ID as an argument
person_id = fake.pyint(min_value=12904, max_value=78672, step=1)
records_ind = [location_record(person_id) for _ in range(20, 100)]


len(records_ind)

CPU times: total: 141 ms
Wall time: 651 ms


80

In [11]:
top_3_records_generated = records_ind[0:3]
top_3_records_generated

[{'id_song_played': '6qoHBab3uXaz0qCXm04iTQ',
  'song_played': 'Hacelo por Mí',
  'artist_song_played': "['Attaque 77']",
  'datetime_utc': 1707220850.997519,
  'latitude': 26.7258005,
  'longitude': 174.726319,
  'times_stopped': 8,
  'skipped': True,
  'num_replay': 6,
  'saved': True},
 {'id_song_played': '2jObIfKqejcReAjcmyb3Nf',
  'song_played': 'Ég meina',
  'artist_song_played': "['Aron Can']",
  'datetime_utc': 1707220850.997519,
  'latitude': -73.656651,
  'longitude': -171.830913,
  'times_stopped': 8,
  'skipped': True,
  'num_replay': 3,
  'saved': True},
 {'id_song_played': '0NPECfHDF2sbM2yLFm7JH0',
  'song_played': 'You You You - Stereo; 2008 Remaster',
  'artist_song_played': "['Gerry & The Pacemakers']",
  'datetime_utc': 1707220850.997519,
  'latitude': 71.606634,
  'longitude': -3.939422,
  'times_stopped': 3,
  'skipped': True,
  'num_replay': 2,
  'saved': False}]

In [13]:
from fastavro import parse_schema


schema_group = {
    'doc': 'Spotify Wrapped Data',
    'name': 'song_played',
    'namespace': 'spotify.schema.user.simulation',
    'type': 'record',
    'fields': [
        {'name': 'id_user', 'type': 'int'},
        {'name': 'id_song_played', 'type': 'int'},
        {'name': 'artist_song_played', 'type': 'string'},
        {'name': 'song_played', 'type': 'string'},
        {'name': 'datetime_utc', 'type': 'int'},
        {'name': 'latitude', 'type': 'string' },
        {'name': 'longitude', 'type': 'string' },
        {'name': 'times_stopped', 'type': 'float'},
        {'name': 'skipped', 'type': 'boolean'},
        {'name': 'num_replay', 'type': 'int'},
        {'name': 'saved', 'type': 'boolean'},
        #{'name': 'time_started', 'type': 'boolean'},
        #{'name': 'time_ended', 'type': 'boolean'}
    ],
}

parsed_schema_group = parse_schema(schema_group)

parsed_schema_group

{'type': 'record',
 'doc': 'Spotify Wrapped Data',
 'name': 'spotify.schema.user.simulation.location',
 'fields': [{'name': 'id_user', 'type': 'int'},
  {'name': 'id_song_played', 'type': 'int'},
  {'name': 'artist_song_played', 'type': 'string'},
  {'name': 'song_played', 'type': 'string'},
  {'name': 'datetime_utc', 'type': 'int'},
  {'name': 'latitude', 'type': 'string'},
  {'name': 'longitude', 'type': 'string'},
  {'name': 'times_stopped', 'type': 'float'},
  {'name': 'skipped', 'type': 'boolean'},
  {'name': 'num_replay', 'type': 'int'},
  {'name': 'saved', 'type': 'boolean'}],
 '__fastavro_parsed': True,
 '__named_schemas': {'spotify.schema.user.simulation.location': {'type': 'record',
   'doc': 'Spotify Wrapped Data',
   'name': 'spotify.schema.user.simulation.location',
   'fields': [{'name': 'id_user', 'type': 'int'},
    {'name': 'id_song_played', 'type': 'int'},
    {'name': 'artist_song_played', 'type': 'string'},
    {'name': 'song_played', 'type': 'string'},
    {'name':

for the group simulation

In [14]:
%%time
import uuid
import datetime
import random
from faker import Faker
fake = Faker()
# Faker.seed(2000) # Use only to generate the same data set


def location_record_group(user_id):
    
    index_song_data = fake.pyint(min_value=0, max_value=len(songs_spotify)-1, step=1)
    
    record_group = {
        u'id_user': user_id,
        u'id_song_played': songs_spotify["id"].loc[index_song_data],
        u'song_played': songs_spotify["name"].loc[index_song_data],
        u'artist_song_played': songs_spotify["artists"].loc[index_song_data],
        u'datetime_utc': datetime.datetime.utcnow().timestamp(),
        u'latitude': float(fake.latitude()),
        u'longitude': float(fake.longitude()),
        u'times_stopped': fake.pyint(min_value=0, max_value=10, step=1),
        u'skipped': random.choice([True, False]),
        u'num_replay': fake.pyint(min_value=0, max_value=10, step=1),
        u'saved':  random.choice([True, False]),
    }
    
    return record_group

listen_songs_group =[{ 'user_id': str(uuid.uuid1()), 'num_songs': random.randint(1, 20)} for _ in range(0,10000) ]
records_group = [ location_record_group(user['user_id']) for _ in range(0, random.randint(1,100)) for user in listen_songs_group ]

len(records_group)

CPU times: total: 10.2 s
Wall time: 10.9 s


220000

In [15]:
top_3_records_generated = records_ind[0:3]
top_3_records_generated

[{'id_song_played': '6qoHBab3uXaz0qCXm04iTQ',
  'song_played': 'Hacelo por Mí',
  'artist_song_played': "['Attaque 77']",
  'datetime_utc': 1707220850.997519,
  'latitude': 26.7258005,
  'longitude': 174.726319,
  'times_stopped': 8,
  'skipped': True,
  'num_replay': 6,
  'saved': True},
 {'id_song_played': '2jObIfKqejcReAjcmyb3Nf',
  'song_played': 'Ég meina',
  'artist_song_played': "['Aron Can']",
  'datetime_utc': 1707220850.997519,
  'latitude': -73.656651,
  'longitude': -171.830913,
  'times_stopped': 8,
  'skipped': True,
  'num_replay': 3,
  'saved': True},
 {'id_song_played': '0NPECfHDF2sbM2yLFm7JH0',
  'song_played': 'You You You - Stereo; 2008 Remaster',
  'artist_song_played': "['Gerry & The Pacemakers']",
  'datetime_utc': 1707220850.997519,
  'latitude': 71.606634,
  'longitude': -3.939422,
  'times_stopped': 3,
  'skipped': True,
  'num_replay': 2,
  'saved': False}]

SESSION STRUCTURE

In [104]:
from datetime import datetime, timedelta, timezone
import random
import numpy as np

In [117]:
sesh_schema_ind = {
    'doc': 'Spotify Wrapped Data',
    'name': 'session',
    'namespace': 'spotify.schema.user.simulation',
    'type': 'record',
    'fields': [
        {'name': 'user_id', 'type': 'string' },
        {'name': 'id_songs_played', 'type': {'name': 'id_songs_played', 'type': 'array', 'items': 'string'}},
        {'name': 'artists_song_played', 'type': {'name': 'artists_song_played', 'type': 'array', 'items': 'string'}},
        {'name': 'songs_played', 'type': {'name': 'songs_played', 'type': 'array', 'items': 'string'}},
        {'name': 'latitude', 'type': 'float' },
        {'name': 'longitude', 'type': 'float' },
        {'name': 'times_stopped', 'type': 'int'},
        {'name': 'skipped', 'type': 'int'},
        {'name': 'num_replay', 'type': 'int'},
        {'name': 'saved', 'type': 'int'},
        {'name': 'session_time_started', 'type': 'int'},
        {'name': 'session_time_ended', 'type': 'int'}
    ],
}

In [118]:
parsed_sesh_schema_ind = parse_schema(sesh_schema_ind)

parsed_sesh_schema_ind

{'type': 'record',
 'doc': 'Spotify Wrapped Data',
 'name': 'spotify.schema.user.simulation.session',
 'fields': [{'name': 'user_id', 'type': 'string'},
  {'name': 'id_songs_played', 'type': {'type': 'array', 'items': 'string'}},
  {'name': 'artists_song_played',
   'type': {'type': 'array', 'items': 'string'}},
  {'name': 'songs_played', 'type': {'type': 'array', 'items': 'string'}},
  {'name': 'latitude', 'type': 'float'},
  {'name': 'longitude', 'type': 'float'},
  {'name': 'times_stopped', 'type': 'int'},
  {'name': 'skipped', 'type': 'int'},
  {'name': 'num_replay', 'type': 'int'},
  {'name': 'saved', 'type': 'int'},
  {'name': 'session_time_started', 'type': 'int'},
  {'name': 'session_time_ended', 'type': 'int'}],
 '__fastavro_parsed': True,
 '__named_schemas': {'spotify.schema.user.simulation.session': {'type': 'record',
   'doc': 'Spotify Wrapped Data',
   'name': 'spotify.schema.user.simulation.session',
   'fields': [{'name': 'user_id', 'type': 'string'},
    {'name': 'id_so

In [110]:
def location_record(listen_songs, n_session_songs, time_of_day_start):

    list_of_indexes = []
    for i in range(n_session_songs):
        list_of_indexes.append(fake.pyint(min_value=0, max_value=len(songs_spotify)-1, step=1))

    song_durations = [songs_spotify["duration_ms"].loc[i] for i in list_of_indexes]
    song_durations = int(sum(song_durations))

    latitude = float(fake.latitude())
    longitude = float(fake.longitude())
    time_zone = TimezoneFinder().timezone_at(lat=latitude, lng=longitude)
    
    record_ind = {
        u'user_id': listen_songs,
        u'id_songs_played': [songs_spotify["id"].loc[i] for i in list_of_indexes],
        u'songs_played': [songs_spotify["name"].loc[i] for i in list_of_indexes],
        u'artist_songs_played': [songs_spotify["artists"].loc[i] for i in list_of_indexes],
        u'latitude': latitude,
        u'longitude': longitude,
        u'timezone': time_zone,
        u'times_stopped': fake.pyint(min_value=0, max_value=10, step=1),
        u'skipped': fake.pyint(min_value=0, max_value=10, step=1),
        u'num_replay': fake.pyint(min_value=0, max_value=10, step=1),
        u'saved': fake.pyint(min_value=0, max_value=10, step=1),
        u'session_time_started': time_of_day_start,
        u'session_time_ended': time_of_day_start + timedelta(milliseconds=song_durations)
    }
    
    return record_ind

# Assuming location_record takes a person ID as an argument
records_ind = []

for i in range(20, 100):
    person_id = fake.pyint(min_value=12904, max_value=78672, step=1)
    n_of_songs = fake.pyint(min_value=1, max_value=100, step=1)
    time_of_day = fake.date_time_between(start_date = datetime(2006, 4, 23), end_date = datetime.now())
    records_ind.append(location_record(person_id, n_of_songs, time_of_day))


len(records_ind)

80

In [111]:
top_3_records_generated = records_ind[0:3]
top_3_records_generated

[{'user_id': 64950,
  'id_songs_played': ['3VCWJexK5DhSo4FdvBwTLs',
   '0krrJiog8dCs2JhQxASxFu',
   '63HE42Dm8pJHuPuRFQ665b',
   '1gtxKytcUAv8ZKET0CYHY3',
   '0jZ9AdPDOzqHLS5wucCNrC',
   '4lQNhsnj55C67O7SKXWAMC',
   '3uHNvPdZjfcciXXB1sPH9A',
   '2dCLB9t0kl3WP0BfUcZI62',
   '6au6Hv9BgciDSxHx85k7ut',
   '2UTYXcT65CtEfLHWy2ODXR',
   '2cZtX94TdAdC5w0tdAXLY6',
   '1RYbMD60ReqByGWC0IRCh4',
   '4BrqTapIghu9mWQhl1ZAdZ',
   '5R2rXpISHhE7IKZuKMri5K',
   '7ij6mE8unSiAw8vWlZ0Vzs',
   '7zKiVR70T8iAaWdFO33BSo',
   '7c2h4jzcyJsN1QzTb8ZpiZ',
   '4u6rcmlhDDgS0YAo8ZMI4a',
   '4TtkcCPD4SdVPhu2yFobMp',
   '4r8e2JrMLbqtlH3Yf2Thx2',
   '3Fbc4WaUbA797hY1ZpskXs',
   '1C5eXdxiGK6unIvON7UDPb',
   '6elBJYzQzUePvoM0AkDIhd',
   '31c37xfmgHkAAqiDzNTln8',
   '3gkijt6bxk8ts6S2RCEu4n',
   '6OwkEO86D09EkIp1bfYaQt',
   '7JxTp4HrI99hr6L3VMCZbj'],
  'songs_played': ["Zeg 'ns Meisje",
   'Hay Un Tren A Las Cinco',
   'Sevdik de noldu sanki',
   'Heureux qui comme Ulysse',
   'Bosquito',
   '087 - Wolfsgesicht - Teil 28',
 

## Test Grounds

In [88]:
type(datetime(2024, 1, 1))


datetime.datetime

In [76]:
timedelta(milliseconds = 10000)

datetime.timedelta(seconds=10)

In [77]:
datetime.now() + timedelta(milliseconds = 10000)

datetime.datetime(2024, 2, 6, 15, 18, 33, 439982)

In [96]:
datetime.utcnow().timestamp()

1707228018.981335

In [100]:
print(datetime.now().tzinfo)

None


In [105]:
timezone.utc

datetime.timezone.utc

In [107]:
import pytz
pytz.all_timezones

['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers', 'Africa/Asmara', 'Africa/Asmera', 'Africa/Bamako', 'Africa/Bangui', 'Africa/Banjul', 'Africa/Bissau', 'Africa/Blantyre', 'Africa/Brazzaville', 'Africa/Bujumbura', 'Africa/Cairo', 'Africa/Casablanca', 'Africa/Ceuta', 'Africa/Conakry', 'Africa/Dakar', 'Africa/Dar_es_Salaam', 'Africa/Djibouti', 'Africa/Douala', 'Africa/El_Aaiun', 'Africa/Freetown', 'Africa/Gaborone', 'Africa/Harare', 'Africa/Johannesburg', 'Africa/Juba', 'Africa/Kampala', 'Africa/Khartoum', 'Africa/Kigali', 'Africa/Kinshasa', 'Africa/Lagos', 'Africa/Libreville', 'Africa/Lome', 'Africa/Luanda', 'Africa/Lubumbashi', 'Africa/Lusaka', 'Africa/Malabo', 'Africa/Maputo', 'Africa/Maseru', 'Africa/Mbabane', 'Africa/Mogadishu', 'Africa/Monrovia', 'Africa/Nairobi', 'Africa/Ndjamena', 'Africa/Niamey', 'Africa/Nouakchott', 'Africa/Ouagadougou', 'Africa/Porto-Novo', 'Africa/Sao_Tome', 'Africa/Timbuktu', 'Africa/Tripoli', 'Africa/Tunis', 'Africa/Windhoek', 'Ameri

In [108]:
from timezonefinder import TimezoneFinder

In [109]:
TimezoneFinder().timezone_at(lat=-56.9787885, lng=-61.932259)

'Etc/GMT+4'

In [116]:
songs_spotify["artists"].loc[i]

"['Maria Konopnicka']"

In [139]:
datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S%z")

'2024-02-06T20:44:07'

In [143]:
# Timezone string
timezone_str = 'America/New_York'

# Get the timezone object
timezone_obj = pytz.timezone(timezone_str)

# Get the current UTC offset in hours
utc_offset_hours = timezone_obj.utcoffset(datetime.now()).seconds
utc_offset_hours = utc_offset_hours // 3600

print(f"The UTC offset for {timezone_str} is {utc_offset_hours} hours.")

The UTC offset for America/New_York is 19 hours.


In [146]:
# Getting the local timezone
localTimeZone = pytz.timezone('CET')
# Getting the UTC timeZone
utcTimeZone = datetime.now(pytz.utc)
# format string
format = '%Y:%m:%dT%H:%M:%S%z'
# Convert the time to the local timezone
local = utcTimeZone.astimezone(localTimeZone)
# Getting formatted time using strftime() function
print("Formatted DateTime in Local Timezone : ",local.strftime(format))
print("Formatted DateTime in UTC Timezone : ",utcTimeZone.strftime(format))
difference = int(local.strftime('%z'))
difference2 = int(utcTimeZone.strftime('%z'))

Formatted DateTime in Local Timezone :  2024:02:06T21:52:08+0100
Formatted DateTime in UTC Timezone :  2024:02:06T20:52:08+0000


In [152]:
localTimeZone = pytz.timezone('Etc/GMT+6')
local = utcTimeZone.astimezone(localTimeZone)
local.strftime(format)

'2024:02:06T14:52:08-0600'

In [151]:
ll = datetime.now().astimezone(localTimeZone)
ll.strftime(format)

'2024:02:06T16:02:43-0500'