#### Generating the unbiased user interactions dataset

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Build a synthetic "unbiased" interaction log: every user gets a Poisson-distributed
# number of listens, ~70% of them drawn from songs in the user's preferred genres.
users = pd.read_csv('spotify_user_dataset.csv')
songs = pd.read_csv('spotify_song_attributes.csv')

interactions = []  # one DataFrame of interactions per user, concatenated at the end

np.random.seed(42)
n_interactions_per_user = 50

interaction_columns = ['user_id', 'track_id', 'listen_count', 'skip', 'timestamp']
all_track_ids = songs['track_id'].to_numpy()

# Timestamps are expressed relative to the moment the cell runs, so they are not
# reproducible across runs even though the random draws are seeded.
reference_time = pd.Timestamp.now()

for user_id, raw_genres in zip(users['user_id'], users['preferred_genres']):
    # `preferred_genres` is a comma-separated string; tolerate padding around the
    # commas and missing values (a missing value means "no preferred genre").
    if isinstance(raw_genres, str):
        preferred_genres = [g.strip() for g in raw_genres.split(',') if g.strip()]
    else:
        preferred_genres = []

    in_preferred = songs['genre'].isin(preferred_genres).to_numpy()
    eligible_ids = all_track_ids[in_preferred]
    other_ids = all_track_ids[~in_preferred]

    n_events = np.random.poisson(n_interactions_per_user)
    if n_events == 0 or len(all_track_ids) == 0:
        continue

    # 70% of listens come from the preferred genres; when one of the two pools is
    # empty every listen falls back to the other pool instead of raising.
    from_preferred = np.random.random(n_events) < 0.7
    if len(eligible_ids) == 0:
        from_preferred[:] = False
    elif len(other_ids) == 0:
        from_preferred[:] = True

    n_preferred = int(from_preferred.sum())
    n_other = n_events - n_preferred

    # Draw all tracks for this user in two vectorised picks rather than one
    # DataFrame.sample() call per interaction.
    track_ids = np.empty(n_events, dtype=object)
    if n_preferred:
        picks = np.random.randint(len(eligible_ids), size=n_preferred)
        track_ids[from_preferred] = eligible_ids[picks]
    if n_other:
        picks = np.random.randint(len(other_ids), size=n_other)
        track_ids[~from_preferred] = other_ids[picks]

    # Tracks from preferred genres are skipped 30% of the time, the rest 70%.
    skip_prob = np.where(from_preferred, 0.3, 0.7)
    days_ago = np.random.randint(1, 180, size=n_events)

    interactions.append(pd.DataFrame({
        'user_id': user_id,
        'track_id': track_ids,
        'listen_count': np.random.poisson(2, size=n_events) + 1,
        'skip': np.random.random(n_events) < skip_prob,
        'timestamp': reference_time - pd.to_timedelta(days_ago, unit='D'),
    }))

if interactions:
    interactions_df = pd.concat(interactions, ignore_index=True)[interaction_columns]
else:
    interactions_df = pd.DataFrame(columns=interaction_columns)
interactions_df.to_csv('unbiased_user_interactions.csv', index=False)

KeyboardInterrupt: 

#### Adding a favorite artist to the unbiased Spotify user dataset

In [6]:
from collections import defaultdict

# Load the unbiased user table from disk. NOTE(review): the same file is overwritten
# at the end of this cell, so re-running the cell starts from the already modified data.
unbiased_spotify_user_dataset = pd.read_csv('unbiased_spotify_user_dataset.csv')

# Hand-curated catalogue: each genre label with the artists filed under it.
# The order of genres and of artists within a genre is significant because the
# lookups derived below keep that order.
_ARTISTS_BY_GENRE = {
    "Pop": [
        "Britney Spears", "Taylor Swift", "Lauv", "Ed Sheeran", "Adele",
        "Selena Gomez", "Dua Lipa", "Ariana Grande", "Olivia Rodrigo",
        "Maroon 5", "Justin Bieber", "Billie Eilish", "Rita Ora",
    ],
    "Jazz": [
        "Nina Simone", "Miles Davis", "John Coltrane", "Herbie Hancock",
        "Louis Armstrong",
    ],
    "Electronic": [
        "Calvin Harris", "Martin Garrix", "Zedd", "Marshmello", "Avicii",
        "David Guetta", "Tiësto", "Kygo", "Deadmau5",
    ],
    "Rock": [
        "Linkin Park", "Coldplay", "Imagine Dragons", "Queen", "AC/DC",
        "Nirvana", "Green Day", "Foo Fighters",
    ],
    "Classical": [
        "Johannes Brahms", "Wolfgang Amadeus Mozart", "Ludwig van Beethoven",
        "Tchaikovsky", "Richard Strauss", "Pyotr Ilyich Tchaikovsky",
        "Hans Zimmer",  # orchestral soundtrack
        "London Symphony Orchestra", "Seattle Symphony Orchestra",
    ],
    "Hip-hop": [
        "NF", "$uicideboy$", "Drake", "Kendrick Lamar", "Eminem",
        "Travis Scott", "Lil Nas X", "Post Malone",
    ],
    "Indie": [
        "The Neighbourhood", "Tame Impala", "Foster The People", "The 1975",
        "Vampire Weekend", "Florence + The Machine", "Arctic Monkeys",
    ],
    "Country": [
        "Dan + Shay", "Zac Brown Band", "Kacey Musgraves", "Luke Bryan",
        "Carrie Underwood", "Thomas Rhett",
    ],
}

# Flat artist -> genre lookup.
artist_genre_mapping = {
    artist: genre
    for genre, artists in _ARTISTS_BY_GENRE.items()
    for artist in artists
}

# Reverse lookup genre -> list of artists, kept as a defaultdict(list).
genre_to_artists = defaultdict(list)
for genre, artists in _ARTISTS_BY_GENRE.items():
    genre_to_artists[genre].extend(artists)

def assign_favorite_artist(preferred_genre, genre_map=None):
    """Pick a random favorite artist matching a user's preferred genre(s).

    Parameters
    ----------
    preferred_genre : str or missing value
        A genre label, or several labels separated by commas (the format the
        interaction generator splits on). Whitespace around labels is ignored.
    genre_map : mapping of str -> list of str, optional
        Genre-to-artists lookup. Defaults to the module-level ``genre_to_artists``.

    Returns
    -------
    str or float
        An artist drawn uniformly (via ``np.random.choice``) from the artists of
        all recognised genres, or ``np.nan`` when the value is missing or none
        of its genres is known.
    """
    if genre_map is None:
        genre_map = genre_to_artists

    # Missing values read from CSV arrive as float NaN, not as strings.
    if not isinstance(preferred_genre, str):
        return np.nan

    # Pool the artists of every listed genre; `.get` avoids inserting keys
    # into a defaultdict for unknown labels.
    candidates = []
    for label in preferred_genre.split(','):
        candidates.extend(genre_map.get(label.strip(), []))

    if not candidates:
        return np.nan
    return np.random.choice(candidates)

# Attach a randomly chosen favorite artist to every user, discard users for whom
# no artist could be assigned, then write the table back over the source file.
unbiased_spotify_user_dataset = (
    unbiased_spotify_user_dataset
    .assign(favorite_artist=lambda frame: frame['preferred_genres'].apply(assign_favorite_artist))
    .dropna(subset=['favorite_artist'])
)
unbiased_spotify_user_dataset.to_csv("unbiased_spotify_user_dataset.csv", index=False)

#### Generating the biased Spotify user dataset

In [8]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Reload the unbiased user table from disk; it is the input to bias_dataset() below.
unbiased_spotify_user_dataset = pd.read_csv('unbiased_spotify_user_dataset.csv')

def bias_dataset(df, seed=None):
    """Return a copy of ``df`` whose synthetic attributes are correlated with age.

    The input frame must have an ``age`` column. The returned copy has these
    columns (re)generated from random draws:

    * ``education_level``  - drawn with age-bracket-specific probabilities;
    * ``listening_hours``  - normal draw whose mean decreases with age, floored at 1;
    * ``preferred_genres`` - a single genre, driven by the education level;
    * ``favorite_artist``  - a random artist of the preferred genre.

    Parameters
    ----------
    df : pandas.DataFrame
        User table; it is not modified.
    seed : int, optional
        When given, seeds NumPy's global random state first so the output is
        reproducible. When omitted, the current global state is used.

    Returns
    -------
    pandas.DataFrame
        The biased copy of ``df``.
    """
    if seed is not None:
        np.random.seed(seed)

    df_biased = df.copy()

    education_levels = ["Bachelor's", "Master's", "PhD", "Some college", "High school"]

    # Education distribution by age bracket.
    def assign_education(age):
        # The youngest bracket has no lower bound, so users younger than 15 are
        # not sent to the "55+" fallback. A missing age fails every comparison
        # and therefore still ends up in the last bracket.
        if age <= 24:
            probs = [0.3, 0.3, 0.2, 0.1, 0.1]
        elif age <= 34:
            probs = [0.2, 0.15, 0.15, 0.25, 0.25]
        elif age <= 44:
            probs = [0.15, 0.15, 0.10, 0.30, 0.30]
        elif age <= 54:
            probs = [0.10, 0.10, 0.10, 0.35, 0.35]
        else:  # 55+
            probs = [0.05, 0.05, 0.10, 0.4, 0.4]
        return np.random.choice(education_levels, p=probs)

    df_biased["education_level"] = df_biased["age"].apply(assign_education)

    # Listening hours by age; same brackets as the education assignment.
    def assign_listening_hours(age):
        if age <= 24:
            return np.random.normal(25, 5)
        elif age <= 34:
            return np.random.normal(22, 5)
        elif age <= 44:
            return np.random.normal(20, 4)
        elif age <= 54:
            return np.random.normal(18, 4)
        else:  # 55+
            return np.random.normal(15, 4)

    # The normal draw can be arbitrarily small or negative; floor it at one hour.
    df_biased["listening_hours"] = df_biased["age"].apply(assign_listening_hours).clip(lower=1)

    # Preferred genre by education level.
    def assign_genre(age, edu):
        # `age` is accepted but does not influence the draw: both the weights
        # and the shortlist below are selected by the education level only.
        all_genres = ["Pop", "Jazz", "Electronic", "Rock", "Classical", "Hip-hop", "Indie", "Country"]

        if edu in ["High school", "Some college"]:
            weights = [0.4, 0.05, 0.05, 0.3, 0.02, 0.15, 0.02, 0.01]
            # Uses the catalogue's own label "Hip-hop"; a label that is not in
            # `all_genres` (such as "Rap") would be filtered out below and
            # could never be drawn.
            edu_genres = ["Pop", "Hip-hop", "Rock"]
        elif edu in ["Bachelor's", "Master's"]:
            weights = [0.25, 0.1, 0.2, 0.2, 0.05, 0.1, 0.05, 0.05]
            edu_genres = ["Pop", "Rock", "Electronic"]
        elif edu == "PhD":
            weights = [0.1, 0.25, 0.05, 0.1, 0.25, 0.05, 0.15, 0.05]
            edu_genres = ["Jazz", "Classical", "Indie"]
        else:  # unknown education level
            weights = [0.125] * 8
            edu_genres = all_genres

        # Keep only shortlist entries that are real genres, in catalogue order.
        # A list (not a set) keeps the draw reproducible for a given seed.
        shortlist = [genre for genre in all_genres if genre in edu_genres]

        # 80% of the time draw uniformly from the education shortlist,
        # otherwise draw from all genres using the weighted distribution.
        if shortlist and np.random.rand() < 0.8:
            return np.random.choice(shortlist)
        return np.random.choice(all_genres, p=weights)

    df_biased["preferred_genres"] = df_biased.apply(
        lambda row: assign_genre(row["age"], row["education_level"]), axis=1
    )

    # Genre -> artists catalogue used to pick a favorite artist.
    genre_to_artists = {
        "Pop": [
            "Britney Spears", "Taylor Swift", "Lauv", "Ed Sheeran", "Adele",
            "Selena Gomez", "Dua Lipa", "Ariana Grande", "Olivia Rodrigo",
            "Maroon 5", "Justin Bieber", "Billie Eilish", "Rita Ora",
        ],
        "Jazz": [
            "Nina Simone", "Miles Davis", "John Coltrane", "Herbie Hancock",
            "Louis Armstrong",
        ],
        "Electronic": [
            "Calvin Harris", "Martin Garrix", "Zedd", "Marshmello", "Avicii",
            "David Guetta", "Tiësto", "Kygo", "Deadmau5",
        ],
        "Rock": [
            "Linkin Park", "Coldplay", "Imagine Dragons", "Queen", "AC/DC",
            "Nirvana", "Green Day", "Foo Fighters",
        ],
        "Classical": [
            "Johannes Brahms", "Wolfgang Amadeus Mozart", "Ludwig van Beethoven",
            "Tchaikovsky", "Richard Strauss", "Pyotr Ilyich Tchaikovsky",
            "Hans Zimmer", "London Symphony Orchestra", "Seattle Symphony Orchestra",
        ],
        "Hip-hop": [
            "NF", "$uicideboy$", "Drake", "Kendrick Lamar", "Eminem",
            "Travis Scott", "Lil Nas X", "Post Malone",
        ],
        "Indie": [
            "The Neighbourhood", "Tame Impala", "Foster The People", "The 1975",
            "Vampire Weekend", "Florence + The Machine", "Arctic Monkeys",
        ],
        "Country": [
            "Dan + Shay", "Zac Brown Band", "Kacey Musgraves", "Luke Bryan",
            "Carrie Underwood", "Thomas Rhett",
        ],
    }
    all_artists = [artist for artists in genre_to_artists.values() for artist in artists]

    def assign_artist(genre):
        # Unknown genres fall back to a draw over the whole catalogue.
        if genre in genre_to_artists:
            return np.random.choice(genre_to_artists[genre])
        return np.random.choice(all_artists)

    df_biased["favorite_artist"] = df_biased["preferred_genres"].apply(assign_artist)

    return df_biased

# Derive the biased user table from the unbiased one and save it without the index column.
biased_spotify_user_dataset = bias_dataset(unbiased_spotify_user_dataset)
biased_spotify_user_dataset.to_csv('biased_spotify_user_dataset.csv', index=False)
# Last expression of the cell: preview the first rows.
biased_spotify_user_dataset.head()

Unnamed: 0,user_id,age,gender,education_level,listening_hours,preferred_genres,favorite_artist,discovery_channel,country
0,1,57,Other,Bachelor's,20.108202,Electronic,Calvin Harris,Concert,Mexico
1,2,60,Female,Bachelor's,16.885878,Electronic,Zedd,Search,Canada
2,3,66,Male,Some college,17.487272,Hip-hop,Kendrick Lamar,Friend,Germany
3,4,13,Male,Some college,16.192137,Rock,Green Day,Social Media,France
4,5,16,Other,Bachelor's,25.468328,Pop,Selena Gomez,Playlist,Mexico


#### Generating biased user interactions dataset

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

def generate_user_song_interactions_target(users_df, songs_df, target_size=499230, random_state=42):
    """Generate ``target_size`` synthetic user/track interactions.

    Users are spread as evenly as possible over the interactions (each user gets
    ``target_size // n_users`` rows, and ``target_size % n_users`` randomly chosen
    users get one more). Tracks are sampled uniformly with replacement. The bias
    lives in ``listen_count``: a Poisson(3) base, plus 3-9 when the track's genre
    equals the user's preferred genre, plus 5-14 when the track's artist equals
    the user's favorite artist, floored at 1.

    Parameters
    ----------
    users_df : pandas.DataFrame
        Needs ``user_id``, ``preferred_genres`` and ``favorite_artist`` columns.
    songs_df : pandas.DataFrame
        Needs ``track_id``, ``genre`` and ``artist_name`` columns.
    target_size : int
        Number of interaction rows to produce.
    random_state : int or None
        Seed for every random draw in this function. ``None`` gives a
        non-reproducible result.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``track_id``, ``listen_count``, ``timestamp``
        (timestamps are whole days in [2020-01-01, 2025-01-01)).

    Raises
    ------
    ValueError
        If ``users_df`` is empty, or ``songs_df`` is empty while rows are requested.
    """
    n_users = len(users_df)
    if n_users == 0:
        raise ValueError("users_df must contain at least one user")
    if len(songs_df) == 0 and target_size > 0:
        raise ValueError("songs_df must contain at least one song")

    # One generator for listen counts and timestamps so they are reproducible
    # together with the seeded DataFrame.sample() calls below.
    rng = np.random.default_rng(random_state)

    base_tracks, remainder = divmod(target_size, n_users)

    # Positional repeat, so a non-unique index on users_df cannot multiply rows.
    repeat_positions = np.repeat(np.arange(n_users), base_tracks)
    repeated_users = users_df.iloc[repeat_positions].reset_index(drop=True)

    if remainder > 0:
        extra_users = users_df.sample(
            n=remainder, replace=False, random_state=random_state
        ).reset_index(drop=True)
        repeated_users = pd.concat([repeated_users, extra_users], ignore_index=True)

    sampled_tracks = songs_df.sample(
        n=target_size, replace=True, random_state=random_state
    ).reset_index(drop=True)

    # Both frames now share a 0..target_size-1 index, so columns align row by row.
    interactions = pd.DataFrame({
        "user_id": repeated_users["user_id"],
        "preferred_genres": repeated_users["preferred_genres"],
        "favorite_artist": repeated_users["favorite_artist"],
        "track_id": sampled_tracks["track_id"],
        "genre": sampled_tracks["genre"],
        "artist_name": sampled_tracks["artist_name"]
    })

    interactions["listen_count"] = rng.poisson(lam=3, size=len(interactions))

    # Boost listens for tracks in the user's preferred genre (upper bound exclusive).
    genre_mask = interactions["genre"] == interactions["preferred_genres"]
    interactions.loc[genre_mask, "listen_count"] += rng.integers(3, 10, size=int(genre_mask.sum()))

    # Further boost for tracks by the user's favorite artist.
    artist_mask = interactions["artist_name"] == interactions["favorite_artist"]
    interactions.loc[artist_mask, "listen_count"] += rng.integers(5, 15, size=int(artist_mask.sum()))

    # The Poisson base can be 0; every recorded interaction counts at least one listen.
    interactions["listen_count"] = interactions["listen_count"].clip(lower=1)

    start_date = datetime(2020, 1, 1)
    end_date = datetime(2025, 1, 1)
    total_days = (end_date - start_date).days
    interactions["timestamp"] = pd.to_datetime(
        rng.integers(0, total_days, size=len(interactions)),
        unit='D', origin=start_date
    )

    return interactions[["user_id", "track_id", "listen_count", "timestamp"]]

# Inputs: the biased user table and the song catalogue.
biased_users = pd.read_csv("biased_spotify_user_dataset.csv")
songs = pd.read_csv("spotify_song_attributes.csv")

biased_user_interactions = generate_user_song_interactions_target(
    biased_users,
    songs,
    target_size=499230
)

# index=False, like the other CSV exports in this notebook: the row index is a
# plain 0..n-1 counter and would otherwise be written as an unnamed first column.
biased_user_interactions.to_csv('biased_user_interactions.csv', index=False)