<a href="https://colab.research.google.com/github/SJinji/recommendation-system-with-last.fm-dataset/blob/main/Deezer_1_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! unzip /content/deezer-tech-test-DS-internship.zip

In [2]:
! pip install spotipy



In [3]:
# Import necessary libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import spotipy
import string
import nltk
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm.auto import tqdm

In [8]:
# Function to fetch artist info from Spotify API
def get_artist_info(spotify, name):
    """Return Spotify metadata for the search hit closest to ``name``.

    Runs an ``artist:<name>`` search through ``spotify.search`` and keeps the
    returned artist whose lower-cased name has the smallest edit distance to
    the lower-cased query (the earliest hit wins a tie).

    Args:
        spotify: Client object exposing ``search(q=..., type=...)``.
        name: Artist name to look up.

    Returns:
        A ``(spotify_name, img_url, genres, spotify_url)`` tuple. When the
        search yields no artists this is ``(name, None, [], None)``;
        ``img_url`` is also ``None`` when the chosen artist has no images.
    """
    response = spotify.search(q=f'artist:{name}', type='artist')
    candidates = response['artists']['items']

    # Nothing found: echo the queried name back with empty metadata.
    if len(candidates) == 0:
        return name, None, [], None

    # Pick the hit whose name is closest to the query (Edit Distance).
    closest = min(candidates, key=lambda hit: nltk.edit_distance(name.lower(), hit["name"].lower()))

    genre_list = closest["genres"]
    canonical_name = closest["name"]
    profile_url = closest["external_urls"]["spotify"]
    pictures = closest["images"]
    # Use the first image in the list when the artist has any.
    picture_url = pictures[0]["url"] if len(pictures) > 0 else None

    return canonical_name, picture_url, genre_list, profile_url

import time

# Function to fetch Spotify data for all unique artist names in the DataFrame
def fetch_spotify_data(original_df, client_id, client_secret):
    """Look up every distinct artist of ``original_df`` on Spotify.

    Args:
        original_df: DataFrame with an ``artistName`` column.
        client_id: Spotify API client id.
        client_secret: Spotify API client secret.

    Returns:
        A DataFrame with the columns ``artistName``, ``spotifyName``,
        ``imageUrl``, ``genres`` (the genre list rendered with ``str``) and
        ``spotifyUrl``, one row per artist whose lookup did not raise.
        Artists whose lookup fails are reported on stdout and left out.
    """
    columns = ["artistName", "spotifyName", "imageUrl", "genres", "spotifyUrl"]

    # Authenticate Spotify API client
    spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

    # Skip missing names: str(NaN) would otherwise be sent to Spotify as the
    # literal query "nan" and produce a row that can never join back onto a
    # missing artistName.
    artist_names = list(original_df["artistName"].dropna().unique())

    # Collect plain dicts and build the frame once at the end; concatenating
    # inside the loop re-copies all previous rows on every iteration.
    rows = []
    for artist_name_in in tqdm(artist_names):
        artist_name = str(artist_name_in)
        try:
            # Fetch artist info from Spotify API
            print(f"Fetching data for artist: {artist_name}")
            spotify_name, image_url, genres_list, spotify_url = get_artist_info(spotify, artist_name)
        except Exception as e:
            # Best effort: report the failing artist and move on to the next one.
            print(f"Error fetching data for artist: {artist_name}")
            print("Error message:", str(e))
        else:
            rows.append({
                "artistName": artist_name,
                "spotifyName": spotify_name,
                "imageUrl": image_url,
                "genres": str(genres_list),
                "spotifyUrl": spotify_url,
            })
        finally:
            # Wait 0.5 seconds between requests to stay within the rate limit,
            # including after a failed request.
            time.sleep(0.5)

    return pd.DataFrame(rows, columns=columns)


In [None]:
def main():
    """Build the enriched dataset and write it to CSV.

    Reads the five tab-separated ``.dat`` files under
    ``deezer-business-case/data``, merges them into a single frame, fetches
    Spotify metadata for every artist name, and writes two files into the
    current directory: ``spotify_data.csv`` (the Spotify lookup alone) and
    ``original_and_spotify_data.csv`` (the merged frame joined with it).

    The Spotify credentials are read from the ``SPOTIPY_CLIENT_ID`` and
    ``SPOTIPY_CLIENT_SECRET`` environment variables when they are set.
    """
    import os  # local import: only needed for the credential lookup below

    # Load original data
    artists = pd.read_csv('deezer-business-case/data/artists.dat', sep="\t")
    tags = pd.read_csv('deezer-business-case/data/tags.dat', encoding="gbk", sep="\t")
    user_artists = pd.read_csv('deezer-business-case/data/user_artists.dat', sep="\t")
    user_friends = pd.read_csv('deezer-business-case/data/user_friends.dat', sep='\t')
    tag_artists = pd.read_csv('deezer-business-case/data/user_taggedartists.dat', sep='\t')

    # Merge data to create original_df DataFrame
    df1 = pd.merge(user_artists, user_friends, on="userID", how="outer")
    df2 = pd.merge(df1, artists, left_on="artistID", right_on="id", how="outer").drop(columns=["id", "url", "pictureURL"])
    df3 = pd.merge(tag_artists, tags, on="tagID", how="left")
    original_df = pd.merge(df2, df3, on=["userID", "artistID"], how="outer")

    # Rename columns for clarity
    original_df.rename(columns={"name": "artistName", "weight": "artistWeight"}, inplace=True)

    # Spotify API credentials: prefer the environment so secrets need not live
    # in the notebook.
    # NOTE(security): the literal fallbacks are kept only so existing runs keep
    # working; they are committed secrets and should be rotated and removed.
    client_id = os.environ.get("SPOTIPY_CLIENT_ID", "22ba1096ffda44fca858c1c6880ca020")
    client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET", "2342a132a9a74186bdff50fce2a95778")

    # Fetch Spotify data for artists
    print("Fetching Spotify data...")
    spotify_data = fetch_spotify_data(original_df, client_id, client_secret)
    print("Spotify data fetched successfully!")

    # Persist the lookup before the merge: it is the slow, rate-limited step,
    # so a failure while merging should not discard it.
    spotify_data.to_csv('spotify_data.csv', index=False)

    # Merge Spotify data with the original DataFrame
    df = pd.merge(original_df, spotify_data, on="artistName", how="left")

    # Save the merged data to a CSV file
    df.to_csv('original_and_spotify_data.csv', index=False)

if __name__ == "__main__":
    main()