## Data Generation 

#### Here we used Google's Gemini and a local Mistral model served through Ollama (both LLMs) to obtain the remaining features (country and language) for each artist.

#### Google's Gemini API is rate-limited, so it could not process the whole dataset; in later stages we therefore parallelized the code as well.

In [None]:
import pandas as pd
import json
import pycountry
import langcodes
import time
from google import genai

import os

# The Gemini API key is read from the environment instead of being embedded in
# the notebook. Any key that was previously committed in source should be
# treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; export it before running this cell."
    )
client = genai.Client(api_key=_gemini_api_key)


def get_country_language_iso(artist_name):
    """Ask Gemini for an artist's country and primary song language as ISO codes.

    Args:
        artist_name: Artist name interpolated into the prompt.

    Returns:
        A ``(country, language)`` tuple read from the first flat JSON object
        in the model's reply that carries both a 'country' and a 'language'
        key (in either order). Returns ``(None, None)`` when the request
        raises or when no such object is present in the reply.
    """
    import re

    prompt = f"""
    What is the ISO 3166-1 alpha-2 country code of origin and the primary ISO 639-1 language code used in the songs of the artist '{artist_name}'?

    Return your answer as a JSON object with the keys 'country' and 'language'. If you are unsure or cannot determine this information, please return:

    {{ "country": null, "language": null }}

    The format of your response should strictly adhere to this JSON structure:

    {{ "country": "<ISO Country Code>", "language": "<ISO Language Code>" }}
    Do not give anything extra just the above response format.
    """

    try:
        response = client.models.generate_content(
            model="gemini-2.0-flash", contents=prompt
        )
        # Guard against a missing text payload so it is reported as
        # "no valid JSON" below rather than as an exception.
        text = (response.text or "").strip()
    except Exception as e:
        # Best-effort boundary: any API failure (including quota errors) is
        # logged and the artist is recorded as unknown.
        print(f"Error processing {artist_name}: {e}")
        return None, None

    # The model may wrap the answer in prose or code fences, so scan every
    # flat {...} span and accept the first one that parses with both keys.
    for candidate in re.findall(r'\{[^{}]*\}', text):
        try:
            data = json.loads(candidate)
        except ValueError:
            continue
        if isinstance(data, dict) and "country" in data and "language" in data:
            return data["country"], data["language"]

    print(f"No valid JSON found for artist: {artist_name}")
    return None, None

def iso_to_name(country_code, language_code):
    """Translate ISO country and language codes into display names.

    Args:
        country_code: Value looked up through ``pycountry`` as an alpha-2
            country code; a falsy value skips the lookup.
        language_code: Value passed to ``langcodes.get``; a falsy value skips
            the lookup.

    Returns:
        A ``(country_name, language_name)`` tuple. Each element is ``None``
        when its code is falsy or when the corresponding lookup fails.
    """
    country_name = None
    if country_code:
        try:
            country_name = pycountry.countries.get(alpha_2=country_code).name
        except Exception:
            # Best-effort lookup: a failed lookup maps to None. Catching
            # Exception (not a bare except) lets KeyboardInterrupt still
            # stop the surrounding long-running loop.
            country_name = None

    language_name = None
    if language_code:
        try:
            language_name = langcodes.get(language_code).display_name()
        except Exception:
            # Same best-effort policy as for the country lookup.
            language_name = None

    return country_name, language_name


# Load the artist list; the 'name' column drives every lookup below.
csv_path = "cleaned_nodes.csv"
df = pd.read_csv(csv_path)

if 'name' not in df.columns:
    raise ValueError("CSV must contain a column named 'name' with artist names.")

countries = []
languages = []

# One model query per artist, with a one-second pause between requests.
for artist in df['name']:
    print(f"Processing: {artist}")
    iso_codes = get_country_language_iso(artist)
    country_name, lang_name = iso_to_name(*iso_codes)
    countries.append(country_name)
    languages.append(lang_name)
    time.sleep(1)

# Attach the resolved names as new columns and write the enriched table.
df['country'] = countries
df['language'] = languages
df.to_csv("artists_with_country_language.csv", index=False)

Processing: Byklubben
Processing: Kontra K
Processing: Christopher Martin
Processing: Jakob Hellman
Processing: Juice
Processing: Nehuda
Processing: VovaZiLvova
Processing: Yomi
Processing: Kauniit & Uhkarohkeat
Processing: Danny Elfman
Processing: Attractions
Processing: Pálmi Gunnarsson
Processing: Suur Papa
Processing: Vybz Kartel
Processing: Apocalyptica
Processing: Dua Lipa
Processing: Stefanie Sun
Processing: Twins
Processing: Aspova
Processing: Artistic Raw
Processing: Angela Ken
Processing: Labanoon
Processing: João Gomes
Processing: Daniela Reyes
Processing: Benny The Butcher
Error processing Benny The Butcher: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'gene

KeyboardInterrupt: 

In [None]:
import pandas as pd
import json
import pycountry
import langcodes
import time
import subprocess

def ask_ollama(prompt):
    """Send a prompt to ``ollama run mistral`` and parse a JSON object from its output.

    Args:
        prompt: Text written to the command's standard input.

    Returns:
        A dict parsed from the command's output. The last output line is
        tried first; if it is not a JSON object, any flat ``{...}`` span in
        the output is tried, latest first. Returns
        ``{"country": None, "language": None}`` when the command cannot be
        run or no JSON object can be parsed.
    """
    import re

    try:
        result = subprocess.run(
            ["ollama", "run", "mistral"],
            input=prompt.encode(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except Exception as e:
        print(f"Ollama error: {e}")
        return {"country": None, "language": None}

    if result.returncode != 0:
        # Surface failures instead of silently producing empty results;
        # stdout is still parsed below.
        stderr_text = result.stderr.decode(errors="replace").strip()
        print(f"Ollama error: exit code {result.returncode}: {stderr_text}")

    output = result.stdout.decode(errors="replace").strip()

    # Last line first, then any flat JSON object embedded in surrounding text.
    candidates = [output.split('\n')[-1]]
    candidates.extend(reversed(re.findall(r'\{[^{}]*\}', output)))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:
            continue
        # Callers use .get(), so only a JSON object is an acceptable answer.
        if isinstance(parsed, dict):
            return parsed

    return {"country": None, "language": None}

def get_country_language_iso(artist_name):
    """Query the local model for an artist's ISO country and language codes.

    Builds the prompt for ``artist_name``, passes it to ``ask_ollama`` and
    returns the ``(country, language)`` pair read from the 'country' and
    'language' keys of the dict that call returns.
    """
    reply = ask_ollama(f"""
What is the ISO 3166-1 alpha-2 country code of origin and the primary ISO 639-1 language code used in the songs of the artist '{artist_name}'?

Return your answer as a JSON object with the keys 'country' and 'language'. If you are unsure or cannot determine this information, please return:

{{ "country": null, "language": null }}

The format of your response should strictly adhere to this JSON structure:

{{ "country": "<ISO Country Code>", "language": "<ISO Language Code>" }}
""")
    country_iso = reply.get("country")
    language_iso = reply.get("language")
    return country_iso, language_iso

def iso_to_name(country_code, language_code):
    """Translate ISO country and language codes into display names.

    Args:
        country_code: Value looked up through ``pycountry`` as an alpha-2
            country code; a falsy value skips the lookup.
        language_code: Value passed to ``langcodes.get``; a falsy value skips
            the lookup.

    Returns:
        A ``(country_name, language_name)`` tuple. Each element is ``None``
        when its code is falsy or when the corresponding lookup fails.
    """
    country_name = None
    if country_code:
        try:
            country_name = pycountry.countries.get(alpha_2=country_code).name
        except Exception:
            # Best-effort lookup: a failed lookup maps to None. Catching
            # Exception (not a bare except) lets KeyboardInterrupt still
            # stop the surrounding long-running loop.
            country_name = None

    language_name = None
    if language_code:
        try:
            language_name = langcodes.get(language_code).display_name()
        except Exception:
            # Same best-effort policy as for the country lookup.
            language_name = None

    return country_name, language_name

# Load the artist list; the 'name' column drives every lookup below.
csv_path = "cleaned_nodes.csv"
df = pd.read_csv(csv_path)

if 'name' not in df.columns:
    raise ValueError("CSV must contain a column named 'name' with artist names.")

countries = []
languages = []

# One local-model query per artist; codes are converted to display names.
for artist in df['name']:
    print(f"Processing: {artist}")
    iso_codes = get_country_language_iso(artist)
    country_name, lang_name = iso_to_name(*iso_codes)
    countries.append(country_name)
    languages.append(lang_name)

# Attach the resolved names as new columns and write the enriched table.
df['country'] = countries
df['language'] = languages
df.to_csv("artists_with_country_language.csv", index=False)


## Data Validation Using MusicBrainz API

#### After generating the data, we validated it using the MusicBrainz API.

#### The MusicBrainz API is itself rate-limited, so the code was again parallelized to run efficiently.

In [None]:
import pandas as pd
import requests
import time
from countryinfo import CountryInfo  


def get_artist_info(artist_name, max_retries=5, timeout=30):
    """Look up an artist on MusicBrainz and return the top match's name and country.

    Args:
        artist_name: Search text sent as the ``query`` parameter.
        max_retries: How many times a 503 response is retried (with a
            two-second pause) before giving up.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        ``(name, country)`` of the first artist in the response, where
        ``country`` is ``'Unknown'`` if that artist has no 'country' field.
        ``(None, None)`` when the response is not a 200, lists no artists,
        or still returns 503 after ``max_retries`` retries.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection failures or timeouts.
    """
    url = "https://musicbrainz.org/ws/2/artist/"
    # Passing the query through `params` URL-encodes it, so names containing
    # '&', '#', spaces or non-ASCII characters no longer corrupt the request.
    params = {'query': artist_name, 'fmt': 'json'}
    headers = {
        'User-Agent': 'Nikhil/1.0 (nikhil22322@iiitd.ac.in)'
    }

    retries_left = max_retries
    while True:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        if response.status_code != 503:
            break
        if retries_left <= 0:
            # Bounded retry replaces the former unbounded recursion.
            print(f"Giving up on {artist_name}: still rate limited after {max_retries} retries.")
            return None, None
        retries_left -= 1
        print("Rate limit exceeded. Sleeping for 2 seconds...")
        time.sleep(2)

    if response.status_code != 200:
        return None, None

    artists = response.json().get('artists') or []
    if not artists:
        return None, None

    top_match = artists[0]
    return top_match['name'], top_match.get('country', 'Unknown')

def get_country_language(country_name):
    """Return the first language listed by ``CountryInfo`` for a country.

    Falls back to ``'en'`` when the country's info has no languages, and
    also (after printing the error) when the lookup raises.
    """
    try:
        details = CountryInfo(country_name).info()
        spoken = details.get('languages', [])
        if not spoken:
            return 'en'
        return spoken[0]
    except Exception as e:
        print(f"Error retrieving language for {country_name}: {e}")
        return 'en'


# Load the artist list that the MusicBrainz lookups are keyed on.
csv_path = "cleaned_nodes.csv"
df = pd.read_csv(csv_path)

if 'name' not in df.columns:
    raise ValueError("CSV must contain a column named 'name' with artist names.")

countries = []
languages = []

# Look up each artist, then rewrite the CSV after every row so that an
# interrupted run keeps the progress made so far.
for artist in df['name']:
    print(f"Processing: {artist}")
    name, country = get_artist_info(artist)

    # Anything other than the 'Unknown' placeholder goes to the language lookup.
    language = get_country_language(country) if country != 'Unknown' else 'en'
    countries.append(country)
    languages.append(language)

    # Every row sharing this artist name receives the same values.
    same_name = df['name'] == artist
    df.loc[same_name, 'country'] = country
    df.loc[same_name, 'language'] = language
    df.to_csv("artists_with_country_language.csv", index=False)




Processing: Byklubben
Processing: Kontra K
Processing: Christopher Martin
Processing: Jakob Hellman
Processing: Juice
Processing: Nehuda
Processing: VovaZiLvova
Processing: Yomi
Processing: Kauniit & Uhkarohkeat
Processing: Danny Elfman
Processing: Attractions
Processing: Pálmi Gunnarsson
Processing: Suur Papa
Processing: Vybz Kartel
Processing: Apocalyptica
Processing: Dua Lipa
Processing: Stefanie Sun
Processing: Twins
Processing: Aspova
Processing: Artistic Raw
Processing: Angela Ken
Processing: Labanoon
Processing: João Gomes
Processing: Daniela Reyes
Processing: Benny The Butcher
Processing: Sharmoofers
Processing: Ray Dalton
Processing: Orange & Lemons
Processing: D1NO
Processing: Pep & Rash
Processing: B10
Processing: APO Hiking Society
Processing: Cristian Castro
Processing: Nobuo Uematsu
Processing: Kiana Ledé
Processing: SAMSONS
Processing: DON EAZY
Processing: Ez Mil
Processing: Sitti
Processing: Los Pikantes
Processing: Loren Allred
Processing: Darío Gómez
Processing: JEON 

KeyboardInterrupt: 

## Merging the Parallelized Data

#### After generating the metadata for each artist, we combined the partial results into one final CSV named `combined_dataset.csv`.

In [None]:
import pandas as pd
from functools import reduce

# Partial result files produced by the separate runs.
csv1 = pd.read_csv("artists_with_country_language _25k-45k.csv")
csv2 = pd.read_csv("artists_with_country_language 20k_25k.csv")
csv3 = pd.read_csv("artists_with_country_language_45k.csv")
csv4 = pd.read_csv("artists_with_country_language_upto20k.csv")

# Index every partial frame by Spotify id so rows line up across files.
dfs = [frame.set_index('spotify_id') for frame in (csv1, csv2, csv3, csv4)]

# Fold the frames left to right with combine_first: values already present are
# kept and gaps are filled from the next frame (no _x/_y suffixed columns).
merged = reduce(lambda filled, extra: filled.combine_first(extra), dfs)

# Turn 'spotify_id' back into a regular column before writing the result.
merged = merged.reset_index()
merged.to_csv("combined_dataset.csv", index=False)
