In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

# Input files, relative to the notebook's working directory.
artists, tracks = 'dataset/artists.csv', 'dataset/tracks.csv'

# In both files the first column holds the row ID and becomes the index.
index_col = 0

# artists.csv is semicolon-separated; tracks.csv uses the default comma.
df_tracks = pd.read_csv(tracks, index_col=index_col)
df_artists = pd.read_csv(artists, sep=';', index_col=index_col)

In [3]:
# Attributes stored as pandas categoricals.
for col in ('gender', 'nationality', 'country', 'region', 'province', 'birth_place'):
    df_artists[col] = df_artists[col].astype('category')

# Dates: values that cannot be parsed become NaT instead of raising.
for col in ('birth_date', 'active_start'):
    df_artists[col] = pd.to_datetime(df_artists[col], errors='coerce')

# Free text stored with the pandas string dtype.
for col in ('description', 'name'):
    df_artists[col] = df_artists[col].astype('string')

In [4]:
# Identifiers and enumerated attributes stored as pandas categoricals.
for col in ('id_artist', 'id_album', 'language', 'album_type'):
    df_tracks[col] = df_tracks[col].astype('category')

# Numeric columns: entries that cannot be parsed become NaN instead of raising.
for col in ('stats_pageviews', 'year', 'month', 'day', 'popularity'):
    df_tracks[col] = pd.to_numeric(df_tracks[col], errors='coerce')

# Nullable integers, so missing values stay missing.
for col in ('disc_number', 'track_number'):
    df_tracks[col] = df_tracks[col].astype('Int64')

# Boolean flags.
# NOTE(review): astype('bool') turns NaN into True; confirm that these two
# columns contain no missing values in the raw file.
for col in ('explicit', 'modified_popularity'):
    df_tracks[col] = df_tracks[col].astype('bool')

# Dates: values that cannot be parsed become NaT instead of raising.
df_tracks['album_release_date'] = pd.to_datetime(df_tracks['album_release_date'], errors='coerce')

# Free text stored with the pandas string dtype.
for col in (
    'name_artist',
    'full_title',
    'title',
    'featured_artists',
    'primary_artist',
    'album_name',
    'album',
    'album_image',
    'lyrics',
):
    df_tracks[col] = df_tracks[col].astype('string')

In [5]:
import ast # Import the Abstract Syntax Tree module for safe evaluation

# Assuming your DataFrame is df_tracks and it's already loaded

def safe_literal_eval(value):
    """
    Convert the string representation of a list into a Python list.

    Parameters
    ----------
    value : object
        Normally a string such as "['a', 'b']". A value that is already a
        list is returned unchanged, so applying the function twice (for
        example when the cell is re-run) does not destroy parsed data.

    Returns
    -------
    The parsed literal for a valid string, the input itself when it is
    already a list, and an empty list for missing values (None, NaN, pd.NA,
    '', 'NaN') or for input that cannot be parsed. Unparseable input also
    prints a warning.
    """
    # Already parsed: pass through untouched. Without this guard, pd.isna()
    # on a list returns an array whose truth value is ambiguous, and
    # literal_eval() on a list fails, silently replacing real data with [].
    if isinstance(value, list):
        return value

    if not isinstance(value, str):
        # Non-string input is either a missing-value scalar or an unexpected type.
        try:
            is_missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna() returned an array (tuple, ndarray, ...): not a missing scalar.
            is_missing = False
        if is_missing:
            return []
        print(f"Warning: Could not convert value: {value}")
        return []

    if value in ('NaN', ''):
        # Textual placeholders for "no data".
        return []

    try:
        # literal_eval only evaluates Python literals; it never executes code.
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed strings fall back to an empty list instead of aborting the apply().
        print(f"Warning: Could not convert value: {value}")
        return []

# Parse the stringified word lists of both swear-word columns into Python lists.
for col in ('swear_IT_words', 'swear_EN_words'):
    df_tracks[col] = df_tracks[col].apply(safe_literal_eval)

We begin the data preparation phase by checking the row IDs (the index) of both dataframes for duplicates. The review of the primary ID columns yielded the following observations:

* df_tracks: Inspection of the track ID column revealed 73 instances of duplicated identifiers. To guarantee that each record is uniquely identifiable and to maintain the principle of one-to-one entity mapping, these duplicated rows will be managed immediately. IDs are of the format $\text{TR\#\#\#\#\#\#}$, so we generate new IDs compliant with this format to replace duplicated ones.

* df_artists: The artist ID column was found to be entirely sound, presenting no instances of duplicate IDs. Consequently, no corrective action is required for this dataframe regarding its primary keys.

In [6]:
# Sets of every ID currently in use, for fast membership checks.
existing_tracks_ids, existing_artists_ids = set(df_tracks.index), set(df_artists.index)

# Flag repeated index values. With the default keep='first', the first
# occurrence of an ID is not flagged; only the later repeats are.
duplicate_mask_tracks = df_tracks.index.duplicated()
num_duplicates_tracks = duplicate_mask_tracks.sum()

duplicate_mask_artists = df_artists.index.duplicated()
num_duplicates_artists = duplicate_mask_artists.sum()

print("number of duplicate index for tracks:", num_duplicates_tracks)
print("number of duplicate index for artists:", num_duplicates_artists)

number of duplicate index for tracks: 73
number of duplicate index for artists: 0


In [7]:
# keep=False flags every row whose index value occurs more than once,
# including the first occurrence.
all_duplicates_mask = df_tracks.index.duplicated(keep=False)

# Rows that share their ID with at least one other row.
df_duplicate_groups = df_tracks[all_duplicates_mask]

# Sorting by index puts rows with the same ID next to each other.
df_duplicate_groups_sorted = df_duplicate_groups.sort_index()

if df_duplicate_groups_sorted.empty:
    # No index value is repeated.
    print("Nessuna riga con indice duplicato trovata.")
else:
    print("Mostro tutte le righe che hanno un indice duplicato, ordinate per ID:")
    # Only the first 30 rows are printed; raise the limit to inspect more.
    print(df_duplicate_groups_sorted.head(30))

Mostro tutte le righe che hanno un indice duplicato, ordinate per ID:
            id_artist       name_artist  \
id                                        
TR108862  ART56320683     Bassi Maestro   
TR108862  ART07127070       Noyz Narcos   
TR135764  ART73965015            Ghemon   
TR135764  ART86549066        Emis Killa   
TR190585  ART78209349              Coez   
TR190585  ART66932389            Piotta   
TR192351  ART81071062         Club Dogo   
TR192351  ART88792008     Jake La Furia   
TR205970  ART80977821   Jack The Smoker   
TR205970  ART08456301           Rancore   
TR213881  ART07469279  Johnny Marsiglia   
TR213881  ART85046033          Gemitaiz   
TR230274  ART18853907              Alfa   
TR230274  ART08177154            Il Tre   
TR237380  ART25707984       Fabri Fibra   
TR237380  ART40229749            Baby K   
TR245683  ART48537029          Mistaman   
TR245683  ART78358659             Nitro   
TR247772  ART59609037        Cor Veleno   
TR247772  ART64850829      

In [8]:
import random

# --- ASSUMPTIONS ---
# 1. df_tracks is your DataFrame.
# 2. The column containing the track IDs is the index of the DataFrame.
# 3. There are 73 duplicated IDs that need replacement.

# --- 1. Identify Duplicated Rows and Count ---
# Boolean mask over the index: True for each repeat of an ID after its first
# occurrence (keep='first' is the default of Index.duplicated).
duplicated_mask = df_tracks.index.duplicated()

# Number of rows that need a fresh ID.
num_duplicates_to_replace = duplicated_mask.sum()
print(num_duplicates_to_replace)

# --- 2. Define ID Generation Helper ---
def format_track_id(number, prefix='TR', padding=6):
    """Return `prefix` followed by `number` zero-padded to `padding` digits.

    With the defaults, 42 becomes 'TR000042'. Numbers with more digits than
    `padding` are not truncated.
    """
    # Build the integer format spec first (e.g. '06d'), then apply it.
    digit_spec = "0{}d".format(padding)
    return "{}{}".format(prefix, format(number, digit_spec))

# --- 3. Generate New Unique IDs with Collision Check ---

# A dedicated, seeded generator makes the replacement IDs identical on every
# Restart & Run All; the global `random` state is left untouched.
ID_SEED = 42
id_rng = random.Random(ID_SEED)

# Set of IDs already in use, for fast membership checks.
existing_ids = set(df_tracks.index)
new_track_ids = []

# Range for 6-digit numbers (000000 to 999999)
MIN_ID = 0
MAX_ID = 999999

print(f"Generating {num_duplicates_to_replace} random unique IDs...")

while len(new_track_ids) < num_duplicates_to_replace:
    # Draw a candidate and format it as "TRXXXXXX".
    new_id = format_track_id(id_rng.randint(MIN_ID, MAX_ID))

    # Accept it only if it collides neither with an ID already in the data
    # nor with one generated earlier in this loop.
    if new_id not in existing_ids:
        new_track_ids.append(new_id)
        existing_ids.add(new_id)

print("Finished generating unique IDs.")

# --- 4. Replace Duplicated IDs in the DataFrame Index ---

# Old (duplicated) ID of every row that receives a new one, taken before the
# index is replaced.
indices_to_replace = df_tracks.index[duplicated_mask]

# Mapping old duplicated ID -> newly assigned ID, kept for traceability.
new_ids_series = pd.Series(
    new_track_ids,
    index=indices_to_replace
)

# A pandas Index is immutable by contract. Writing into `index.values` changes
# the backing array behind its back and can leave cached index state stale.
# Instead, edit a private copy and assign a fresh Index.
if new_track_ids:
    updated_index = df_tracks.index.to_numpy(dtype=object, copy=True)
    updated_index[duplicated_mask] = new_track_ids
    df_tracks.index = pd.Index(updated_index, name=df_tracks.index.name)

# --- Verification ---
print(f"\nGenerated {len(new_track_ids)} new unique IDs.")
if new_track_ids:
    # Guarded: when nothing had to be replaced (e.g. on a re-run) the list is empty.
    print(f"Example new ID: {new_track_ids[0]}")
print(f"Check for duplicates after replacement: {df_tracks.index.duplicated().any()}")
assert df_tracks.index.is_unique, "track IDs must be unique after replacement"

73
Generating 73 random unique IDs...
Finished generating unique IDs.

Generated 73 new unique IDs.
Example new ID: TR784960
Check for duplicates after replacement: False


The columns `name_artist`, `name` and `primary_artist` all carry the same information, so we can prune two of them.
We compared them after normalization and found that `primary_artist` and `name_artist` are identical, while `name` differs for some rows.

In [9]:
# Left join: each track row is extended with the columns of the artist whose
# index value in df_artists equals the track's 'id_artist'.
df = df_tracks.join(df_artists, on='id_artist', how='left')

In [10]:
# Helper used to normalise text columns before comparing them.
def normalize_series(series):
    """Return a normalised copy of a text Series for comparison.

    Steps, in order: lowercase; strip accents (NFKD decomposition, then drop
    every non-ASCII character, so 'è' becomes 'e'); remove every character
    that is neither a word character nor whitespace; trim leading/trailing
    whitespace; collapse runs of whitespace into a single space.
    """
    # Lowercase, decompose accented characters, then discard whatever is not ASCII.
    ascii_lower = (
        series.str.lower()
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )

    # [^\w\s] matches anything that is not a word character (\w) or whitespace (\s).
    without_symbols = ascii_lower.str.replace(r'[^\w\s]', '', regex=True)

    # Trim the ends, then squeeze internal whitespace runs to one space.
    return without_symbols.str.strip().str.replace(r'\s+', ' ', regex=True)

In [11]:
# Normalise the three artist-name columns and the featured-artists column in
# the joined frame so that they can be compared as plain strings.
for column in ('name', 'primary_artist', 'name_artist', 'featured_artists'):
    df[column] = normalize_series(df[column])

print(df)

            id_artist    name_artist  \
id                                     
TR934808  ART04205421  rosa chemical   
TR760029  ART04205421  rosa chemical   
TR916821  ART04205421  rosa chemical   
TR480968  ART04205421  rosa chemical   
TR585039  ART04205421  rosa chemical   
...               ...            ...   
TR434449  ART02733420      marracash   
TR826351  ART02733420      marracash   
TR529809  ART02733420      marracash   
TR280904  ART02733420      marracash   
TR552777  ART02733420      marracash   

                                                 full_title  \
id                                                            
TR934808    ​polka 2 :-/ by Rosa Chemical (Ft. Ernia & Guè)   
TR760029         POLKA by Rosa Chemical (Ft. Thelonious B.)   
TR916821  ​britney ;-) by Rosa Chemical (Ft. MamboLosco ...   
TR480968                  CEO by Rosa Chemical (Ft. Taxi B)   
TR585039                LONDRA by Rosa Chemical (Ft. Rkomi)   
...                                   

In [12]:
# Check whether 'primary_artist' and 'name_artist' agree on every row
# (both columns were normalised beforehand).
are_artists_identical = (df['primary_artist'] == df['name_artist']).all()

if are_artists_identical:
    print("Analisi: 'primary_artist' e 'name_artist' sono sempre identici dopo la normalizzazione.")
    print("La colonna 'name_artist' è probabilmente ridondante.")
else:
    print("Analisi: 'primary_artist' e 'name_artist' NON sono sempre identici.")

    # Compute the mismatch mask once and reuse it for the count and the sample.
    artist_mismatch = df['primary_artist'] != df['name_artist']

    # How many rows differ?
    diff_count = artist_mismatch.sum()
    print(f"Ci sono {diff_count} righe in cui differiscono.")

    # Show a few of the differing rows. Each column is listed once; the
    # earlier version repeated both columns in the selection.
    print("\n--- Esempi di differenze tra artisti ---")
    diff_df = df[artist_mismatch]
    print(diff_df[['primary_artist', 'name_artist']].head())

Analisi: 'primary_artist' e 'name_artist' sono sempre identici dopo la normalizzazione.
La colonna 'name_artist' è probabilmente ridondante.


In [13]:
# Rows where the normalised 'name' column differs from the normalised
# 'primary_artist' column. Despite the variable name, the filter compares two
# artist columns; it does not look for tracks titled after their artist.
name_differs = df['name'] != df['primary_artist']
self_titled_tracks = df[name_differs]
print(self_titled_tracks[['name', 'primary_artist']])


                  name primary_artist
id                                   
TR317207   gue pequeno            gue
TR446826   gue pequeno            gue
TR228275   gue pequeno            gue
TR697556   gue pequeno            gue
TR391415   gue pequeno            gue
...                ...            ...
TR794750  samuel heron   samuel costa
TR102539  samuel heron   samuel costa
TR178809   joey funboy       joey ita
TR589443   joey funboy       joey ita
TR735987   joey funboy       joey ita

[870 rows x 2 columns]


In [14]:
# Rows whose normalised featured-artists value is exactly 'gue'.
df[df['featured_artists'].eq('gue')]

Unnamed: 0_level_0,id_artist,name_artist,full_title,title,featured_artists,primary_artist,language,album,stats_pageviews,swear_IT,swear_EN,swear_IT_words,swear_EN_words,year,month,day,n_sentences,n_tokens,tokens_per_sent,char_per_tok,lexical_density,avg_token_per_clause,bpm,centroid,rolloff,flux,rms,zcr,flatness,spectral_complexity,pitch,loudness,album_name,album_release_date,album_type,disc_number,track_number,duration_ms,explicit,popularity,album_image,id_album,lyrics,modified_popularity,name,gender,birth_date,birth_place,nationality,description,active_start,active_end,province,region,country,latitude,longitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
TR635953,ART39344115,lazza,Gucci Ski Mask by Lazza (Ft. Guè),Gucci Ski Mask,gue,lazza,pl,Re Mida: Aurum,88716.0,6,3,"[cazzo, culo, figa, merda, sedere]","[bitch, pussy, threesome]",2019.0,1.0,10.0,91.0,825.0,9.065934,3.865385,0.615385,8.009709,123.92,0.1679,2174.9546,1.3637,0.2652,0.0814,0.8777,29.1066,2287.3564,30.3436,Re Mida,2019-03-01,album,1,14,205640.0,True,55.0,https://i.scdn.co/image/ab67616d0000b273e78ad8...,ALB915773,"Mob Yah, ahah, yah, ahah, ehi, ehi, uh Paga '...",False,lazza,M,1994-08-22,Scampia,Italia,"rapper, musicista e produttore discografico it...",2012-01-01,,Napoli,Campania,Italia,40.899988,14.241052
TR307970,ART39344115,lazza,ESTRANEO by Lazza (Ft. Guè),ESTRANEO,gue,lazza,it,LOCURA,37837.0,0,1,[],[sex],2024.0,9.0,20.0,66.0,541.0,8.196970,3.896050,0.501040,7.213333,107.98,0.1538,1725.3698,1.2919,0.2694,0.0693,0.8679,31.1101,2166.5421,30.1096,LOCURA,2024-11-26,album,1,12,202222.0,True,54.0,https://i.scdn.co/image/ab67616d0000b27390d7f0...,ALB548890,Ero solo un kid come tanti Conosco la street c...,False,lazza,M,1994-08-22,Scampia,Italia,"rapper, musicista e produttore discografico it...",2012-01-01,,Napoli,Campania,Italia,40.899988,14.241052
TR754884,ART24123617,babaman,Guerrieri Del Microfono by Babaman (Ft. Guè),Guerrieri Del Microfono,gue,babaman,it,Dinamite,,0,0,[],[],2008.0,4.0,15.0,69.0,448.0,6.492754,4.000000,0.487923,6.492754,82.97,0.1397,944.4244,1.3939,0.2316,0.0487,0.9574,23.0737,2520.3973,24.2116,Dinamite (2022 Remaster),2008-04-15,album,1,7,236466.0,False,17.0,https://i.scdn.co/image/ab67616d0000b273deb984...,ALB888033,Senti il boss sulla traccia Ti sbuffo il fumo ...,False,babaman,M,1975-08-08,Rho,Italia,cantante italiano,NaT,,Milano,Lombardia,Italia,45.528878,9.041560
TR637567,ART63613967,massimo pericolo,Di Persona by Massimo Pericolo (Ft. Guè),Di Persona,gue,massimo pericolo,it,Le cose cambiano (Deluxe),11324.0,7,3,"[cazzo, fesso, merda, troia]","[bitch, sex, shit]",2023.0,12.0,1.0,59.0,530.0,8.983051,3.836207,0.512931,9.137931,101.96,0.1327,1563.7660,1.4490,0.2364,0.0598,0.8614,24.6695,2868.5690,26.6769,Le cose cambiano,2023-12-01,album,1,12,153048.0,False,39.0,https://i.scdn.co/image/ab67616d0000b2735f6fb9...,ALB624299,"Greg Willen, non dormire Se c'è un problema, ...",False,massimo pericolo,M,1993-11-30,Gallarate,Italia,criminale e rapper italiano,NaT,,Varese,Lombardia,Italia,45.659895,8.793201
TR174934,ART63613967,massimo pericolo,Sarabamba by Massimo Pericolo (Ft. Guè),Sarabamba,gue,massimo pericolo,it,,7826.0,8,6,"[cazzo, culo, figa, merda, scopare, water]","[bitch, pussy]",2022.0,7.0,22.0,64.0,707.0,11.046875,3.831633,0.506803,7.288660,134.00,0.1444,1063.0009,1.3122,0.3934,0.0458,0.8974,41.3902,2400.1657,48.2537,Sarabamba (feat. Guè),2022-07-22,single,1,1,241368.0,True,30.0,https://i.scdn.co/image/ab67616d0000b27340c685...,ALB809652,"Crookers on the beat Yo, bitch Non c'è bisogn...",False,massimo pericolo,M,1993-11-30,Gallarate,Italia,criminale e rapper italiano,NaT,,Varese,Lombardia,Italia,45.659895,8.793201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TR299069,ART02733420,marracash,∞ LOVE by Marracash (Ft. Guè),∞ LOVE,gue,marracash,pl,"NOI, LORO, GLI ALTRI (Deluxe)",211204.0,0,0,[],[],2021.0,11.0,19.0,79.0,686.0,8.683544,3.856899,0.483816,9.270270,124.97,0.1202,1049.9968,1.2484,0.3282,0.0466,0.8947,28.8465,2705.2946,39.4323,"NOI, LORO, GLI ALTRI",2021-11-19,album,1,3,218367.0,True,66.0,https://i.scdn.co/image/ab67616d0000b2733c29e9...,ALB342728,"Gioielli e fama, Vuitton e Prada Non contan na...",False,marracash,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,,Enna,Sicilia,Italia,37.747452,14.397271
TR546598,ART02733420,marracash,S.e.n.i.c.a.r. by Marracash (Ft. Guè),S.e.n.i.c.a.r.,gue,marracash,en,King Del Rap,78196.0,12,1,"[cazzo, coglioni, figa, stronzo, troia]",[bitch],2011.0,10.0,31.0,71.0,553.0,7.788732,4.259557,0.525151,8.014493,142.08,0.1456,1540.4489,1.2291,0.3213,0.0644,0.7648,43.3535,1783.3378,37.3538,King Del Rap,2011-01-01,album,1,8,205720.0,True,62.0,https://i.scdn.co/image/ab67616d0000b27367b145...,ALB288289,"Ahah, Marracash, D&G, Dogo Gang Scopiamo le vo...",False,marracash,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,,Enna,Sicilia,Italia,37.747452,14.397271
TR528636,ART02733420,marracash,Di Nascosto by Marracash (Ft. Guè),Di Nascosto,gue,marracash,pl,Status (Vendetta Edition),28321.0,3,0,"[coglione, figa, ricchione]",[],2015.0,1.0,20.0,97.0,782.0,8.061856,4.183976,0.525223,8.593407,103.96,0.1630,2057.5471,1.4823,0.2585,0.0749,0.8956,31.2779,2442.2098,28.2993,Status,2015-02-10,album,1,16,236840.0,True,37.0,https://i.scdn.co/image/ab67616d0000b273659696...,ALB100713,"Baby, come riempi bene i jeans Streptococco pe...",False,marracash,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,,Enna,Sicilia,Italia,37.747452,14.397271
TR577582,ART02733420,marracash,Casbah Flow by Marracash (Ft. Guè),Casbah Flow,gue,marracash,pl,Double Dragon Mixtape,5746.0,1,0,[culo],[],2004.0,4.0,30.0,76.0,649.0,8.539474,4.207705,0.534338,6.904255,88.17,0.1157,1284.3598,1.2729,0.2388,0.0531,0.9011,23.5174,2412.8074,26.4889,Hagakure,2008-08-18,compilation,1,4,183306.0,False,14.0,https://i.scdn.co/image/ab67616d0000b273dbfe6c...,ALB396952,"(Ah, la zona è Barona) Se se (Marracash), Luck...",False,marracash,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,,Enna,Sicilia,Italia,37.747452,14.397271


In [15]:
# Drop the two artist-name columns from the joined frame, in place;
# 'name_artist' is kept.
df.drop(['name', 'primary_artist'], axis='columns', inplace=True)

Active_end column is completely empty so we can drop it

In [16]:
# Remove the 'active_end' column from the joined frame, in place.
df.drop(['active_end'], axis='columns', inplace=True)

In [46]:
# Render the joined frame as the cell's output.
# NOTE(review): this cell's recorded execution count (46) is out of sequence
# with the cells around it, so the saved output may come from a later kernel
# state. Re-run with Restart & Run All, and consider df.head() so the full
# frame is not rendered.
df

Unnamed: 0_level_0,id_artist,name_artist,full_title,title,featured_artists,language,album,stats_pageviews,swear_IT,swear_EN,swear_IT_words,swear_EN_words,year,month,day,n_sentences,n_tokens,tokens_per_sent,char_per_tok,lexical_density,avg_token_per_clause,bpm,centroid,rolloff,flux,rms,zcr,flatness,spectral_complexity,pitch,loudness,album_name,album_release_date,album_type,disc_number,track_number,duration_ms,explicit,popularity,album_image,id_album,lyrics,modified_popularity,gender,birth_date,birth_place,nationality,description,active_start,province,region,country,latitude,longitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
TR934808,ART04205421,rosa chemical,​polka 2 :-/ by Rosa Chemical (Ft. Ernia & Guè),​polka 2 :-/,ernia gue,pl,FOREVER AND EVER,196033.0,13,6,"[cazzo, cesso, coglioni, figa, merda, palle, p...","[bitch, fuck, porno, pussy]",2021.0,4.0,2.0,102.0,911.0,8.931373,4.170455,0.575284,8.133929,135.32,0.1858,2895.7767,1.4499,0.1786,0.1046,0.8202,25.7148,2311.1779,17.8675,FOREVER AND EVER,2021-04-09,album,1,3,207761.0,True,46.0,https://i.scdn.co/image/ab67616d0000b2736d5e14...,ALB115557,"Oplà, ah Bdope, chiama due b— Mi candiderei c...",False,M,1998-01-30,Grugliasco,Italia,rapper e cantautore italiano (1998-),2015-01-01,Torino,Piemonte,Italia,45.068046,7.577620
TR760029,ART04205421,rosa chemical,POLKA by Rosa Chemical (Ft. Thelonious B.),POLKA,thelonious b,en,FOREVER AND EVER,164450.0,9,12,"[cazzo, culo, frocio, puttana, sega, troia]","[escort, negro, sex, sexy, shit]",2020.0,3.0,6.0,56.0,675.0,12.053571,4.280851,0.648936,12.500000,129.37,0.2071,3378.4605,1.3533,0.2020,0.1175,0.6739,29.1089,1892.1924,21.4595,FOREVER AND EVER,2021-04-09,album,1,3,207761.0,True,46.0,https://i.scdn.co/image/ab67616d0000b2736d5e14...,ALB115557,"Greg Willen, non dormire (Brr-poh) T-T-Troppi...",False,M,1998-01-30,Grugliasco,Italia,rapper e cantautore italiano (1998-),2015-01-01,Torino,Piemonte,Italia,45.068046,7.577620
TR916821,ART04205421,rosa chemical,​britney ;-) by Rosa Chemical (Ft. MamboLosco ...,​britney ;-),mambolosco radical,en,FOREVER AND EVER,58313.0,16,12,"[bastardo, cazzo, culo, merda, troia]","[bastardo, bitch, bitches, cock, fuck]",2021.0,2.0,19.0,88.0,758.0,8.613636,4.075251,0.556856,8.422222,133.68,0.1833,2037.4847,1.3822,0.2552,0.0800,0.7842,26.9762,2484.3503,29.4590,FOREVER AND EVER,2021-04-09,album,1,1,193544.0,True,39.0,https://i.scdn.co/image/ab67616d0000b2736d5e14...,ALB115557,"Mothz Yeah, yeah, yeah-yeah Bdope, chiama due ...",False,M,1998-01-30,Grugliasco,Italia,rapper e cantautore italiano (1998-),2015-01-01,Torino,Piemonte,Italia,45.068046,7.577620
TR480968,ART04205421,rosa chemical,CEO by Rosa Chemical (Ft. Taxi B),CEO,taxi b,it,OKAY OKAY !! - EP,39890.0,8,3,"[cazzo, culo, fottere, merda, pompino, sega, t...","[fuck, porno, shit]",2019.0,3.0,8.0,37.0,382.0,10.324324,4.023881,0.534328,6.701754,162.22,0.1048,1156.3781,1.5499,0.1971,0.0436,0.8764,14.2956,2984.6109,20.5067,OKAY OKAY 2,2025-05-16,single,1,2,169000.0,True,47.0,https://i.scdn.co/image/ab67616d0000b27367c03d...,ALB730959,Designer sui vestiti penso di essere un outlet...,False,M,1998-01-30,Grugliasco,Italia,rapper e cantautore italiano (1998-),2015-01-01,Torino,Piemonte,Italia,45.068046,7.577620
TR585039,ART04205421,rosa chemical,LONDRA by Rosa Chemical (Ft. Rkomi),LONDRA,rkomi,en,FOREVER AND EVER,35432.0,1,0,[cazzo],[],2020.0,5.0,29.0,48.0,429.0,8.937500,3.922857,0.491429,8.411765,105.87,0.1421,1693.4542,1.0886,0.2369,0.0695,0.8571,36.6951,1572.0499,25.3407,FOREVER,2020-05-28,album,1,8,194779.0,True,41.0,https://i.scdn.co/image/ab67616d0000b273fcdb60...,ALB436151,"Bdope (Yeah) Vuole solo me, non fare la gelos...",False,M,1998-01-30,Grugliasco,Italia,rapper e cantautore italiano (1998-),2015-01-01,Torino,Piemonte,Italia,45.068046,7.577620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TR434449,ART02733420,marracash,Badabum Cha Cha - Gabri Ponte & Paki Rmx (Radi...,Badabum Cha Cha - Gabri Ponte & Paki Rmx (Radi...,,it,Badabum Cha Cha (The Remixes),,0,0,[],[],,,,75.0,624.0,8.320000,3.907251,0.521079,6.240000,123.78,0.1092,1140.5174,1.4162,0.2364,0.0497,0.9118,20.9760,3067.5304,26.3440,Che Ne Sanno I 2000 (feat. Danti),2016-07-01,single,1,1,234374.0,False,57.0,https://i.scdn.co/image/ab67616d0000b2733a7b5b...,ALB248347,"Qui non va, ma questo badabum cha cha Tira for...",False,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,Enna,Sicilia,Italia,37.747452,14.397271
TR826351,ART02733420,marracash,Senza Un Posto Nel Mondo - Alternative Version...,Senza Un Posto Nel Mondo - Alternative Version,tiziano ferro,it,Senza Un Posto Nel Mondo – Single,,1,0,[fortuna],[],2015.0,11.0,9.0,76.0,602.0,7.921053,4.098004,0.500907,5.016667,172.27,0.1570,1588.1965,1.1842,0.2960,0.0707,0.9159,41.3077,2537.0837,33.5024,Status,2015-02-10,album,1,20,248330.0,True,24.0,https://i.scdn.co/image/ab67616d0000b273659696...,ALB100713,Odio questa città (città) Sappiamo che la vera...,False,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,Enna,Sicilia,Italia,37.747452,14.397271
TR529809,ART02733420,marracash,Badabum Cha Cha - Gabri Ponte & Paki Rmx (Exte...,Badabum Cha Cha - Gabri Ponte & Paki Rmx (Exte...,,it,Badabum Cha Cha (The Remixes),,0,0,[],[],,,,74.0,570.0,7.702703,4.073741,0.516187,6.867470,123.82,0.1200,1130.7621,1.4001,0.2298,0.0489,0.8892,21.4218,3071.3804,25.4605,Che Ne Sanno I 2000 (feat. Danti),2016-07-01,single,1,1,234374.0,False,57.0,https://i.scdn.co/image/ab67616d0000b2733a7b5b...,ALB248347,"Qui non va, ma questo badabum cha cha Tira for...",False,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,Enna,Sicilia,Italia,37.747452,14.397271
TR280904,ART02733420,marracash,Tempo by Marracash,Tempo,,it,,,0,1,[],[porno],2005.0,1.0,1.0,32.0,304.0,9.500000,3.824373,0.465950,7.238095,86.99,0.1503,1495.6718,1.2661,0.2901,0.0638,0.9404,26.1900,2754.6887,32.5964,Status,2015-02-10,album,1,7,302173.0,True,37.0,https://i.scdn.co/image/ab67616d0000b273659696...,ALB100713,É crudo il suono e questa vibra che ti do Crud...,False,M,1979-05-22,Nicosia,Italia,rapper e produttore discografico italiano (1979-),1999-01-01,Enna,Sicilia,Italia,37.747452,14.397271


### full_title & title attributes

The two attributes should, in principle, correspond, since both identify the track's name. `full_title` additionally contains the performer, appended as "by (artist_name)", and any featured artists, appended as "(Ft. (featured_artists))".

This explains why `full_title` has more unique values than `title`. Looking at the actual track name contained in the first portion of `full_title`, however, we notice that the two columns correspond.

We perform a regex search to make sure this holds across all records, which allows us to discard one of the two columns.

In [17]:
df_title = df_tracks[['full_title', 'title']].copy()

# Split once, from the right, on the separator ' by' and keep the part before
# it (the track name). The separator has no trailing space in the code.
df_title['cleaned_attribute'] = df_title['full_title'].str.rsplit(' by', n=1).str[0]


def _normalize_title_text(text):
    """Apply the same quote and whitespace normalisation to a text Series."""
    # Typographic quotes -> straight ASCII quotes (U+2019, U+2018, U+201C, U+201D).
    for fancy, plain in (('’', "'"), ('‘', "'"), ('“', '"'), ('”', '"')):
        text = text.str.replace(fancy, plain, regex=False)
    # Trim the ends, then collapse whitespace runs into a single space.
    return text.str.strip().str.replace(r'\s+', ' ', regex=True)


# Both sides of the comparison go through the identical normalisation.
for column in ('cleaned_attribute', 'title'):
    df_title[column] = _normalize_title_text(df_title[column])

# --- FINAL COMPARISON ---
are_columns_equal_final = (df_title['cleaned_attribute'] == df_title['title']).all()

print(f"Are the columns equal after final, comprehensive normalization? {are_columns_equal_final}")

# Rows where the extracted name and 'title' still disagree.
final_mismatched_rows = df_title[df_title['cleaned_attribute'] != df_title['title']]
print(f"Number of rows still unequal: {len(final_mismatched_rows)}")

# Print a sample of the remaining mismatches, if any, for manual inspection.
if len(final_mismatched_rows) > 0:
    print("\nSample of remaining mismatched rows:")
    # Original full title next to the two normalised versions.
    rows_to_display = final_mismatched_rows.head(10)
    print(rows_to_display[['full_title', 'title', 'cleaned_attribute']])

    # Quoted values of the first mismatch make invisible differences easier to spot.
    print("\nCleaned Series for First Mismatched Row:")
    first_id = rows_to_display.index[0]
    print(f"Title (cleaned): '{df_title['title'].loc[first_id]}'")
    print(f"Attribute (cleaned): '{df_title['cleaned_attribute'].loc[first_id]}'")

Are the columns equal after final, comprehensive normalization? True
Number of rows still unequal: 0


Check between name_artist and the artist extracted from full_title

In [18]:
# Private copy of the three columns needed to cross-check artist information.
df_artist_and_feat = df_tracks[['full_title', 'name_artist', 'featured_artists']].copy()

# --- Step 1: Extract 'artist_and_feat' (Artist + Features) ---
# Everything after the LAST ' by' in full_title; full_title itself is removed
# from the working frame as soon as it has been split.
split_series_1 = df_artist_and_feat.pop('full_title').str.rsplit(' by', n=1)
df_artist_and_feat['artist_and_feat'] = split_series_1.str.get(1)

# --- Step 2: Separate 'cleaned_artist' from 'cleaned_feat' ---
# Text before the last '(Ft.' is the artist part, text after it the features
# (NaN when there is no '(Ft.' marker).
split_series_2 = df_artist_and_feat['artist_and_feat'].str.rsplit('(Ft.', n=1)

# Within the artist part, whatever follows the last '&' is an "ampersand feature".
split_series_3 = split_series_2.str.get(0).str.rsplit('&', n=1)
ampersand_feat = split_series_3.str.get(1)

df_artist_and_feat['cleaned_artist'] = split_series_3.str.get(0)
# Drop the ')' that closes the '(Ft. ...)' group.
df_artist_and_feat['cleaned_feat'] = split_series_2.str.get(1).str.replace(r'\)$', '', regex=True)

# --- Move ampersand features to 'cleaned_feat' if '(Ft. ...)' was empty ---
# Rows whose '(Ft. ...)' part is missing or blank take the (stripped) ampersand
# feature instead; rows without either end up with an empty string.
feat_from_parentheses = df_artist_and_feat['cleaned_feat']
mask_empty_feat = feat_from_parentheses.isna() | feat_from_parentheses.str.strip().eq('')
df_artist_and_feat['cleaned_feat'] = feat_from_parentheses.mask(
    mask_empty_feat,
    ampersand_feat.fillna('').str.strip()
)

# Keep only the text before the FIRST comma (e.g. 'Rosa Chemical' from
# 'Rosa Chemical, Mehdi (ITA), ...') and trim the surrounding whitespace.
df_artist_and_feat['cleaned_artist'] = (
    df_artist_and_feat['cleaned_artist'].str.split(',', n=1).str.get(0).str.strip()
)


# --- Normalization of both artist columns before comparing them ---
# The same pipeline is applied to the extracted artist and to the reference
# 'name_artist':
#   1. smart single quotes (’ and ‘) -> straight apostrophe (')
#   2. smart double quotes (“ and ”) -> straight double quote (")
#   3. every run of whitespace -> one space, then leading/trailing spaces removed
for artist_column in ('cleaned_artist', 'name_artist'):
    normalized_artist = df_artist_and_feat[artist_column]
    for smart_quote, straight_quote in (('’', "'"), ('‘', "'"), ('“', '"'), ('”', '"')):
        normalized_artist = normalized_artist.str.replace(smart_quote, straight_quote, regex=False)
    df_artist_and_feat[artist_column] = (
        normalized_artist.str.replace(r'\s+', ' ', regex=True).str.strip()
    )


# --- Final Comparison ---
# Element-wise comparison of the extracted artist against the reference column.
extracted_artist = df_artist_and_feat['cleaned_artist']
reference_artist = df_artist_and_feat['name_artist']
are_names_equal_final = (extracted_artist == reference_artist).all()

print(f"Are the artist names? {are_names_equal_final}")

# Rows where the two normalized values still differ
final_mismatched_rows = df_artist_and_feat[extracted_artist != reference_artist]
print(f"Number of rows still unequal: {len(final_mismatched_rows)}")
# Show a small sample of the leftovers, plus the first one quoted so that
# stray leading/trailing characters are visible.
if not final_mismatched_rows.empty:
    print("\nSample of remaining mismatched rows:")
    rows_to_display = final_mismatched_rows.head(10)
    print(rows_to_display[['name_artist', 'cleaned_artist']])

    print("\nCleaned Series for First Mismatched Row (After Aggressive Strip):")
    first_id = rows_to_display.index[0]
    print(f"name_artist (normalized): '{reference_artist.loc[first_id]}'")
    print(f"cleaned_artist (normalized): '{extracted_artist.loc[first_id]}'")

# The raw "artist + features" text is not needed past this point.
del df_artist_and_feat['artist_and_feat']


print(df_artist_and_feat[['name_artist', 'cleaned_artist']].head(10))

Are the artist names? True
Number of rows still unequal: 0
            name_artist cleaned_artist
id                                    
TR934808  Rosa Chemical  Rosa Chemical
TR760029  Rosa Chemical  Rosa Chemical
TR916821  Rosa Chemical  Rosa Chemical
TR480968  Rosa Chemical  Rosa Chemical
TR585039  Rosa Chemical  Rosa Chemical
TR550335  Rosa Chemical  Rosa Chemical
TR170793  Rosa Chemical  Rosa Chemical
TR627195  Rosa Chemical  Rosa Chemical
TR628871  Rosa Chemical  Rosa Chemical
TR700756  Rosa Chemical  Rosa Chemical


Check between cleaned_feat (the featured artists extracted from full_title) and featured_artists

In [19]:
def _canonical_artist_list(names):
    """Return `names` rewritten as canonical, order-independent artist lists.

    Each value is read as a list of artist names separated by ',' or '&'.
    Missing values become '', smart quotes are replaced by straight ones, runs
    of whitespace are collapsed to a single space, and the individual names are
    stripped, sorted alphabetically and re-joined with ', '. Two cells listing
    the same artists in a different order or spacing therefore compare equal.

    Parameters
    ----------
    names : pandas.Series
        Series of strings (missing values allowed).

    Returns
    -------
    pandas.Series
        Series of strings with the same index as `names`.
    """
    cleaned = (
        names.fillna('')
        # '&' is treated as just another separator between artists
        .str.replace('&', ',', regex=False)
        .str.replace(r'\s+', ' ', regex=True)
        .str.replace('’', "'", regex=False)
        .str.replace('‘', "'", regex=False)
        .str.replace('“', '"', regex=False)
        .str.replace('”', '"', regex=False)
    )
    # Empty fragments (e.g. from ', ,' or an empty cell) are discarded.
    return cleaned.str.split(',').apply(
        lambda parts: ', '.join(sorted(part.strip() for part in parts if part.strip()))
    )


# Both sides go through exactly the same normalization so that the comparison
# below only reports genuine differences in the artists listed.
df_artist_and_feat['featured_artists'] = _canonical_artist_list(df_artist_and_feat['featured_artists'])
df_artist_and_feat['cleaned_feat'] = _canonical_artist_list(df_artist_and_feat['cleaned_feat'])


# Identify and print the remaining mismatched rows using the normalized series
final_mismatched_rows = df_artist_and_feat[df_artist_and_feat['cleaned_feat'] != df_artist_and_feat['featured_artists']]
print(f"Number of rows still unequal: {len(final_mismatched_rows)}")

# Print the remaining mismatched rows for inspection
if len(final_mismatched_rows) > 0:
    print("\nSample of remaining mismatched rows:")
    rows_to_display = final_mismatched_rows.head(10)
    print(rows_to_display[['featured_artists', 'cleaned_feat']])

    # Quote the first mismatch so that invisible differences stand out
    print("\nCleaned Series for First Mismatched Row (After Aggressive Strip):")
    first_id = rows_to_display.index[0]
    print(f"featured_artists (normalized): '{df_artist_and_feat['featured_artists'].loc[first_id]}'")
    print(f"cleaned_feat (normalized): '{df_artist_and_feat['cleaned_feat'].loc[first_id]}'")

Number of rows still unequal: 413

Sample of remaining mismatched rows:
         featured_artists   cleaned_feat
id                                      
TR266736                           Mothz
TR281032                       Manu Chao
TR811171                     Mara Sattei
TR822203                     Mara Sattei
TR397308                   Tiziano Ferro
TR212338                     Mara Sattei
TR372774                     Mara Sattei
TR993112                     Mara Sattei
TR444969                     Mara Sattei
TR479694                     Mara Sattei

Cleaned Series for First Mismatched Row (After Aggressive Strip):
featured_artists (normalized): ''
cleaned_feat (normalized): 'Mothz'


In [20]:
# Replace featured_artists in the main frame with the feature list parsed from
# full_title, passed through normalize_series (defined outside this section).
df['featured_artists'] = normalize_series(df_artist_and_feat['cleaned_feat'])

The full_title column is now redundant: the featured artists have been extracted from it and the title column is correct.

In [21]:
# Drop full_title in place; note that re-running this cell raises KeyError
# because the column is already gone.
df.drop(columns=['full_title'], inplace=True)

## Language attribute
The most frequent languages reported for the main lyrics are Italian, English and Polish. We checked most of these labels and they do not seem to reflect the actual main language of the lyrics.

So we decided to run a SOTA language-identification model to detect the language of each track from the tokens of the lyrics column.

In [22]:
import fasttext
import regex as re  

# Independent copy holding only the columns used by the language check, so the
# helper columns added below do not end up in df_tracks.
df_language = df_tracks[['language', 'lyrics', 'n_sentences']].copy()

def normalize_text(text):
    """Return `text` as a cleaned string for language detection.

    Missing values (None / NaN) give "". Otherwise the value is converted to
    str, smart quotes are replaced by their straight equivalents, and every
    character that is not a word character, whitespace, '.', ',', "'" or '"'
    is removed (emojis and other symbols are dropped this way).
    """
    if pd.isna(text):
        return ""

    straight_single = re.sub(r'[‘’]', "'", str(text))
    straight_quotes = re.sub(r'[“”]', '"', straight_single)
    # Keep only letters/digits/underscore, whitespace and basic punctuation.
    return re.sub(r'[^\w\s\.\,\'\"]', '', straight_quotes, flags=re.UNICODE)

# Clean every lyric with normalize_text; missing lyrics become empty strings.
df_language['lyrics_normalized'] = df_language['lyrics'].apply(normalize_text)

# Quick look at the first cleaned lyrics
print(df_language['lyrics_normalized'].head())

ModuleNotFoundError: No module named 'fasttext'

In [None]:
# File name of the fastText model to load; a relative path, so the file is
# looked up from the notebook's working directory.
MODEL_PATH = 'lid.176.bin'

# Load the model once; it is reused for every prediction below.
model = fasttext.load_model(MODEL_PATH)

def detect_language_safe(text, model):
    """
    Safely detect the language of `text` with a FastText model.

    Parameters
    ----------
    text : object
        The text to classify. None / NaN are accepted; any other value is
        converted with str().
    model : object
        Object exposing `predict(text, k=1)` that returns a pair
        (labels, probabilities), as FastText models do.

    Returns
    -------
    tuple
        (language_code, confidence_score), e.g. ('it', 0.99). Returns
        (None, 0.0) when the text is missing, shorter than 20 characters
        after cleaning, or when the prediction raises.
    """
    # 1. Missing values cannot be classified
    if pd.isna(text):
        return None, 0.0

    # 2. FastText predicts on a single line: trim the text and replace line
    #    breaks with spaces before handing it to the model.
    text_str = str(text).strip().replace('\n', ' ').replace('\r', ' ')

    # 3. Very short texts are skipped instead of producing unreliable guesses
    if len(text_str) < 20:
        return None, 0.0

    try:
        # k=1 asks for the single best prediction
        predictions = model.predict(text_str, k=1)

        # predictions[0] holds the labels, e.g. ('__label__it',)
        # predictions[1] holds the matching probabilities, e.g. [0.99]
        # The whole '__label__' prefix is removed so that only the language
        # code remains ('it', not '__it').
        label = predictions[0][0].replace('__label__', '')
        confidence = predictions[1][0]

        return label, confidence
    except Exception as e:
        # Best effort: report the offending input and keep processing the
        # remaining records instead of aborting the whole run.
        print(f"FastText Prediction failed for input starting: '{text_str[:50]}...'")
        print(f"Error details: {e}")
        return None, 0.0

# One (language_code, confidence) tuple per track
results = df_language['lyrics_normalized'].map(
    lambda lyrics: detect_language_safe(lyrics, model)
)

# Split the tuples into two columns: position 0 is the language code,
# position 1 the confidence score.
df_language['most_probable_language'] = results.map(lambda prediction: prediction[0])
df_language['confidence'] = results.map(lambda prediction: prediction[1])

# Preview the original label next to the detected one
print(df_language[['language', 'most_probable_language', 'confidence', 'n_sentences']].head())

In [None]:
import pandas as pd
# Assuming df_language is your DataFrame with the 'confidence' column

# Import the necessary plotting library for display
import matplotlib.pyplot as plt 

# Predicted languages are categorical labels, so they are counted and drawn as
# one bar per language. A numeric histogram with a fixed number of bins does
# not line up with the categories and can merge several languages into one bar.
language_counts = df_language['most_probable_language'].value_counts()

fig, ax = plt.subplots()
language_counts.plot.bar(ax=ax, edgecolor='black')
ax.set_title('Distribution of FastText predicted languages')
ax.set_xlabel('Predicted Language')
ax.set_ylabel('Number of Records (Frequency)')
plt.show()


print(language_counts)

## album, album_name, album id

While the column "album" seems more reasonable and coherent, it contains multiple null values.
Some albums in "album_name" appear truncated and incomplete.

We decided to keep the normalization for better readability and to have normalized occurrences.

To build a corrected column giving the album of every track, we made three choices:

    #Choice 1 (for null 'album'): Use 'album_name_norm',
    
    #Choice 2 (for Mismatch): Use 'album_norm',
    
    #Choice 3 (for Match): maintain 'album_norm' (the same as 'album_name_norm')

In [None]:
import numpy as np
import random

print("--- Creazione di 'correct_album' ---")

# Normalized versions of the two raw album columns
df['album_norm'] = normalize_series(df['album'])
df['album_name_norm'] = normalize_series(df['album_name'])

# Row-level situations, in priority order
album_is_missing = df['album'].isna()
names_differ = df['album_norm'].ne(df['album_name_norm'])
names_match = df['album_norm'].eq(df['album_name_norm'])

conditions = [
    album_is_missing,  # Priority 1: 'album' is null...
    names_differ,      # Priority 2: the two normalized names disagree...
    names_match        # Priority 3: the two normalized names agree...
]

# Value chosen for each situation above
choices = [
    df['album_name_norm'],  # ...fall back to 'album_name_norm'
    df['album_norm'],       # ...trust 'album_norm'
    df['album_norm']        # ...keep 'album_norm'
]

# The first matching condition wins; rows matching none of them get NaN.
df['correct_album'] = np.select(conditions, choices, default=np.nan)
print("Colonna 'correct_album' creata con successo.")


print("\n--- [FASE 3]: Assegnazione di 'id_album_final' (Algoritmo 1-a-1) ---")

# --- 3.1: Preparation ---

# Albums sorted by number of tracks: larger albums get first pick of their IDs.
album_order = df['correct_album'].value_counts().index

# For every (album, original id) pair that actually occurs, count its tracks.
# observed=True keeps only the pairs present in the data should either key be
# categorical, instead of every possible combination.
df_candidates = df.dropna(subset=['correct_album', 'id_album'])
track_counts = (
    df_candidates
    .groupby(['correct_album', 'id_album'], observed=True)
    .size()
    .to_frame('count')
)

# Within each album, most frequent original ID first.
track_counts = track_counts.sort_values(['correct_album', 'count'], ascending=[True, False])

# {'Album': ['most_frequent_id', 'second_most_frequent_id', ...]}
all_id_candidates = track_counts.reset_index().groupby('correct_album')['id_album'].apply(list).to_dict()

# --- 3.2: Assignment loop ---

used_ids = set()  # IDs already given to an album
final_album_to_id_map = {}  # final one-to-one album -> ID mapping

# A freshly generated ID must not coincide with any ID already present in the
# data, otherwise an album processed later could lose its own original ID.
existing_ids = set(df['id_album'].dropna())

# Dedicated, seeded generator so that re-running the cell yields the same IDs.
ALBUM_ID_SEED = 42
id_rng = random.Random(ALBUM_ID_SEED)

def generate_new_id():
    """Return a new 'ALB' + 6-digit ID that is neither assigned nor pre-existing."""
    new_id = f"ALB{id_rng.randint(100000, 999999)}"
    while new_id in used_ids or new_id in existing_ids:
        new_id = f"ALB{id_rng.randint(100000, 999999)}"
    return new_id

print(f"Inizio processamento di {len(album_order)} album per assegnazione ID...")

# Walk the albums in priority order
for album_name in album_order:

    candidate_ids = all_id_candidates.get(album_name, [])

    # First candidate that has not been taken by a previous album, if any
    assigned_id = next(
        (potential_id for potential_id in candidate_ids if potential_id not in used_ids),
        None
    )

    # No free candidate (or no candidate at all): mint a brand-new ID
    if assigned_id is None:
        assigned_id = generate_new_id()

    used_ids.add(assigned_id)  # reserve the ID
    final_album_to_id_map[album_name] = assigned_id

print("Processamento ID terminato. Mappa 1-a-1 creata.")

# --- 3.3: Final application ---

# Every track receives the ID assigned to its album name.
df['id_album_final'] = df['correct_album'].map(final_album_to_id_map)

print("\n--- VERIFICA FINALE ---")

# One-to-one check: distinct IDs per album and distinct albums per ID
check_ids_per_album = df.groupby('correct_album')['id_album_final'].nunique()
check_albums_per_id = df.groupby('id_album_final')['correct_album'].nunique()

albums_with_several_ids = check_ids_per_album.gt(1).sum()
ids_with_several_albums = check_albums_per_id.gt(1).sum()
print(f"Album con più di 1 ID: {albums_with_several_ids}")
print(f"ID con più di 1 Album: {ids_with_several_albums}")

# Random sample comparing the original and the final album ID
print("\n--- Esempio di 10 righe pulite: ---")
print(df[['correct_album', 'id_album', 'id_album_final']].sample(10))

--- [FASE 1]: Definizione della Funzione di Normalizzazione ---
--- [FASE 2]: Creazione di 'correct_album' ---
Colonna 'correct_album' creata con successo.

--- [FASE 3]: Assegnazione di 'id_album_final' (Algoritmo 1-a-1) ---
Inizio processamento di 1884 album per assegnazione ID...
Processamento ID terminato. Mappa 1-a-1 creata.

--- [FASE 4]: VERIFICA FINALE ---
Album con più di 1 ID: 0
ID con più di 1 Album: 0

--- Esempio di 10 righe pulite: ---
                    correct_album   id_album id_album_final
id                                                         
TR692137                      bv3  ALB898686      ALB510321
TR490800           penna capitale  ALB671458      ALB671458
TR923135                  tommaso  ALB681595      ALB681595
TR142935                 memories  ALB542686      ALB174840
TR580500                  paprika  ALB635728      ALB141827
TR588115        pianeta di miller  ALB927488      ALB927488
TR460635   di vizi di forma virtu  ALB291656      ALB291656
TR3996

In [25]:
# Promote the reconciled values to the canonical column names...
df['album'] = df['correct_album']
df['id_album'] = df['id_album_final']

# ...then remove the raw and intermediate album columns in a single pass.
df.drop(
    columns=['album_name', 'album_norm', 'album_name_norm', 'correct_album', 'id_album_final'],
    inplace=True
)

In [26]:
# Summary of columns, non-null counts and dtypes after the album clean-up
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11166 entries, TR934808 to TR552777
Data columns (total 52 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id_artist             11166 non-null  object        
 1   name_artist           11166 non-null  object        
 2   title                 11166 non-null  string        
 3   featured_artists      11166 non-null  object        
 4   language              11061 non-null  category      
 5   album                 11161 non-null  object        
 6   stats_pageviews       4642 non-null   float64       
 7   swear_IT              11166 non-null  int64         
 8   swear_EN              11166 non-null  int64         
 9   swear_IT_words        11166 non-null  object        
 10  swear_EN_words        11166 non-null  object        
 11  year                  10728 non-null  float64       
 12  month                 9969 non-null   float64       
 13  day        

## Stats page views

As noted in the data understanding phase, almost 60% of the values in this column are missing (NaN), so we decided to drop the column.

In [29]:
# Drop stats_pageviews in place; re-running this cell raises KeyError because
# the column is already gone.
df.drop(columns=['stats_pageviews'], inplace=True)