In [165]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [166]:
data = 'dataset/df_cleaned.csv'
index_col = 0
df = pd.read_csv(data, sep=',', index_col=index_col)

pd.set_option('display.max_columns', None)

In [167]:
df['gender'] = df['gender'].astype('category')
df['nationality'] = df['nationality'].astype('category')
df['country'] = df['country'].astype('category')
df['region'] = df['region'].astype('category')
df['province'] = df['province'].astype('category')
df['birth_place'] = df['birth_place'].astype('category')
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')
df['active_start'] = pd.to_datetime(df['active_start'], errors='coerce')
df['description'] = df['description'].astype('string')


In [168]:
df['id_artist'] = df['id_artist'].astype('category')
df['id_album'] = df['id_album'].astype('category')
df['language'] = df['language'].astype('category')
df['album_type'] = df['album_type'].astype('category')
df['popularity'] = pd.to_numeric(df['popularity'], errors = 'coerce').astype('Int64')
df['n_sentences'] = df['n_sentences'].astype('Int64')
df['n_tokens'] = df['n_tokens'].astype('Int64')
df['disc_number'] = df['disc_number'].astype('Int64')
df['track_number'] = df['track_number'].astype('Int64')
df['explicit'] = df['explicit'].astype('bool')
df['name_artist'] = df['name_artist'].astype('string')
df['title'] = df['title'].astype('string')
df['album'] = df['album'].astype('string')
df['album_image'] = df['album_image'].astype('string')
df['lyrics'] = df['lyrics'].astype('string')
df['correct_release_date'] = pd.to_datetime(df['correct_release_date'], errors='coerce')

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11166 entries, TR934808 to TR552777
Data columns (total 45 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id_artist             11166 non-null  category      
 1   name_artist           11166 non-null  string        
 2   title                 11166 non-null  string        
 3   featured_artists      11166 non-null  object        
 4   language              11032 non-null  category      
 5   swear_IT              11166 non-null  int64         
 6   swear_EN              11166 non-null  int64         
 7   swear_IT_words        11166 non-null  object        
 8   swear_EN_words        11166 non-null  object        
 9   n_sentences           10959 non-null  Int64         
 10  n_tokens              10955 non-null  Int64         
 11  tokens_per_sent       10958 non-null  float64       
 12  char_per_tok          11090 non-null  float64       
 13  lexical_den

In [170]:
import ast

def safe_literal_eval(value):
    """
    Safely converts a string representation of a list into a Python list.
    Handles NaN/missing values by returning an empty list or pd.NA.
    """
    
    if pd.isna(value) or value in (None, 'NaN', '', '[<NA>]'):
        # Return an empty list for missing values if you plan to iterate over it
        return []
    try:
        # Use ast.literal_eval for safe conversion of string-to-list
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Handle cases where the string is malformed or not a list structure
        print(f"Warning: Could not convert value: {value}")
        return [] # Default to empty list on failure

df['swear_IT_words'] = df['swear_IT_words'].apply(safe_literal_eval)
df['swear_EN_words'] = df['swear_EN_words'].apply(safe_literal_eval)
df['featured_artists'] = df['featured_artists'].apply(safe_literal_eval)

## Adding artist related features 


In [171]:

mean_popularity_artist = df.groupby('id_artist')['popularity'].mean()
df['mean_popularity_artist'] = df['id_artist'].map(mean_popularity_artist)


mean_popularity_album = df.groupby('id_album')['popularity'].mean()
df['mean_popularity_album'] = df['id_album'].map(mean_popularity_album)

mean_duration_song_per_album = df.groupby('id_album')['duration_ms'].mean()
df['mean_duration_song_per_album'] = df['id_album'].map(mean_duration_song_per_album)



  mean_popularity_artist = df.groupby('id_artist')['popularity'].mean()
  mean_popularity_album = df.groupby('id_album')['popularity'].mean()
  mean_duration_song_per_album = df.groupby('id_album')['duration_ms'].mean()


In [172]:
print(df[['mean_duration_song_per_album', 'duration_ms', 'id_album']].head(10))

          mean_duration_song_per_album  duration_ms   id_album
id                                                            
TR934808                 189279.333333     207761.0  ALB115557
TR760029                 189279.333333     207761.0  ALB115557
TR916821                 189279.333333     193544.0  ALB115557
TR480968                 162034.750000     169000.0  ALB730959
TR585039                 161777.769231     194779.0  ALB436151
TR550335                 161777.769231     200329.0  ALB436151
TR170793                 161777.769231     171230.0  ALB436151
TR627195                 161777.769231     168513.0  ALB436151
TR628871                 178080.000000     178080.0  ALB672657
TR700756                 161777.769231     188534.0  ALB436151


## Feature track pop relative region 

In [173]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# --- STEP 1: Calculate the Raw Relative Popularity (without creating columns) ---

# 1. Use .transform('mean') to calculate the mean popularity for each region
#    and align it back to every row of the DataFrame.
#    This result is stored as a temporary Series, NOT a DataFrame column.
regional_avg_series = df.groupby('region')['popularity'].transform('mean')

# 2. Calculate the raw difference (also stored as a temporary Series)
raw_relative_popularity_series = df['popularity'] - regional_avg_series

# --- STEP 2: Normalize and Final Assignment ---

# 3. Instantiate the scaler
scaler = StandardScaler()

# 4. Fit/transform the temporary Series and assign the result directly to the new column
#    The result is a 2D NumPy array which is then assigned to the new DataFrame column.
df['Song_Pop_vs_Region_Avg_Zscore'] = scaler.fit_transform(
    raw_relative_popularity_series.values.reshape(-1, 1)
)

# Check the new feature (only the final column is displayed)
print("\nFinal Normalized Feature Statistics:")
print(df['Song_Pop_vs_Region_Avg_Zscore'].agg(['mean', 'std']))

print("\nComparison (Region and Final Z-Score):")
print(df[['name_artist', 'region', 'popularity', 'Song_Pop_vs_Region_Avg_Zscore']].head())


Final Normalized Feature Statistics:
mean   -1.787979e-18
std     1.000063e+00
Name: Song_Pop_vs_Region_Avg_Zscore, dtype: float64

Comparison (Region and Final Z-Score):
            name_artist    region  popularity  Song_Pop_vs_Region_Avg_Zscore
id                                                                          
TR934808  rosa chemical  Piemonte          46                       0.876089
TR760029  rosa chemical  Piemonte          46                       0.876089
TR916821  rosa chemical  Piemonte          39                       0.507473
TR480968  rosa chemical  Piemonte          47                       0.928748
TR585039  rosa chemical  Piemonte          41                       0.612792


  regional_avg_series = df.groupby('region')['popularity'].transform('mean')


## Avg time gap artist publishes abum

In [174]:
# --- STEP 1: Prepare the unique, sorted album data ---

# 1. Convert the date to a proper datetime format
# The 'errors=coerce' handles any messy non-date entries, turning them into NaT (Not a Time).
df['Album_Date'] = pd.to_datetime(df['correct_release_date'], errors='coerce')

# 2. Get unique albums per artist and drop any rows where the date conversion failed
unique_releases = df[['id_artist', 'Album_Date']].drop_duplicates().dropna(subset=['Album_Date'])

# 3. Sort the releases chronologically WITHIN each artist group
unique_releases = unique_releases.sort_values(by=['id_artist', 'Album_Date'])


# --- STEP 2: Calculate the time gaps (intervals) ---

# 4. Group by artist and calculate the difference between the current date and the previous one (.shift(1))
unique_releases['Previous_Date'] = unique_releases.groupby('id_artist')['Album_Date'].shift(1)

# 5. Calculate the time difference (Timedelta object)
unique_releases['Release_Gap'] = unique_releases['Album_Date'] - unique_releases['Previous_Date']

# 6. Convert the Timedelta object into a usable numeric value (e.g., number of days)
unique_releases['Release_Gap_Days'] = unique_releases['Release_Gap'].dt.days


# --- STEP 3: Aggregate the new feature and map it back ---

# 7. Calculate the Mean and Standard Deviation of the gaps for each artist
# The mean tells you the typical delay; the std tells you how consistent the pacing is.
artist_pacing_stats = unique_releases.groupby('id_artist')['Release_Gap_Days'].agg(
    ['mean', 'std']
).rename(columns={'mean': 'Artist_Avg_Release_Gap_Days', 'std': 'Artist_Std_Release_Gap_Days'})

# 8. Map the aggregated stats back to the main DataFrame
df = df.merge(
    artist_pacing_stats,
    on='id_artist',
    how='left'
)

# Optional cleanup
df = df.drop(columns=['Album_Date'])


# 9. Check the new features
print("Artist's Release Pacing Features:")
print(df[['name_artist', 'correct_release_date', 'Artist_Avg_Release_Gap_Days', 'Artist_Std_Release_Gap_Days']].head(10))

Artist's Release Pacing Features:
     name_artist correct_release_date  Artist_Avg_Release_Gap_Days  \
0  rosa chemical           2021-04-09                       144.75   
1  rosa chemical           2021-04-09                       144.75   
2  rosa chemical           2021-04-09                       144.75   
3  rosa chemical           2025-05-16                       144.75   
4  rosa chemical           2020-05-28                       144.75   
5  rosa chemical           2020-05-28                       144.75   
6  rosa chemical           2020-05-28                       144.75   
7  rosa chemical           2020-05-28                       144.75   
8  rosa chemical           2023-02-08                       144.75   
9  rosa chemical           2020-05-28                       144.75   

   Artist_Std_Release_Gap_Days  
0                   184.044005  
1                   184.044005  
2                   184.044005  
3                   184.044005  
4                   184.044005

  unique_releases['Previous_Date'] = unique_releases.groupby('id_artist')['Album_Date'].shift(1)
  artist_pacing_stats = unique_releases.groupby('id_artist')['Release_Gap_Days'].agg(


## words Per Minute (WPM)

In [175]:
# 1. Calculate the estimated Total Characters (the numerator)
# This serves as the proxy for total syllables delivered.
total_characters = df['n_tokens']

# 2. Calculate the Duration in Minutes (easier for division)
duration_minutes = df['duration_ms'] / 60000

# 3. Calculate the Final Characters Per Minute (CPM)
# Note: This is mathematically equivalent to your single-line formula.
# We calculate it this way to improve readability and avoid potential syntax errors.
df['Characters_Per_Minute'] = total_characters / duration_minutes

# Check the new feature
print(df[['name_artist', 'title', 'n_tokens', 'duration_ms', 'Characters_Per_Minute']].head())

     name_artist         title  n_tokens  duration_ms  Characters_Per_Minute
0  rosa chemical  ‚Äãpolka 2 :-/       911     207761.0             263.090763
1  rosa chemical         POLKA       675     207761.0             194.935527
2  rosa chemical  ‚Äãbritney ;-)       758     193544.0             234.985326
3  rosa chemical           CEO       382     169000.0             135.621302
4  rosa chemical        LONDRA       429     194779.0              132.14977


## Swear Word Rate

In [176]:
# Calculate Swear Word Rate (normalized by total words)
# This calculation avoids creating a persistent 'Total_Swears' column in df.
df['Swear_Rate'] = np.where(
    # Condition: Ensure n_tokens is not null AND greater than zero to prevent division by zero
    df['n_tokens'].notna() & (df['n_tokens'] > 0),

    # Value if True: Calculate the ratio (swear_IT + swear_EN) / n_tokens
    (df['swear_IT'] + df['swear_EN']) / df['n_tokens'],

    # Value if False: Set rate to 0 (no words to swear with)
    0
)

# Display the new feature and its components (for verification only)
print(df[['name_artist', 'title', 'swear_IT', 'swear_EN', 'n_tokens', 'Swear_Rate']].head())

     name_artist         title  swear_IT  swear_EN  n_tokens  Swear_Rate
0  rosa chemical  ‚Äãpolka 2 :-/        13         6       911    0.020856
1  rosa chemical         POLKA         9        12       675    0.031111
2  rosa chemical  ‚Äãbritney ;-)        16        12       758    0.036939
3  rosa chemical           CEO         8         3       382    0.028796
4  rosa chemical        LONDRA         1         0       429    0.002331


## Rank song based on popularity of the year 


In [177]:
# 1. Calculate the Rank_by_Year directly.
# The year extraction (pd.to_datetime(...).dt.year) creates a temporary Series
# that is used as the grouping key but is NOT assigned as a permanent column to df.
df['Rank_by_Year'] = (
    df.groupby(pd.to_datetime(df['correct_release_date'], errors='coerce').dt.year)['popularity']
    .rank(method='dense', ascending=False)
)

# 2. Verification Check (Requires temporary Year for filtering in the print statement)
# Create a temporary Series 'temp_year' for the filter condition only
temp_year = pd.to_datetime(df['correct_release_date'], errors='coerce').dt.year

print("Top 5 Tracks Ranked by Popularity in a Specific Year (Example - using year 2020):")
print(df[temp_year == 2020]
      [['name_artist', 'title', 'correct_release_date', 'popularity', 'Rank_by_Year']]
      .sort_values(by='Rank_by_Year', ascending=True)
      .head(5)
)

Top 5 Tracks Ranked by Popularity in a Specific Year (Example - using year 2020):
     name_artist                     title correct_release_date  popularity  \
272       thasup                  No Sleep           2020-03-20          91   
8826    highsnob                 Bad Bitch           2020-10-16          85   
7636    tony boy                   War RMX           2020-03-21          84   
8824    highsnob  Per odiarti non ho tempo           2020-10-16          82   
1560      piotta        Su di me (Outtake)           2020-10-30          79   

      Rank_by_Year  
272            1.0  
8826           2.0  
7636           3.0  
8824           4.0  
1560           5.0  


In [178]:
# 1. Recalculate the temporary Year Series for filtering (necessary for filtering)
temp_year = pd.to_datetime(df['correct_release_date'], errors='coerce').dt.year

# 2. Identify the last 10 unique, non-null years present in the data
last_10_years = temp_year.dropna().unique()
# Sort the years in descending order and select the first 20
last_10_years = np.sort(last_10_years)[::-1][:20]


# 3. Loop through the last 10 years and print the top 5 ranked tracks
print("\n--- Top 5 Tracks Ranked by Popularity (Last 20 Years) ---")
print("-----------------------------------------------------------------")

for year in last_10_years:
    # Filter the DataFrame for the current year
    yearly_df = df[temp_year == year].copy()

    # Sort and display the top 5 tracks for the current year
    top_5 = yearly_df[
        ['name_artist', 'title', 'popularity', 'Rank_by_Year']
    ].sort_values(by='Rank_by_Year', ascending=True).head(5)

    if not top_5.empty:
        print(f"\nüèÜ YEAR: {int(year)}")
        print(top_5.to_string(index=False))


--- Top 5 Tracks Ranked by Popularity (Last 20 Years) ---
-----------------------------------------------------------------

üèÜ YEAR: 2025
name_artist            title  popularity  Rank_by_Year
rondodasosa          MY LIFE          97           1.0
      fedez          My Life          97           1.0
     thasup Coraline In Love          97           1.0
    babaman       First Love          95           2.0
      fedez      I‚Äôm So High          94           3.0

üèÜ YEAR: 2024
name_artist                           title  popularity  Rank_by_Year
   tony boy                   Tutto a posto          78           1.0
  marracash                     Fotoromanzo          75           2.0
  marracash Promo Sky NBA 2016/17 Freestyle          75           2.0
  marracash                          Regole          75           2.0
      lazza                         Candida          75           2.0

üèÜ YEAR: 2023
  name_artist             title  popularity  Rank_by_Year
        luche 