In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the semicolon-delimited chart export and preview its first rows.
df = pd.read_csv("final_cleaned_data.csv", sep=";")
df.head()

Unnamed: 0,Rank,Title,Artists,Date,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,# of Artist,Artist (Ind.),# of Nationality,Nationality,Continent,Points (Total),Points (Ind for each Artist/Nat),id,Song URL,Loudness_norm
0,1,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2023-05-29,0.668,0.758,-5.176,0.033,0.483,0.0,...,Artist 1,Eslabon Armado,Nationality 1,Mexico,Latin-America,200,100.0,3qQbCzHBycnDpGskqOWY0E,https://open.spotify.com/track/3qQbCzHBycnDpGs...,0.849862
1,2,WHERE SHE GOES,Bad Bunny,2023-05-29,0.652,0.8,-4.019,0.061,0.143,0.629,...,Artist 1,Bad Bunny,Nationality 1,Puerto Rico,Latin-America,199,199.0,7ro0hRteUMfnOioTFI5TG1,https://open.spotify.com/track/7ro0hRteUMfnOio...,0.883423
2,3,La Bebe - Remix,"Yng Lvcas, Peso Pluma",2023-05-29,0.812,0.479,-5.678,0.333,0.213,0.0,...,Artist 1,Yng Lvcas,Nationality 1,Mexico,Latin-America,198,99.0,2UW7JaomAMuX9pZrjVpHAU,https://open.spotify.com/track/2UW7JaomAMuX9pZ...,0.835301
3,4,Cupid - Twin Ver.,FIFTY FIFTY,2023-05-29,0.783,0.592,-8.332,0.033,0.435,0.0,...,Artist 1,FIFTY FIFTY,Nationality 1,South Korea,Asia,197,197.0,7FbrGaHYVDmfr7KoLIZnQ7,https://open.spotify.com/track/7FbrGaHYVDmfr7K...,0.758318
4,5,un x100to,"Grupo Frontera, Bad Bunny",2023-05-29,0.569,0.724,-4.076,0.047,0.228,0.0,...,Artist 1,Grupo Frontera,Nationality 1,Mexico,Latin-America,196,98.0,6pD0ufEQq0xdHSsRbg9LBK,https://open.spotify.com/track/6pD0ufEQq0xdHSs...,0.881769


In [3]:
# Inspect the schema: every column label first, then the (rows, columns) shape.
column_names = list(df.columns)
print(column_names)
print(f"\nDataframe shape: {df.shape}")

['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Nationality', 'Continent', 'Points (Total)', 'Points (Ind for each Artist/Nat)', 'id', 'Song URL', 'Loudness_norm']

Dataframe shape: (467061, 21)


In [4]:
# =============================================================================
# STEP 1: LOAD AND EXPLORE RAW CHART DATA
# =============================================================================
# Loads the chart export into its own frame (df_charts), converts `Date` to
# datetimes and derives the calendar `Year` column that the later split uses.

banner = "=" * 70
print(banner)
print("STEP 1: LOADING RAW CHART DATA")
print(banner)

df_charts = (
    pd.read_csv("final_cleaned_data.csv", sep=';')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .assign(Year=lambda frame: frame['Date'].dt.year)
)

# Summary values are computed once and only formatted below.
first_day = df_charts['Date'].min().date()
last_day = df_charts['Date'].max().date()
rows_per_year = df_charts['Year'].value_counts().sort_index()

print("\n‚úì Dataset loaded:")
print(f"  Total rows: {len(df_charts):,}")
print(f"  Columns: {df_charts.columns.tolist()}")
print(f"  Date range: {first_day} to {last_day}")
print(f"  Unique songs (id): {df_charts['id'].nunique():,}")
print(f"  Unique artists: {df_charts['Artist (Ind.)'].nunique():,}")

print("\nüìÖ Year distribution:")
print(rows_per_year)

print("\nFirst few rows:")
print(df_charts.head())

STEP 1: LOADING RAW CHART DATA

‚úì Dataset loaded:
  Total rows: 467,061
  Columns: ['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Nationality', 'Continent', 'Points (Total)', 'Points (Ind for each Artist/Nat)', 'id', 'Song URL', 'Loudness_norm', 'Year']
  Date range: 2017-01-01 to 2023-05-29
  Unique songs (id): 9,161
  Unique artists: 1,507

üìÖ Year distribution:
Year
2017    72182
2018    73000
2019    72998
2020    73199
2021    72998
2022    72988
2023    29696
Name: count, dtype: int64

First few rows:
   Rank              Title                     Artists       Date  \
0     1    Ella Baila Sola  Eslabon Armado, Peso Pluma 2023-05-29   
1     2     WHERE SHE GOES                   Bad Bunny 2023-05-29   
2     3    La Bebe - Remix       Yng Lvcas, Peso Pluma 2023-05-29   
3     4  Cupid - Twin Ver.                 FIFTY FIFTY 2023-05-2

In [5]:
# =============================================================================
# STEP 2: SPLIT BY YEAR (BEFORE ANY AGGREGATION)
# =============================================================================
# Chart rows are partitioned by calendar year *before* any per-song
# aggregation, so each period's song statistics only see rows of that period.

banner = "=" * 70
print("\n" + banner)
print("STEP 2: SPLITTING BY YEAR (2017-2021 TRAIN / 2022-2023 TEST)")
print(banner)

train_years = [2017, 2018, 2019, 2020, 2021]
test_years = [2022, 2023]

# Boolean row masks; .copy() detaches each subset from df_charts.
in_train_period = df_charts['Year'].isin(train_years)
in_test_period = df_charts['Year'].isin(test_years)
df_charts_train = df_charts.loc[in_train_period].copy()
df_charts_test = df_charts.loc[in_test_period].copy()

total_rows = len(df_charts)
train_rows = len(df_charts_train)
test_rows = len(df_charts_test)

print("\n‚úì Chart data split:")
print(f"  Training period: {df_charts_train['Date'].min().date()} to {df_charts_train['Date'].max().date()}")
print(f"  Test period:     {df_charts_test['Date'].min().date()} to {df_charts_test['Date'].max().date()}")
print(f"  Training rows:   {train_rows:,} ({train_rows/total_rows*100:.1f}%)")
print(f"  Test rows:       {test_rows:,} ({test_rows/total_rows*100:.1f}%)")
print(f"  Training songs:  {df_charts_train['id'].nunique():,}")
print(f"  Test songs:      {df_charts_test['id'].nunique():,}")


STEP 2: SPLITTING BY YEAR (2017-2021 TRAIN / 2022-2023 TEST)

‚úì Chart data split:
  Training period: 2017-01-01 to 2021-12-31
  Test period:     2022-01-01 to 2023-05-29
  Training rows:   364,377 (78.0%)
  Test rows:       102,684 (22.0%)
  Training songs:  7,572
  Test songs:      2,039


In [6]:
# =============================================================================
# STEP 3: AGGREGATE BY SONG ID (ONE ROW PER SONG PER PERIOD)
# =============================================================================

def _aggregate_song_performance(chart_rows):
    """Collapse chart rows into one summary row per song id.

    Parameters
    ----------
    chart_rows : pandas.DataFrame
        Chart rows holding at least the columns 'id', 'Rank', 'Date',
        'Title' and 'Artist (Ind.)'.

    Returns
    -------
    pandas.DataFrame
        One row per id with the columns id, best_rank (min Rank),
        avg_rank (mean Rank), total_weeks_charted (count of Rank values),
        first_appearance / last_appearance (min / max Date), and Title /
        Artist (the 'first' aggregate of each group).
    """
    # NOTE(review): total_weeks_charted is a plain per-id row count. The Step 1
    # year distribution (~73,000 rows per year) suggests one row per chart day,
    # so this may really measure days rather than weeks — confirm.
    return (
        chart_rows
        .groupby('id')
        .agg(
            best_rank=('Rank', 'min'),
            avg_rank=('Rank', 'mean'),
            total_weeks_charted=('Rank', 'count'),
            first_appearance=('Date', 'min'),
            last_appearance=('Date', 'max'),
            Title=('Title', 'first'),
            Artist=('Artist (Ind.)', 'first'),
        )
        .reset_index()
    )


def _print_aggregate_summary(song_perf):
    """Print the number of songs and the overall date span of an aggregate."""
    span_start = song_perf['first_appearance'].min().date()
    span_end = song_perf['last_appearance'].max().date()
    print(f"  Unique songs: {len(song_perf):,}")
    print(f"  Date range: {span_start} to {span_end}")


print("\n" + "="*70)
print("STEP 3: AGGREGATING CHART PERFORMANCE BY SONG")
print("="*70)

# Training songs (2017-2021)
print("\nüìä Aggregating TRAINING songs (2017-2021)...")
song_perf_train = _aggregate_song_performance(df_charts_train)
_print_aggregate_summary(song_perf_train)

# Test songs (2022-2023)
print("\nüìä Aggregating TEST songs (2022-2023)...")
song_perf_test = _aggregate_song_performance(df_charts_test)
_print_aggregate_summary(song_perf_test)


STEP 3: AGGREGATING CHART PERFORMANCE BY SONG

üìä Aggregating TRAINING songs (2017-2021)...
  Unique songs: 7,572
  Date range: 2017-01-01 to 2021-12-31

üìä Aggregating TEST songs (2022-2023)...
  Unique songs: 2,039
  Date range: 2022-01-01 to 2023-05-29


In [7]:
# =============================================================================
# STEP 4: CALCULATE POPULARITY SCORES AND CREATE LABELS
# =============================================================================
# Both periods are scored with the same formula. The Popular / Not Popular
# cut-off is derived from the TRAINING scores only and then reused unchanged
# for the test songs, so test data never influences the labelling rule.

# Scoring parameters, defined once so the computation and the saved metadata
# below cannot drift apart.
CHART_SIZE = 200          # peak_score = (CHART_SIZE + 1 - best_rank) / CHART_SIZE * 100
LONGEVITY_CAP = 20        # total_weeks_charted value at which longevity_score saturates
POPULAR_QUANTILE = 0.7    # songs at/above this training-score quantile are 'Popular'
weight_peak = 0.4
weight_longevity = 0.6


def _add_popularity_scores(song_perf):
    """Return a copy of `song_perf` with the three score columns added.

    peak_score       : (CHART_SIZE + 1 - best_rank) / CHART_SIZE * 100
    longevity_score  : min(total_weeks_charted / LONGEVITY_CAP, 1) * 100
    popularity_score : weight_peak * peak_score + weight_longevity * longevity_score
    """
    peak = (CHART_SIZE + 1 - song_perf['best_rank']) / CHART_SIZE * 100
    longevity = np.minimum(song_perf['total_weeks_charted'] / LONGEVITY_CAP, 1.0) * 100
    return song_perf.assign(
        peak_score=peak,
        longevity_score=longevity,
        popularity_score=weight_peak * peak + weight_longevity * longevity,
    )


def _label_popularity(song_perf, threshold):
    """Return a copy of `song_perf` with `popularity_label` set from `threshold`.

    Scores greater than or equal to `threshold` become 'Popular'; everything
    else (including NaN scores) becomes 'Not Popular'.
    """
    is_popular = song_perf['popularity_score'] >= threshold
    return song_perf.assign(
        popularity_label=np.where(is_popular, 'Popular', 'Not Popular')
    )


def _print_score_summary(song_perf):
    """Print the popularity score range and mean/std of each score column."""
    scores = song_perf['popularity_score']
    print(f"  Score range: {scores.min():.2f} - {scores.max():.2f}")

    print(f"\n  Score component stats:")
    for display_name, column in (
        ("Peak score", 'peak_score'),
        ("Longevity score", 'longevity_score'),
        ("Popularity score", 'popularity_score'),
    ):
        print(f"    {display_name:<16s} - mean: {song_perf[column].mean():.2f}, "
              f"std: {song_perf[column].std():.2f}")


def _print_label_distribution(song_perf):
    """Print count and share of each label; return the value_counts Series."""
    label_counts = song_perf['popularity_label'].value_counts()
    for label in ('Popular', 'Not Popular'):
        n_label = label_counts.get(label, 0)
        print(f"  {label + ':':<12s} {n_label:,} "
              f"({n_label/len(song_perf)*100:.1f}%)")
    return label_counts


def _print_example_songs(heading, examples):
    """Print `heading` followed by one formatted line per row of `examples`."""
    print(heading)
    for _, row in examples.iterrows():
        # str() keeps the slice/format safe if a title is missing or non-string.
        print(f"    {str(row['Title'])[:40]:40s} | Score: {row['popularity_score']:.1f} | "
              f"Rank: {row['best_rank']:3.0f} | Weeks: {row['total_weeks_charted']:3.0f}")


print("\n" + "="*70)
print("STEP 4: CALCULATING POPULARITY SCORES AND LABELS")
print("="*70)

# -----------------------------------------------------------------------------
# TRAINING SONGS (2017-2021)
# -----------------------------------------------------------------------------
print("\nüìä Processing TRAINING songs (2017-2021)...")

song_perf_train = _add_popularity_scores(song_perf_train)

# The label threshold comes from the training score distribution only.
train_threshold = song_perf_train['popularity_score'].quantile(POPULAR_QUANTILE)
song_perf_train = _label_popularity(song_perf_train, train_threshold)

print(f"  Training threshold ({POPULAR_QUANTILE * 100:.0f}th percentile): {train_threshold:.2f}")
_print_score_summary(song_perf_train)

# -----------------------------------------------------------------------------
# TEST SONGS (2022-2023)
# -----------------------------------------------------------------------------
print("\nüìä Processing TEST songs (2022-2023)...")

song_perf_test = _add_popularity_scores(song_perf_test)

# CRITICAL: test labels reuse the TRAINING threshold (no data leakage).
song_perf_test = _label_popularity(song_perf_test, train_threshold)

print(f"  Using training threshold: {train_threshold:.2f}")
_print_score_summary(song_perf_test)

# -----------------------------------------------------------------------------
# LABEL DISTRIBUTION
# -----------------------------------------------------------------------------
print("\n" + "="*70)
print("LABEL DISTRIBUTION")
print("="*70)

print(f"\nüè∑Ô∏è  TRAINING LABELS (2017-2021):")
train_label_counts = _print_label_distribution(song_perf_train)

print(f"\nüè∑Ô∏è  TEST LABELS (2022-2023):")
test_label_counts = _print_label_distribution(song_perf_test)

# Show examples of each label type: the three highest-scoring Popular songs and
# the three lowest-scoring Not Popular songs of each period.
print(f"\nüìã Example songs:")

train_popular = song_perf_train[song_perf_train['popularity_label'] == 'Popular'].nlargest(3, 'popularity_score')
_print_example_songs("\n  TRAINING - Popular songs:", train_popular)

train_not_popular = song_perf_train[song_perf_train['popularity_label'] == 'Not Popular'].nsmallest(3, 'popularity_score')
_print_example_songs("\n  TRAINING - Not Popular songs:", train_not_popular)

test_popular = song_perf_test[song_perf_test['popularity_label'] == 'Popular'].nlargest(3, 'popularity_score')
_print_example_songs("\n  TEST - Popular songs:", test_popular)

test_not_popular = song_perf_test[song_perf_test['popularity_label'] == 'Not Popular'].nsmallest(3, 'popularity_score')
_print_example_songs("\n  TEST - Not Popular songs:", test_not_popular)

# Save threshold for later use
print(f"\nüíæ Saving threshold for model deployment...")
threshold_info = pd.DataFrame({
    'threshold': [train_threshold],
    'weight_peak': [weight_peak],
    'weight_longevity': [weight_longevity],
    'quantile': [POPULAR_QUANTILE],
    'train_period': ['2017-2021'],
    'test_period': ['2022-2023']
})
threshold_info.to_csv('popularity_threshold.csv', index=False, sep=';')
print(f"  Saved to: popularity_threshold.csv")


STEP 4: CALCULATING POPULARITY SCORES AND LABELS

üìä Processing TRAINING songs (2017-2021)...
  Training threshold (70th percentile): 82.60
  Score range: 3.20 - 100.00

  Score component stats:
    Peak score       - mean: 55.80, std: 29.41
    Longevity score  - mean: 50.95, std: 41.24
    Popularity score - mean: 52.89, std: 33.36

üìä Processing TEST songs (2022-2023)...
  Using training threshold: 82.60
  Score range: 3.20 - 100.00

  Score component stats:
    Peak score       - mean: 59.61, std: 28.41
    Longevity score  - mean: 54.55, std: 41.61
    Popularity score - mean: 56.57, std: 32.76

LABEL DISTRIBUTION

üè∑Ô∏è  TRAINING LABELS (2017-2021):
  Popular:     2,275 (30.0%)
  Not Popular: 5,297 (70.0%)

üè∑Ô∏è  TEST LABELS (2022-2023):
  Popular:     679 (33.3%)
  Not Popular: 1,360 (66.7%)

üìã Example songs:

  TRAINING - Popular songs:
    Señorita                                 | Score: 100.0 | Rank:   1 | Weeks:  35
    Blinding Lights                         

In [8]:
# =============================================================================
# STEP 5: CHECK FOR OVERLAP SONGS
# =============================================================================
# A song id can have chart rows in both periods. Such songs stay in the
# training set (labelled from their 2017-2021 rows only) and are dropped from
# the test set, producing `song_perf_test_clean`.

print("\n" + "="*70)
print("STEP 5: CHECKING FOR SONGS APPEARING IN BOTH PERIODS")
print("="*70)

train_ids = set(song_perf_train['id'])
test_ids = set(song_perf_test['id'])
overlap_ids = train_ids & test_ids

# Guard the share computation so two empty periods do not divide by zero.
n_unique_songs = len(train_ids | test_ids)
overlap_share = len(overlap_ids) / n_unique_songs * 100 if n_unique_songs else 0.0

print(f"\nüìä Overlap analysis:")
print(f"  Training-only songs: {len(train_ids - test_ids):,}")
print(f"  Test-only songs:     {len(test_ids - train_ids):,}")
print(f"  Overlap songs:       {len(overlap_ids):,} "
      f"({overlap_share:.1f}% of unique songs)")

if len(overlap_ids) > 0:
    print(f"\n‚ö†Ô∏è  {len(overlap_ids)} songs charted in BOTH periods!")
    # The training window spans 2017-2021, so an overlap song may have charted
    # in any of those years (not only 2021) before re-appearing in 2022-2023.
    print(f"   These are songs that charted during 2017-2021 and charted again in 2022-2023")

    # Show examples. sorted() makes the selection reproducible: slicing a set
    # directly depends on its iteration order, which can change between runs.
    example_ids = sorted(overlap_ids)[:5]
    overlap_examples = song_perf_train[song_perf_train['id'].isin(example_ids)]
    print(f"\n  Examples of overlap songs:")
    for _, row in overlap_examples.iterrows():
        test_row = song_perf_test[song_perf_test['id'] == row['id']].iloc[0]
        print(f"\n    '{row['Title']}' by {row['Artist']}")
        print(f"      2017-2021: best_rank={row['best_rank']}, weeks={row['total_weeks_charted']}, "
              f"label={row['popularity_label']}")
        print(f"      2022-2023: best_rank={test_row['best_rank']}, weeks={test_row['total_weeks_charted']}, "
              f"label={test_row['popularity_label']}")

    print(f"\nüí° DECISION: Assign overlap songs to TRAINING set")
    print(f"   Reason: They first appeared before 2022, so model can learn from them")
    print(f"   This prevents using future (2022-2023) information in training labels")

    # Remove from test set
    song_perf_test_clean = song_perf_test[~song_perf_test['id'].isin(overlap_ids)].copy()
    print(f"\n‚úì Test set cleaned:")
    print(f"  Before: {len(song_perf_test):,} songs")
    print(f"  After:  {len(song_perf_test_clean):,} songs")
    print(f"  Removed: {len(overlap_ids):,} songs (moved to training)")
else:
    song_perf_test_clean = song_perf_test.copy()
    print(f"\n‚úÖ No overlap - all songs are period-specific!")


STEP 5: CHECKING FOR SONGS APPEARING IN BOTH PERIODS

üìä Overlap analysis:
  Training-only songs: 7,122
  Test-only songs:     1,589
  Overlap songs:       450 (4.9% of unique songs)

‚ö†Ô∏è  450 songs charted in BOTH periods!
   These are songs that appeared in 2021 and continued into 2022-2023

  Examples of overlap songs:

    'Frosty the Snowman' by The Ronettes
      2017-2021: best_rank=105, weeks=4, label=Not Popular
      2022-2023: best_rank=116, weeks=2, label=Not Popular

    'Man With The Bag' by Jessie J
      2017-2021: best_rank=113, weeks=6, label=Not Popular
      2022-2023: best_rank=158, weeks=2, label=Not Popular

    'My Kind Of Present' by Meghan Trainor
      2017-2021: best_rank=99, weeks=4, label=Not Popular
      2022-2023: best_rank=167, weeks=2, label=Not Popular

    'Christmas (Baby Please Come Home)' by Mariah Carey
      2017-2021: best_rank=31, weeks=74, label=Popular
      2022-2023: best_rank=38, weeks=16, label=Not Popular

    'Jingle Bells - Rem

In [9]:
# =============================================================================
# STEP 6: COMBINE AND SAVE
# =============================================================================
# Tags each per-song frame with an `is_train` flag, stacks the two frames into
# `songs_labeled`, reports the label mix and writes the result to CSV.

def _print_label_counts(label_counts, indent, total=None):
    """Print one line per label; append the share of `total` when it is given."""
    for label, count in label_counts.items():
        line = f"{indent}{label:15s}: {count:,}"
        if total is not None:
            line += f" ({count/total*100:.1f}%)"
        print(line)


print("\n" + "="*70)
print("STEP 6: COMBINING DATASETS")
print("="*70)

song_perf_train['is_train'] = True
song_perf_test_clean['is_train'] = False

songs_labeled = pd.concat([song_perf_train, song_perf_test_clean], ignore_index=True)

# Counts are computed once and reused for both the absolute and relative figures.
is_train_mask = songs_labeled['is_train']
n_songs = len(songs_labeled)
n_train = is_train_mask.sum()
n_test = (~is_train_mask).sum()

print("\n‚úì Combined dataset:")
print(f"  Total songs:  {n_songs:,}")
print(f"  Training:     {n_train:,} ({n_train/n_songs*100:.1f}%)")
print(f"  Test:         {n_test:,} ({n_test/n_songs*100:.1f}%)")

print("\n‚úì Overall label distribution:")
overall_labels = songs_labeled['popularity_label'].value_counts()
_print_label_counts(overall_labels, indent="  ", total=n_songs)

# By train/test split
print("\n‚úì Label distribution by split:")
print("  Training (2017-2021):")
train_dist = songs_labeled.loc[is_train_mask, 'popularity_label'].value_counts()
_print_label_counts(train_dist, indent="    ")

print("\n  Test (2022-2023):")
test_dist = songs_labeled.loc[~is_train_mask, 'popularity_label'].value_counts()
_print_label_counts(test_dist, indent="    ")

# Verify columns
print("\n‚úì Columns in output:")
print(f"  {songs_labeled.columns.tolist()}")

# Save
output_file = 'songs_aggregated_labeled.csv'
songs_labeled.to_csv(output_file, index=False, sep=';')
print(f"\nüíæ Saved to: {output_file}")
print(f"  Shape: {songs_labeled.shape}")


STEP 6: COMBINING DATASETS

‚úì Combined dataset:
  Total songs:  9,161
  Training:     7,572 (82.7%)
  Test:         1,589 (17.3%)

‚úì Overall label distribution:
  Not Popular    : 6,378 (69.6%)
  Popular        : 2,783 (30.4%)

‚úì Label distribution by split:
  Training (2017-2021):
    Not Popular    : 5,297
    Popular        : 2,275

  Test (2022-2023):
    Not Popular    : 1,081
    Popular        : 508

‚úì Columns in output:
  ['id', 'best_rank', 'avg_rank', 'total_weeks_charted', 'first_appearance', 'last_appearance', 'Title', 'Artist', 'peak_score', 'longevity_score', 'popularity_score', 'popularity_label', 'is_train']

üíæ Saved to: songs_aggregated_labeled.csv
  Shape: (9161, 13)


In [10]:
# =============================================================================
# STEP 7: FINAL VERIFICATION
# =============================================================================
# Runs four sanity checks on `songs_labeled`. The closing banner only claims
# "no temporal leakage" when the duplicate-id check and the temporal
# separation check both passed.

def _imbalance_ratio(label_counts, expected_labels=('Popular', 'Not Popular')):
    """Return majority/minority count ratio over `expected_labels`.

    A label that is absent from `label_counts` counts as 0, in which case the
    ratio is reported as infinity instead of comparing one class with itself.
    """
    counts = label_counts.reindex(list(expected_labels), fill_value=0)
    minority = counts.min()
    if minority == 0:
        return float('inf')
    return counts.max() / minority


print("\n" + "="*70)
print("STEP 7: FINAL VERIFICATION")
print("="*70)

# Check 1: No duplicates (a duplicated id would mean one song sits in both splits
# or was aggregated twice).
duplicates = songs_labeled['id'].duplicated().sum()
print(f"\n1Ô∏è‚É£  Duplicate check:")
if duplicates > 0:
    print(f"  ‚ùå ERROR: {duplicates} duplicate song IDs!")
else:
    print(f"  ‚úÖ Each song appears exactly once")

# Check 2: No missing values
missing = songs_labeled.isnull().sum()
print(f"\n2Ô∏è‚É£  Missing values:")
if missing.sum() > 0:
    print(f"  ‚ö†Ô∏è  Found missing values:")
    print(missing[missing > 0])
else:
    print(f"  ‚úÖ No missing values")

# Check 3: Temporal integrity
train_songs = songs_labeled[songs_labeled['is_train']]
test_songs = songs_labeled[~songs_labeled['is_train']]

latest_train = train_songs['last_appearance'].max()
earliest_test = test_songs['first_appearance'].min()

print(f"\n3Ô∏è‚É£  Temporal separation:")
print(f"  Latest training song ended:  {latest_train.date()}")
print(f"  Earliest test song started:  {earliest_test.date()}")
print(f"  Gap: {(earliest_test - latest_train).days} days")

# Strictly later: a test song starting on the very day the last training song
# ended is not treated as a clean separation.
temporally_separated = bool(earliest_test > latest_train)

if earliest_test.year > latest_train.year:
    print(f"  ‚úÖ Perfect year separation!")
elif temporally_separated:
    print(f"  ‚úÖ Clean temporal separation!")
else:
    gap_days = (earliest_test - latest_train).days
    print(f"  ‚ö†Ô∏è  Some overlap of {abs(gap_days)} days between training and test periods")

# Check 4: Label balance
print(f"\n4Ô∏è‚É£  Label balance:")
train_balance = train_songs['popularity_label'].value_counts()
test_balance = test_songs['popularity_label'].value_counts()

train_ratio = _imbalance_ratio(train_balance)
test_ratio = _imbalance_ratio(test_balance)

print(f"  Training imbalance ratio: {train_ratio:.2f}:1")
print(f"  Test imbalance ratio:     {test_ratio:.2f}:1")

if train_ratio > 3 or test_ratio > 3:
    print(f"  ‚ö†Ô∏è  High class imbalance detected - will need to handle in modeling")
else:
    print(f"  ‚úÖ Reasonable class balance")

# Only report success when the checks that guard against leakage passed.
labeling_ok = bool(duplicates == 0) and temporally_separated

print("\n" + "="*70)
if labeling_ok:
    print("‚úÖ‚úÖ‚úÖ LABELING COMPLETE - NO TEMPORAL LEAKAGE!")
else:
    print("LABELING FINISHED WITH FAILED CHECKS - REVIEW BEFORE MODELING")
print("="*70)
print("\nDataset Summary:")
print(f"  Training: 2017-2021 ({train_songs['id'].nunique():,} songs)")
print(f"  Test:     2022-2023 ({test_songs['id'].nunique():,} songs)")
if labeling_ok:
    print(f"  No overlap between train/test periods")
    print(f"  Ready for feature engineering and modeling")
else:
    print(f"  Train/test separation could not be confirmed - see the checks above")
print("="*70)


STEP 7: FINAL VERIFICATION

1Ô∏è‚É£  Duplicate check:
  ‚úÖ Each song appears exactly once

2Ô∏è‚É£  Missing values:
  ‚úÖ No missing values

3Ô∏è‚É£  Temporal separation:
  Latest training song ended:  2021-12-31
  Earliest test song started:  2022-01-01
  Gap: 1 days
  ‚úÖ Perfect year separation!

4Ô∏è‚É£  Label balance:
  Training imbalance ratio: 2.33:1
  Test imbalance ratio:     2.13:1
  ‚úÖ Reasonable class balance

‚úÖ‚úÖ‚úÖ LABELING COMPLETE - NO TEMPORAL LEAKAGE!

Dataset Summary:
  Training: 2017-2021 (7,572 songs)
  Test:     2022-2023 (1,589 songs)
  No overlap between train/test periods
  Ready for feature engineering and modeling
