In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the dataset through kagglehub and keep the value it returns.
dataset_handle = 'wardabilal/spotify-global-music-dataset-20092025'
wardabilal_spotify_global_music_dataset_20092025_path = kagglehub.dataset_download(dataset_handle)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locate the CSV. On Kaggle the dataset is mounted under /kaggle/input; elsewhere
# (e.g. Colab) that path does not exist, so fall back to the directory returned by
# kagglehub.dataset_download() in the first cell.
DATA_FILENAME = "spotify_data clean.csv"
kaggle_input_csv = os.path.join(
    "/kaggle/input/spotify-global-music-dataset-20092025", DATA_FILENAME
)
if os.path.exists(kaggle_input_csv):
    csv_path = kaggle_input_csv
else:
    csv_path = os.path.join(
        wardabilal_spotify_global_music_dataset_20092025_path, DATA_FILENAME
    )

df = pd.read_csv(csv_path)

# Fixed seed so the preview shows the same rows on every Restart & Run All.
df.sample(6, random_state=42)


In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts) to spot
# columns that need type coercion or contain missing values.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max); with default
# arguments this covers the numeric columns.
df.describe()

# Feature Engineering - Date & Track Age

In [None]:
import matplotlib.pyplot as plt

# Parse release dates; values that cannot be parsed become NaT instead of raising.
df['album_release_date'] = pd.to_datetime(df['album_release_date'], errors='coerce')
df['release_year'] = df['album_release_date'].dt.year

# Reference year for the age calculation. A fixed 2024 produces negative ages for
# any track released after 2024, so move the reference forward to the newest
# release year present in the data when that is later.
REFERENCE_YEAR_FLOOR = 2024
latest_release_year = df['release_year'].max()
if pd.isna(latest_release_year):
    current_year = REFERENCE_YEAR_FLOOR
else:
    current_year = max(REFERENCE_YEAR_FLOOR, int(latest_release_year))
df['track_age_raw'] = current_year - df['release_year']

# The median is taken over all known ages; the histogram only hides ages >= 100
# so that outliers do not flatten the plot.
median_track_age = df['track_age_raw'].median()
ages_under_100 = df.loc[df['track_age_raw'] < 100, 'track_age_raw']

fig, ax = plt.subplots(figsize=(10, 6))
ages_under_100.hist(bins=20, color='#6A0DAD', edgecolor='black', alpha=0.8, ax=ax)
ax.axvline(median_track_age, color='r', linestyle='--', linewidth=2,
           label=f'Median Age: {median_track_age:.1f} years')
ax.set_title('Distribution of Track Age (Justifying Imputation)', fontsize=16)
ax.set_xlabel('Track Age (Years)', fontsize=12)
ax.set_ylabel('Number of Tracks', fontsize=12)
ax.legend()
fig.tight_layout()
plt.show()

# Action: impute missing ages with the median, then drop the helper columns.
df['track_age'] = df['track_age_raw'].fillna(median_track_age)
df = df.drop(columns=['track_age_raw', 'release_year'], errors='ignore')

# Feature Engineering - Genre & Target

In [None]:
# Strings that stand for "no genre" once the first entry has been cleaned and lower-cased.
_MISSING_GENRE_TOKENS = {'', 'nan', 'n/a', 'none'}

# Popularity score above which a track counts as highly popular.
POPULARITY_THRESHOLD = 75


def _extract_primary_genre(raw_genres: str) -> str:
    """Return the first genre of a comma-separated genre string, lower-cased.

    Handles both plain strings ("pop, dance pop") and list-style strings
    ("['pop', 'dance pop']"): surrounding whitespace, brackets and quotes are
    stripped from the first entry, so "['pop', 'rock']" and "['pop']" both give
    "pop" and "[]" gives "unknown". Missing-value placeholders give "unknown".
    """
    first_entry = raw_genres.split(',')[0]
    genre = first_entry.strip(" \t\r\n[]'\"").lower()
    return 'unknown' if genre in _MISSING_GENRE_TOKENS else genre


df['artist_genres'] = df['artist_genres'].astype(str)
df['primary_genre'] = df['artist_genres'].map(_extract_primary_genre)

genre_counts = df['primary_genre'].value_counts()

fig, ax = plt.subplots(figsize=(14, 7))
genre_counts.nlargest(50).plot(kind='bar', color='#1DB954', alpha=0.9, ax=ax)
ax.set_title('Frequency of Top 50 Primary Genres (Justifying Grouping)', fontsize=16)
ax.set_xlabel('Primary Genre', fontsize=12)
ax.set_ylabel('Number of Tracks', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Action: keep the most frequent genres as categories and bucket the long tail,
# which keeps the later one-hot encoding small.
top_n = 20
top_genres = genre_counts.nlargest(top_n).index
df['primary_genre_grouped'] = np.where(
    df['primary_genre'].isin(top_genres), df['primary_genre'], 'other_genre'
)

# Binary target: 1 when the popularity score is strictly above the threshold.
df['is_highly_popular'] = (df['track_popularity'] > POPULARITY_THRESHOLD).astype(int)

# Exploratory Data Analysis (EDA)

In [None]:
#1) Top 10 Primary Genres
# Exclude the 'unknown' placeholder by label. Skipping the first row by position
# would silently drop a real genre whenever 'unknown' is not the most frequent value.
primary_genre_counts = df['primary_genre'].value_counts()
top_10_genres = primary_genre_counts[primary_genre_counts.index != 'unknown'].head(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_10_genres.sort_values(ascending=True).plot(kind='barh', color='#1DB954', ax=ax)
ax.set_title('Top 10 Primary Genres by Track Count', fontsize=16)
ax.set_xlabel('Number of Tracks', fontsize=12)
ax.set_ylabel('Primary Genre', fontsize=12)
fig.tight_layout()
plt.show()

#2) Track Popularity vs. Log of Artist Followers (Bivariate Analysis)
# Coerce followers to numeric, drop rows missing either variable, and add the
# log-scaled follower count. `df` is rebound because later cells rely on the
# numeric `artist_followers` column and the filtered rows.
df['artist_followers'] = pd.to_numeric(df['artist_followers'], errors='coerce')
df = (
    df.dropna(subset=['artist_followers', 'track_popularity'])
      .assign(log_followers=lambda frame: np.log1p(frame['artist_followers']))
)

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(df['log_followers'], df['track_popularity'], alpha=0.2, color='#191414', s=10)
ax.set_title('Track Popularity vs. Log of Artist Followers', fontsize=16)
ax.set_xlabel('Artist Followers (Logarithmic Scale)', fontsize=12)
ax.set_ylabel('Track Popularity Score (0-100)', fontsize=12)
fig.tight_layout()
plt.show()

#3) Top 10 Most Popular Tracks (Ranking)
top_tracks = (
    df.drop_duplicates(subset=['track_id'])
      .sort_values('track_popularity', ascending=False)
      .head(10)
)
# Plot against numeric positions and label the ticks afterwards: passing the names
# directly makes matplotlib treat them as categories, so two different tracks that
# share a name would collapse into a single bar.
tracks_bottom_up = top_tracks.iloc[::-1]
bar_positions = np.arange(len(tracks_bottom_up))

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(bar_positions, tracks_bottom_up['track_popularity'], color='#50B594')
ax.set_yticks(bar_positions)
ax.set_yticklabels(tracks_bottom_up['track_name'].astype(str))
ax.set_title('Top 10 Most Popular Tracks', fontsize=16)
ax.set_xlabel('Track Popularity Score (0-100)', fontsize=12)
ax.set_ylabel('Track Name', fontsize=12)
ax.set_xlim(0, 100)
fig.tight_layout()
plt.show()

# Model Training (Random Forest and XGBoost)
**What is it predicting?**
The model predicts a binary outcome: whether a track will achieve High Popularity, defined in this project as a Spotify Popularity Score greater than 75.

It is a Binary Classification model designed to solve a business problem: identifying potential hit songs early.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

# --- Shared data preparation -------------------------------------------------
# Both models are trained on the same features and the same split, so the
# preparation is done once instead of being repeated per model.

#Features selected for the final model
features_to_use = [
    'track_duration_min', 'explicit', 'artist_popularity', 'artist_followers',
    'album_total_tracks', 'album_type', 'track_age', 'primary_genre_grouped'
]
target = 'is_highly_popular'

#Final data selection and cleaning: rows with any missing feature or target are dropped
data = df[features_to_use + [target]].dropna().copy()

#Converting 'explicit' column to 0/1 integer; unrecognized values fall back to 0
explicit_codes = {'TRUE': 1, 'FALSE': 0, '1': 1, '0': 0}
data['explicit'] = (
    data['explicit'].astype(str).str.upper().map(explicit_codes).fillna(0).astype(int)
)

#One-Hot Encoding of the categorical columns (first level dropped)
X = pd.get_dummies(data.drop(columns=[target]), drop_first=True)
y = data[target]

#Train-Test Split (stratify=y keeps the class ratio the same in train and test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Random Forest -----------------------------------------------------------
# class_weight='balanced' compensates for the scarcity of highly popular tracks.
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
model.fit(X_train, y_train)

print(f"Random Forest Classifier trained on {len(X_train)} samples.")

# --- XGBoost -----------------------------------------------------------------
# scale_pos_weight = negatives / positives is XGBoost's counterpart to class weighting.
neg_count = int((y_train == 0).sum())
pos_count = int((y_train == 1).sum())
if pos_count == 0:
    raise ValueError(
        "No highly popular tracks in the training split; cannot compute scale_pos_weight."
    )
scale_pos_weight = neg_count / pos_count

# `use_label_encoder` is intentionally not passed: it is deprecated and has no
# effect in current XGBoost releases, where it only triggers a warning.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    learning_rate=0.1
)

xgb_model.fit(X_train, y_train)

print(f"XGBoost Classifier trained on {len(X_train)} samples.")
print(f"Scale position weight used: {scale_pos_weight:.2f}")

In [None]:
from sklearn.metrics import classification_report

# Score both fitted classifiers on the same held-out split.
y_pred = model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

class_labels = ['Low Popularity (0)', 'High Popularity (1)']
reports = (
    ("\n--- MODEL PERFORMANCE REPORT (Random Forest Classifier) ---", y_pred),
    ("\n---XGBoost MODEL PERFORMANCE REPORT ---", y_pred_xgb),
)

# Print one classification report per model, Random Forest first.
for banner, predictions in reports:
    print(banner)
    print(classification_report(y_test, predictions, target_names=class_labels))

In [None]:
# Importance of each one-hot encoded feature according to the fitted XGBoost model.
feature_importances_xgb = pd.Series(xgb_model.feature_importances_, index=X_train.columns)

# nlargest already returns the values in descending order, so no extra sort is needed.
top_10_features_xgb = feature_importances_xgb.nlargest(10)

print("\nTop 10 Feature Importances")
try:
    # to_markdown relies on the optional `tabulate` package.
    print(top_10_features_xgb.to_markdown(numalign="left", stralign="left"))
except ImportError:
    # Plain-text fallback so the cell still runs where `tabulate` is not installed.
    print(top_10_features_xgb.to_string())


# Summary
The pipeline took raw Spotify data and performed four key steps:
**Feature Engineering:** Calculated Track Age and grouped Artist Genres to prepare the data.

**Target Definition:** Created a binary target, classifying tracks as a "Hit" (Popularity $> 75$) or "Non-Hit".

**Modeling & Optimization:** Compared high-performance ensemble models (Random Forest and XGBoost), using weighted methods to handle the scarcity of "Hit" tracks.

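**Key Findings:** The models achieved high overall accuracy ($\approx 84\%$), and the feature importances point to Artist Followers, Artist Popularity, and Track Duration as the strongest predictors of success. Because "Hit" tracks are rare, overall accuracy should be read together with the precision and recall reported for the "High Popularity" class.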

The pipeline is highly valuable for those who are studying Data Science, Machine Learning, and Business Analytics, particularly those focusing on the music or media industries.