# Data Preprocessing Notebook

This notebook shows the preprocessing steps taken to create a 2-million-song Spotify dataset from three Kaggle sources.

#### Datasets

All datasets can be found in the `data` folder:

- https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks (original_spotify_data/1mio_dataset)
- https://www.kaggle.com/datasets/tonygordonjr/spotify-dataset-2023?select=spotify-albums_data_2023.csv (original_spotify_data/2023_dataset)
- https://www.kaggle.com/datasets/yamaerenay/spotify-dataset-19212020-600k-tracks?select=tracks.csv (original_spotify_data/600k_dataset)

# Loading all Datasets

## 1 Million Songs Dataset
https://www.kaggle.com/datasets/amitanshjoshi/spotify-1million-tracks

In [None]:
from custom_utils import load_and_concatenate_parquet_files
# Load the 1M-tracks dataset via the project helper (presumably reads and
# concatenates every parquet file in the folder -- see custom_utils) and inspect it.
one_mio_dataset = load_and_concatenate_parquet_files("data/original_spotify_data/1mio_dataset")
print(one_mio_dataset.shape)
display(one_mio_dataset.head())
print(one_mio_dataset.isna().sum())
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
one_mio_dataset.info()

In [2]:
# Remove the "Unnamed: 0" column (presumably a leftover exported index -- confirm).
one_mio_dataset = one_mio_dataset.drop("Unnamed: 0", axis="columns")

In [None]:
# Discard every row containing at least one missing value, then confirm that
# no nulls remain and report the new shape.
one_mio_dataset = one_mio_dataset.dropna(axis=0, how="any")
print(one_mio_dataset.isna().sum())
print(one_mio_dataset.shape)

## 600k Tracks Dataset
https://www.kaggle.com/datasets/yamaerenay/spotify-dataset-19212020-600k-tracks?select=tracks.csv

In [None]:
from custom_utils import load_and_concatenate_parquet_files
# Load the 600k-tracks dataset via the project helper (presumably reads and
# concatenates every parquet file in the folder -- see custom_utils) and inspect it.
half_mio_dataset = load_and_concatenate_parquet_files("data/original_spotify_data/600k_dataset")
print(half_mio_dataset.shape)
display(half_mio_dataset.head())
print(half_mio_dataset.isna().sum())
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
half_mio_dataset.info()

In [None]:
# Discard every row containing at least one missing value, then confirm that
# no nulls remain and report the new shape.
half_mio_dataset = half_mio_dataset.dropna(axis=0, how="any")
print(half_mio_dataset.isna().sum())
print(half_mio_dataset.shape)

In [None]:
# Show one row of each frame so their column layouts can be compared.
display(one_mio_dataset.head(1), half_mio_dataset.head(1))

In [None]:
# Rename the identifier and title columns to the names used by the 1M dataset.
half_mio_dataset = half_mio_dataset.rename(
    {"id": "track_id", "name": "track_name"},
    axis="columns",
)
display(half_mio_dataset.head(1))

In [None]:
# Remove the two columns that are not carried over into the combined schema.
half_mio_dataset = half_mio_dataset.drop(["explicit", "id_artists"], axis="columns")
display(half_mio_dataset.head(1))

In [None]:
import numpy as np
import re
def extract_year(date_str):
    """Return the leading four-digit year of a date-like value.

    The value is converted to ``str`` and stripped of surrounding whitespace;
    if it starts with four digits they are returned as an ``int``, otherwise
    ``np.nan`` is returned.
    """
    text = str(date_str).strip()
    leading_digits = re.match(r'^(\d{4})', text)
    return int(leading_digits.group(1)) if leading_digits else np.nan
    
# Derive the release year, show it next to the raw date for a visual check,
# then discard the raw date column.
half_mio_dataset['year'] = half_mio_dataset['release_date'].map(extract_year)
display(half_mio_dataset.loc[:, ["year", "release_date"]].head(5))
half_mio_dataset = half_mio_dataset.drop("release_date", axis="columns")

In [None]:
import pandas as pd
import numpy as np
import re
def extract_first_artist(artists_str):
    """Return the first artist from a stringified list such as "['A', 'B']".

    The value is parsed as a Python literal first, so names that contain
    apostrophes or commas (e.g. "Guns N' Roses", "Earth, Wind & Fire") are
    returned intact. Input that is not a valid literal falls back to stripping
    brackets and single quotes and splitting on commas.

    Args:
        artists_str: String representation of a list of artist names.

    Returns:
        The first artist name with surrounding whitespace removed, or
        ``np.nan`` when the value is not a string or contains no artist.
    """
    import ast  # local import: the surrounding cell does not import ast

    if not isinstance(artists_str, str):
        # Missing values (e.g. float NaN) cannot be parsed; report them as missing.
        return np.nan

    try:
        parsed = ast.literal_eval(artists_str.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        candidates = [str(name).strip() for name in parsed]
    elif isinstance(parsed, str):
        candidates = [parsed.strip()]
    else:
        # Not a list/str literal: keep the legacy best-effort parsing.
        clean_str = re.sub(r'[\[\]\']', '', artists_str).strip()
        candidates = [name.strip() for name in clean_str.split(',')]

    first = candidates[0] if candidates else ""
    # An empty name carries no information, so it is reported as missing.
    return first if first else np.nan

# Derive the primary artist, show it next to the raw list for a visual check,
# then discard the raw artists column.
half_mio_dataset['artist_name'] = half_mio_dataset['artists'].map(extract_first_artist)
display(half_mio_dataset.loc[:, ["artist_name", "artists"]].head(5))
half_mio_dataset = half_mio_dataset.drop("artists", axis="columns")

In [11]:
# Add an empty genre column, then select the columns in the same order as the
# 1M dataset so the frames can be concatenated.
half_mio_dataset = half_mio_dataset.assign(genre=None).loc[:, one_mio_dataset.columns]

In [None]:
# Show one row of each frame to verify that the column layouts now match.
display(half_mio_dataset.head(1), one_mio_dataset.head(1))

## 2023 Dataset
https://www.kaggle.com/datasets/tonygordonjr/spotify-dataset-2023?select=spotify-albums_data_2023.csv

In [None]:
from custom_utils import load_and_concatenate_parquet_files
# Load the 2023 dataset via the project helper (presumably reads and
# concatenates every parquet file in the folder -- see custom_utils) and inspect it.
newest_dataset = load_and_concatenate_parquet_files("data/original_spotify_data/2023_dataset")
print(newest_dataset.shape)
display(newest_dataset.head())
print(newest_dataset.isna().sum())
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
newest_dataset.info()

In [None]:
# Keep only the columns that have a counterpart in the combined schema.
newest_dataset = newest_dataset.loc[
    :,
    [
        "artist_0",
        "track_name",
        "track_id",
        "release_year",
        "track_popularity",
        "danceability",
        "energy",
        "key",
        "loudness",
        "mode",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "duration_ms",
        "time_signature",
        "genre_0",
    ],
]
display(newest_dataset.head(1))

In [None]:
# Rename the 2023 columns to the names used by the 1M dataset.
newest_dataset = newest_dataset.rename(
    {
        "artist_0": "artist_name",
        "release_year": "year",
        "genre_0": "genre",
        "track_popularity": "popularity",
    },
    axis="columns",
)
display(newest_dataset.head(1))

In [None]:
# Frequency of each genre label in the 2023 data.
print(newest_dataset.loc[:, "genre"].value_counts())

In [17]:
# Remove the genre column from the 2023 frame.
newest_dataset = newest_dataset.drop("genre", axis="columns")

In [None]:
# Report the per-column null counts, then discard every row that has any null.
print(newest_dataset.isna().sum())
newest_dataset = newest_dataset.dropna(axis=0, how="any")

In [19]:
# Add an empty genre column and cast the listed columns to int in one pass.
newest_dataset = newest_dataset.assign(genre=None).astype(
    {
        "year": int,
        "time_signature": int,
        "duration_ms": int,
        "key": int,
        "mode": int,
        "popularity": int,
    }
)

In [None]:
# Put the 2023 columns in the same order as the 1M dataset, then show one row of
# each of the three frames to verify that their layouts match.
newest_dataset = newest_dataset.loc[:, one_mio_dataset.columns]
display(
    newest_dataset.head(1),
    one_mio_dataset.head(1),
    half_mio_dataset.head(1),
)

## Combining all datasets

In [None]:
# Stack the three aligned frames. ignore_index=True already produces a fresh
# 0..n-1 index, so no additional reset_index() call is needed.
combined_dataset = pd.concat([one_mio_dataset, half_mio_dataset, newest_dataset], ignore_index=True)
display(combined_dataset.head(5))
print(combined_dataset.shape)
print(combined_dataset.isna().sum())
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
combined_dataset.info()

### Removing duplicate songs

In [None]:
# Print the shape before and after three successive de-duplication passes; each
# pass keeps the first occurrence. The passes run in the original order because
# an earlier pass can change which rows a later one sees.
print(combined_dataset.shape)
combined_dataset = (
    combined_dataset
    # 1) same title, artist and audio features
    .drop_duplicates(subset=["track_name", "artist_name", "danceability", "energy", "key", "valence", "tempo", "time_signature"])
    # 2) same track id
    .drop_duplicates(subset=["track_id"])
    # 3) same title and artist
    .drop_duplicates(subset=["track_name", "artist_name"])
)
print(combined_dataset.shape)

In [23]:
from custom_utils import save_dataframe_as_parquet
# save_dataframe_as_parquet(combined_dataset, folder_path="data", folder_name="preprocessed_spotify_data", always_overwrite=True)