# Exploratory Data Analysis of Spotify Top 50 Tracks of 2020


## Imports


### Importing Python modules


In [None]:
import pandas as pd
import zipfile
import subprocess

### Downloading compressed dataset of Spotify Top 50 Tracks (2020)


In [None]:
# Kaggle CLI invocation used to download the dataset archive.
COMMAND_TO_DOWNLOAD_DATASET_OF_TOP_50_SPOTIFY_TRACKS_2020 = (
    "kaggle datasets download --force -d atillacolak/top-50-spotify-tracks-2020"
)

# The command contains no quoting or shell syntax, so a plain whitespace split
# yields the argument vector and no intermediate shell has to be spawned.
# `check=True` raises CalledProcessError when the CLI exits with a non-zero status.
subprocess.run(
    COMMAND_TO_DOWNLOAD_DATASET_OF_TOP_50_SPOTIFY_TRACKS_2020.split(),
    check=True,
)

#### Extracting compressed dataset of Spotify Top 50 Tracks (2020)


In [None]:
ZIPPED_DATASET_OF_SPOTIFY_TOP_50_TRACKS_2020 = "top-50-spotify-tracks-2020.zip"

# Unpack every archive member into the current working directory.
with zipfile.ZipFile(
    ZIPPED_DATASET_OF_SPOTIFY_TOP_50_TRACKS_2020, mode="r"
) as dataset_archive:
    dataset_archive.extractall(path=".")
    print("Zipped files extraction has been successful.")

### Importing dataset of Spotify Top 50 Tracks (2020) into memory as a pandas DataFrame


In [47]:
DATASET_OF_SPOTIFY_TOP_50_TRACKS_2020 = "spotifytoptracks.csv"

# Columns grouped by the dtype they are parsed as.
category_columns = ["artist", "album", "track_name", "genre"]
float_columns = [
    "energy",
    "danceability",
    "key",
    "loudness",
    "acousticness",
    "speechiness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

# Column name -> dtype mapping handed to the CSV parser.
column_dtypes = {
    **dict.fromkeys(category_columns, "category"),
    "track_id": "string",
    **dict.fromkeys(float_columns, "Float32"),
    "duration_ms": "Int32",
}

# The first CSV column becomes the index; the first row supplies the header.
spotify_top_50_tracks_df = pd.read_csv(
    DATASET_OF_SPOTIFY_TOP_50_TRACKS_2020,
    sep=",",
    header=0,
    index_col=0,
    encoding="utf-8",
    dtype=column_dtypes,
)

### Adding additional 'duration' column


In [None]:
def ms_to_min_sec(ms) -> str:
    """
    Format a duration given in milliseconds as a zero-padded "MM:SS" string.

    Fractions of a second are truncated. Minutes are not wrapped at 60, so a
    duration of one hour is rendered as "60:00" rather than "00:00".
    """
    # Integer arithmetic avoids float rounding on the way to whole seconds.
    total_seconds = int(ms) // 1000
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes:02}:{seconds:02}"


# Derive a formatted companion column from the raw millisecond durations.
track_durations_ms = spotify_top_50_tracks_df["duration_ms"]
spotify_top_50_tracks_df["duration"] = track_durations_ms.apply(ms_to_min_sec)

## Data Cleaning


### Handling missing values


#### Checking whether empty values exist


In [None]:
# Per-column flag: True when the column holds at least one missing value.
columns_with_missing_values = spotify_top_50_tracks_df.isna().any()

columns_with_missing_values.rename("missing_values_exist_in_columns").rename_axis(
    "df_columns"
)

**_There are no missing values in 'spotify_top_50_tracks_df' data._**


### Removing duplicate samples and features


#### Checking whether duplicate features exist


In [None]:
# True when at least one column label occurs more than once.
spotify_top_50_tracks_df.columns.duplicated().any()

**_There are no duplicate features in 'spotify_top_50_tracks_df' data._**


#### Checking whether duplicate samples exist


In [None]:
# A sample counts as a duplicate when its 'track_id' has already been seen.
spotify_top_50_tracks_df["track_id"].duplicated().any()

**_There are no duplicate samples in 'spotify_top_50_tracks_df' data._**


### Treating the outliers


#### Checking whether numerical features contain negative numbers


In [None]:
numerical_columns = list(spotify_top_50_tracks_df.select_dtypes("number").columns)

# Flag every numeric column that holds at least one value below zero.
(
    spotify_top_50_tracks_df[numerical_columns]
    .lt(0)
    .any()
    .rename("negative_numerical_values_exist")
    .rename_axis("df_columns")
)

**_There are no negative values in 'spotify_top_50_tracks_df' data except in the 'loudness' column, which is expected._**


In [None]:
# True only when every 'loudness' value is below zero.
spotify_top_50_tracks_df["loudness"].lt(0).all()

**_All the 'loudness' column values are negative in 'spotify_top_50_tracks_df' data, which is expected, because negative values indicate how many decibels below the maximum loudness the track is._**


#### Checking whether normalized features are truly scaled between 0 and 1


In [None]:
normalized_columns = [
    "energy",
    "danceability",
    "acousticness",
    "speechiness",
    "instrumentalness",
    "liveness",
    "valence",
]

normalized_features_df = spotify_top_50_tracks_df[normalized_columns]

# A column passes only when all of its values lie in the closed interval [0, 1].
within_unit_interval = (normalized_features_df >= 0) & (normalized_features_df <= 1)

within_unit_interval.all().rename("feature_values_are_normalized").rename_axis(
    "df_columns"
)

**_All normalized features contain values between 0 and 1_**


## Data Analysis


### Dataset Overview


#### How many observations are there in this dataset?


In [None]:
# Each row of the DataFrame is one observation.
num_of_observations = len(spotify_top_50_tracks_df)

print(f"Number of observations: {num_of_observations}")

#### How many features does this dataset have?


In [None]:
# Each column of the DataFrame is one feature.
num_of_features = len(spotify_top_50_tracks_df.columns)

print(f"Number of features: {num_of_features}")

#### Which of the features are categorical?


In [None]:
# Names of all columns stored with the pandas 'category' dtype.
categorical_features = list(
    spotify_top_50_tracks_df.select_dtypes("category").columns
)

# One bullet per feature, placed under the heading line.
print("\n • ".join(["Categorical features:", *categorical_features]))

#### Which of the features are numeric?


In [None]:
# Names of all columns with a numeric dtype.
numerical_features = list(spotify_top_50_tracks_df.select_dtypes("number").columns)

# One bullet per feature, placed under the heading line.
print("\n • ".join(["Numerical features:", *numerical_features]))

### Artists


#### Are there any artists that have more than 1 popular track? If yes, which and how many?


In [None]:
# Number of tracks per artist; only artists present in the data are kept.
artist_track_counts = spotify_top_50_tracks_df.groupby("artist", observed=True)[
    ["track_name"]
].count()

# Keep artists with two or more tracks and expose the count as 'track_count'.
artists_with_multiple_tracks = (
    artist_track_counts.loc[lambda counts: counts["track_name"] > 1]
    .rename(columns={"track_name": "track_count"})
    .reset_index()
)

print("Artists having more than 1 track in Spotify's Top 50 (2020):")

artists_with_multiple_tracks

#### Who was the most popular artist?


In [None]:
def find_most_popular_artists(
    artists_df: pd.DataFrame, track_count_col: str, artist_col: str
):
    """
    List the artists whose track count equals the highest count in the frame.

    All artists tied for the maximum are returned, in frame order.
    `track_count_col` names the column holding the counts and `artist_col`
    the column holding the artist names.
    """
    track_counts = artists_df[track_count_col]
    holds_top_count = track_counts.eq(track_counts.max())
    return artists_df[artist_col][holds_top_count].tolist()


most_popular_artists_list = find_most_popular_artists(
    artists_df=artists_with_multiple_tracks,
    track_count_col="track_count",
    artist_col="artist",
)

# Several artists can be tied, so the names are shown comma-separated.
most_popular_artists_text = ", ".join(most_popular_artists_list)
print("Most popular artist(s) in Spotify's Top 50 (2020):", most_popular_artists_text)

#### How many artists in total have their songs in the top 50?


In [None]:
# Total number of distinct artists in the dataset, which is what the heading asks.
num_of_artists = spotify_top_50_tracks_df["artist"].nunique()

# Kept as additional context: artists appearing with more than one track.
num_of_artists_with_multiple_tracks = artists_with_multiple_tracks["artist"].count()

print("Count of artists in Spotify's Top 50 (2020):", num_of_artists)
print(
    "Count of artists with multiple tracks in Spotify's Top 50 (2020):",
    num_of_artists_with_multiple_tracks,
)

### Albums


#### Are there any albums that have more than 1 popular track? If yes, which and how many?


In [None]:
# Number of tracks per album; only albums present in the data are kept.
album_track_counts = spotify_top_50_tracks_df.groupby("album", observed=True)[
    ["track_name"]
].count()

# Keep albums that contribute two or more tracks.
albums_with_multiple_tracks = album_track_counts.loc[
    lambda counts: counts["track_name"] > 1
].reset_index()

print("Albums having more than 1 track in Spotify's Top 50 (2020):")

albums_with_multiple_tracks

#### How many albums in total have their songs in the top 50?


In [None]:
# Total number of distinct albums in the dataset, which is what the heading asks.
num_of_albums = spotify_top_50_tracks_df["album"].nunique()

# Kept as additional context: albums appearing with more than one track.
num_of_albums_with_multiple_tracks = albums_with_multiple_tracks["album"].count()

print("Count of albums in Spotify's Top 50 (2020):", num_of_albums)
print(
    "Count of albums with multiple tracks in Spotify's Top 50 (2020):",
    num_of_albums_with_multiple_tracks,
)

### Tracks


#### Which tracks have a danceability score above 0.7?


In [None]:
print(
    "Tracks with danceability score above 0.7 in Spotify's Top 50 (2020):",
)

# Sort first and renumber afterwards (`ignore_index=True`) so the displayed
# index runs 0..n-1 in sorted order instead of being shuffled by the sort.
spotify_top_50_tracks_df.loc[
    spotify_top_50_tracks_df["danceability"] > 0.7, ["track_name", "danceability"]
].sort_values("danceability", ignore_index=True)

#### Which tracks have a danceability score below 0.4?


In [None]:
print("Tracks with danceability score below 0.4 in Spotify's Top 50 (2020):")

# Show only the track name next to the score that was filtered on.
spotify_top_50_tracks_df.loc[
    spotify_top_50_tracks_df["danceability"].lt(0.4),
    ["track_name", "danceability"],
]

#### Which tracks have their loudness above -5?


In [None]:
print("Tracks with loudness above -5 in Spotify's Top 50 (2020):")

# Show only the track name next to the loudness that was filtered on.
spotify_top_50_tracks_df.loc[
    spotify_top_50_tracks_df["loudness"].gt(-5),
    ["track_name", "loudness"],
]

#### Which tracks have their loudness below -8?


In [None]:
print("Tracks with loudness below -8 in Spotify's Top 50 (2020):")

# Show only the track name next to the loudness that was filtered on.
spotify_top_50_tracks_df.loc[
    spotify_top_50_tracks_df["loudness"].lt(-8),
    ["track_name", "loudness"],
]

#### Which track is the shortest?


In [None]:
min_duration = spotify_top_50_tracks_df["duration_ms"].min()

# Every track tied for the minimum duration is selected.
is_shortest_track = spotify_top_50_tracks_df["duration_ms"].eq(min_duration)
shortest_track_df = spotify_top_50_tracks_df.loc[
    is_shortest_track, ["track_name", "duration_ms"]
]

print("The shortest track in Spotify's Top 50 (2020):")
shortest_track_df

#### Which track is the longest?


In [None]:
max_duration = spotify_top_50_tracks_df["duration_ms"].max()

# Every track tied for the maximum duration is selected.
is_longest_track = spotify_top_50_tracks_df["duration_ms"].eq(max_duration)
longest_track_df = spotify_top_50_tracks_df.loc[
    is_longest_track, ["track_name", "duration_ms"]
]

print("The longest track in Spotify's Top 50 (2020):")
longest_track_df

### Genres


#### Which genre is the most popular?


In [None]:
# Number of tracks per genre.
most_popular_genre = spotify_top_50_tracks_df["genre"].value_counts()

# Heading and Series are printed separately: combining them in one print call
# inserts the default " " separator after the newline and shifts the first row.
print("The most popular genre in Spotify's Top 50 (2020):")
print(most_popular_genre.nlargest(3))

#### Which genres have just one song on the top 50?


In [None]:
# Name both output columns explicitly: the labels produced by
# `value_counts().reset_index()` differ between pandas major versions, and the
# filter below relies on a column called "count".
genres_count = (
    spotify_top_50_tracks_df["genre"]
    .value_counts()
    .rename_axis("genre")
    .reset_index(name="count")
)

print("The genres having a single song in Spotify's Top 50 (2020):")

genres_count[genres_count["count"] == 1]

#### How many genres in total are represented in the top 50?


In [None]:
# 'genres_count' holds one row per genre, so counting its rows counts the genres.
num_of_separate_genres = genres_count["genre"].count()

print(
    f"Number of different genres in Spotify's Top 50 (2020): {num_of_separate_genres}"
)

### Feature scores compared by genres


**_Function to compare feature scores of specified genres for 3 tasks/questions below._**


In [None]:
GENRES_OF_INTEREST = ["Pop", "Hip-Hop/Rap", "Dance/Electronic", "Alternative/Indie"]


def compare_feature_scores_of_genres(
    genres_df: pd.DataFrame,
    feature_score_col: str,
    genres_of_interest: list = GENRES_OF_INTEREST,
) -> pd.DataFrame:
    """
    Rank the selected genres by the mean of one feature column.

    Rows whose 'genre' is not listed in `genres_of_interest` are ignored. The
    result has one row per remaining genre with the columns 'genre' and
    '<feature_score_col>_mean', ordered from the highest mean to the lowest.
    A heading naming the compared feature is printed as a side effect.
    """
    mean_col = f"{feature_score_col}_mean"

    is_selected_genre = genres_df["genre"].isin(genres_of_interest)
    selected_rows = genres_df.loc[is_selected_genre, ["genre", feature_score_col]]

    print(f"Comparison of genres by '{feature_score_col}' score:")

    # Named aggregation: the output column is called '<feature>_mean'.
    mean_per_genre = selected_rows.groupby(by="genre", observed=True).agg(
        **{mean_col: (feature_score_col, "mean")}
    )
    ranked_genres = mean_per_genre.sort_values(by=mean_col, ascending=False)
    return ranked_genres.reset_index()

#### How does the danceability score compare between Pop, Hip-Hop/Rap, Dance/Electronic, and Alternative/Indie genres?


In [None]:
# Mean danceability per genre of interest, highest first.
compare_feature_scores_of_genres(
    genres_df=spotify_top_50_tracks_df, feature_score_col="danceability"
)

#### How does the loudness score compare between Pop, Hip-Hop/Rap, Dance/Electronic, and Alternative/Indie genres?


In [None]:
# Mean loudness per genre of interest, highest first.
compare_feature_scores_of_genres(
    genres_df=spotify_top_50_tracks_df, feature_score_col="loudness"
)

#### How does the acousticness score compare between Pop, Hip-Hop/Rap, Dance/Electronic, and Alternative/Indie genres?


In [None]:
# Mean acousticness per genre of interest, highest first.
compare_feature_scores_of_genres(
    genres_df=spotify_top_50_tracks_df, feature_score_col="acousticness"
)

### Correlations


**_Function to find correlations between features of the same dataset._**


In [64]:
def find_correlations_between_features(
    df: pd.DataFrame, correlation_type: str
) -> pd.DataFrame:
    """
    Return the pairwise Pearson correlations of `df` that fall into one strength band.

    The bands are contiguous on the absolute coefficient |r|, so every coefficient
    belongs to exactly one of them:

        not correlated   |r| < 0.01
        weak             0.01 <= |r| < 0.30
        moderate         0.30 <= |r| < 0.50
        strong           0.50 <= |r|

    "positive" / "negative" variants additionally require r > 0 / r < 0.
    The bands are simplified and do not distinguish 'very weak', 'very strong'
    or 'near perfect' correlations.

    A feature's correlation with itself is always excluded. Only numeric columns
    take part. Coefficients outside the requested band are NaN in the result, and
    rows and columns that end up entirely NaN are dropped.

    Raises:
        ValueError: if `correlation_type` is not one of the supported names.

    Reference: https://www.semanticscholar.org/paper/Optimizing-Threshold-using-Pearson-Correlation-for-Sabilla-Sarno/68d22a340cec834e3d76b969b46012503abb3b08
    """
    # Upper edge of the "not correlated" band and lower edge of the "weak" band.
    not_correlated_limit = 0.01
    # Half-open [lower, upper) ranges on |r|; "strong" has no upper limit so that
    # perfectly correlated feature pairs are kept.
    magnitude_bands = {
        "weak": (not_correlated_limit, 0.30),
        "moderate": (0.30, 0.50),
        "strong": (0.50, float("inf")),
    }
    valid_types = {"not correlated"} | {
        f"{strength} {direction}"
        for strength in magnitude_bands
        for direction in ("negative", "positive")
    }

    if correlation_type not in valid_types:
        raise ValueError(
            "Invalid correlation type. Choose from 'strong negative', 'moderate negative', 'weak negative', 'not correlated', 'weak positive', 'moderate positive', or 'strong positive'."
        )

    correlation_matrix = df.corr(method="pearson", numeric_only=True)

    # Blank out the diagonal explicitly: self-correlation carries no information
    # and would otherwise always show up as a "strong positive" match.
    for position in range(len(correlation_matrix)):
        correlation_matrix.iloc[position, position] = float("nan")

    magnitudes = correlation_matrix.abs()
    if correlation_type == "not correlated":
        selected = magnitudes < not_correlated_limit
    else:
        strength, direction = correlation_type.split()
        lower, upper = magnitude_bands[strength]
        in_band = (magnitudes >= lower) & (magnitudes < upper)
        if direction == "positive":
            has_requested_sign = correlation_matrix > 0
        else:
            has_requested_sign = correlation_matrix < 0
        selected = in_band & has_requested_sign

    filtered_correlations = correlation_matrix[selected]
    return filtered_correlations.dropna(how="all").dropna(axis=1, how="all")

#### Which features are strongly positively correlated?


In [None]:
find_correlations_between_features(
    df=spotify_top_50_tracks_df, correlation_type="strong positive"
)

#### Which features are strongly negatively correlated?


In [None]:
find_correlations_between_features(
    df=spotify_top_50_tracks_df, correlation_type="strong negative"
)

#### Which features are not correlated?


In [None]:
find_correlations_between_features(
    df=spotify_top_50_tracks_df, correlation_type="not correlated"
)

## Suggestions for data analysis improvement


- The code could be more uniform; some code blocks contain repeated logic that could be extracted into functions for better reusability and maintainability.
- Some more descriptive statistics (std, median, quantiles) for numerical features could be added to improve the EDA.
- Data visualization could be added to improve the readability and insights of the EDA.
