# Spotify Genre Classifier

## Setup

In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from os import walk, path

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from scipy import stats

## Loading the data

In [None]:
# load data

# Names of the files directly inside the "data" directory. Only the first
# entry yielded by walk() is used, so sub-directories are not descended into;
# a missing directory yields an empty list.
# walk() reports directory entries in arbitrary order, so the names are sorted
# to give raw_data the same row order on every machine. The seeded train/test
# split further down depends on that row order.
f = sorted(next(walk("data"), ("data", [], []))[2])

# Read every file as JSON into its own DataFrame.
frames = [pd.read_json(path.join("data", file)) for file in f]

# Concatenate all frames into one DataFrame with a fresh 0..n-1 index.
raw_data = pd.concat(frames, ignore_index=True)

## Visualization

## Preprocessing

In [None]:
# Columns handled as numeric features.
numeric_features = [
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
]

# Columns handled as categorical features.
categorical_features = [
    "key",
    "mode",
    "time_signature",
]

# Every feature column: numeric ones first, categorical ones after.
features = [*numeric_features, *categorical_features]

Get the maximum and minimum data value within the boxplot whiskers

In [None]:
# Multiple of the inter-quartile range added above Q3 / subtracted below Q1
# to obtain the whisker limits.
iqr_factor = 1.5


def getQuartiles(data: pd.Series) -> tuple:
    """Return the quartiles of ``data`` as the tuple ``(Q1, Q3, IQR)``.

    Q1 and Q3 are the 0.25 and 0.75 quantiles as computed by
    ``data.quantile``; IQR is their difference ``Q3 - Q1``.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    return Q1, Q3, Q3 - Q1


def getMaxWhiskerValue(data: pd.Series) -> float:
    """Return the upper whisker limit ``Q3 + iqr_factor * IQR`` of ``data``."""
    _, Q3, IQR = getQuartiles(data)
    return Q3 + (IQR * iqr_factor)


def getMinWhiskerValue(data: pd.Series) -> float:
    """Return the lower whisker limit ``Q1 - iqr_factor * IQR`` of ``data``."""
    Q1, _, IQR = getQuartiles(data)
    return Q1 - (IQR * iqr_factor)

Find correlated features where the correlation coefficient is above a specific threshold

In [None]:
def find_correlated_features(data: pd.DataFrame, threshold: float) -> list:
    """Return the names of columns to drop because of high pairwise correlation.

    For every pair of columns whose absolute correlation (``data.corr()``)
    is strictly greater than ``threshold``, one column of the pair is marked:
    the one with the higher mean absolute correlation to all columns, or the
    second column of the pair when the first is not strictly higher.

    Parameters:
        data: frame whose columns are compared.
        threshold: absolute correlation above which a pair counts as correlated.

    Returns:
        Column names to drop, without duplicates, in the column order of the
        correlation matrix.
    """
    correlation_matrix = data.corr().abs()
    avg_correlation = correlation_matrix.mean(axis=1).to_numpy()

    # Keep only the strict upper triangle so every unordered pair is looked at
    # once and the diagonal (self-correlation) is ignored. NaN entries compare
    # as False and are therefore never flagged.
    above_threshold = np.triu(correlation_matrix.to_numpy() > threshold, k=1)

    drop = set()
    for row, col in zip(*np.nonzero(above_threshold)):
        if avg_correlation[row] > avg_correlation[col]:
            drop.add(row)
        else:
            drop.add(col)

    # Positions refer to the correlation matrix, so names are taken from its
    # columns; sorting makes the result order deterministic.
    return [correlation_matrix.columns[position] for position in sorted(drop)]

Remove samples with key == -1

In [None]:
def removeSamplesWithInvalidKey(data: pd.DataFrame) -> None:
    """Remove, in place, every row of ``data`` whose ``key`` equals -1.

    The function returns nothing, so the rows are dropped from the frame
    passed in; rebinding a local variable would leave the caller's frame
    unchanged. Rows are dropped by index label, so the index is expected to
    be unique. The number of removed rows is printed.
    """
    old_len = len(data)
    invalid_rows = data.index[data.key == -1]
    data.drop(index=invalid_rows, inplace=True)
    print("Samples removed because of invalid key:", old_len - len(data))

Remove samples whose time_signature lies outside the valid range [3, 7]

In [None]:
def removeSamplesWithInvalidTimeSignature(data: pd.DataFrame) -> None:
    """Remove, in place, rows whose ``time_signature`` is not within 3..7.

    Both bounds are inclusive. Rows with a missing ``time_signature`` fail
    the range check and are removed as well. The function returns nothing,
    so the rows are dropped from the frame passed in. Rows are dropped by
    index label, so the index is expected to be unique. The number of
    removed rows is printed.
    """
    old_len = len(data)
    is_valid = (data.time_signature >= 3) & (data.time_signature <= 7)
    data.drop(index=data.index[~is_valid], inplace=True)
    print("Samples removed because of invalid time_signature:", old_len - len(data))

Create a dictionary which contains **standard deviation, mean, max and min value within whiskers for every feature**

This dictionary is used to preprocess the test data in the same way as the training data.

Encode categorical features (key & time_signature)


In [None]:
# Categorical columns that are replaced by one indicator column per category.
one_hot_encoded_features = ["key", "time_signature"]

# NOTE(review): the functions below read the module-level names
# `preprocessing_one_hot_encoder` and `preprocessing_categorical_features`,
# which are not defined in the visible part of this notebook - confirm they
# are created elsewhere before calling these functions.


def _oneHotEncodeColumns(data: pd.DataFrame, fit: bool) -> tuple:
    """One-hot encode ``one_hot_encoded_features`` of ``data``.

    Uses the module-level ``preprocessing_one_hot_encoder``; it is fitted
    first when ``fit`` is true and only applied otherwise.

    Side effect: the original categorical columns are dropped from ``data``
    in place.

    Returns:
        ``(encoded_frame, encoded_column_names)`` where ``encoded_frame`` is
        a new frame holding the remaining columns followed by the integer
        indicator columns.
    """
    encoder = preprocessing_one_hot_encoder
    source = pd.DataFrame(data[one_hot_encoded_features])
    encoded = encoder.fit_transform(source) if fit else encoder.transform(source)

    enc_df = pd.DataFrame(
        encoded.toarray(),
        columns=encoder.get_feature_names_out(one_hot_encoded_features),
        dtype=int,
        index=data.index,
    )
    data.drop(one_hot_encoded_features, axis="columns", inplace=True)

    return pd.concat([data, enc_df], axis="columns"), list(enc_df.columns)


def encodeCategoricalFeatures(data: pd.DataFrame) -> pd.DataFrame:
    """Fit the one-hot encoder on ``data`` and return the encoded frame.

    Also rebinds the module-level ``preprocessing_categorical_features``:
    the encoded source columns are removed from it and the generated
    indicator column names are appended.
    """
    global preprocessing_categorical_features

    encoded_data, encoded_columns = _oneHotEncodeColumns(data, fit=True)

    # Swap the original categorical names for the generated indicator names.
    remaining = [e for e in preprocessing_categorical_features if e not in one_hot_encoded_features]
    preprocessing_categorical_features = remaining + encoded_columns

    return encoded_data


def encodeCategoricalFeaturesForTestSet(data: pd.DataFrame) -> pd.DataFrame:
    """Apply the already fitted one-hot encoder to ``data``.

    The encoder is not re-fitted and the module-level feature list is left
    untouched.
    """
    encoded_data, _ = _oneHotEncodeColumns(data, fit=False)
    return encoded_data

**Perform all preprocessing steps on the training data**

In [None]:
def preprocessTrainingData(data: pd.DataFrame):
    """Fit all preprocessing steps on the training data and apply them.

    Steps, in order:
      1. Call the two invalid-sample helpers on ``data``. Their return value
         is not used, so any filtering they perform has to happen in place.
      2. Find numeric features correlated above 0.8 and drop them from
         ``data`` in place (the caller's frame loses these columns).
      3. Standard-scale the remaining numeric features and one-hot encode
         ``key`` and ``time_signature``; every other column is passed through.
      4. Clip each scaled numeric feature to its boxplot whisker limits.

    Returns:
        A 3-tuple ``(features_frame, playlist_ids, test_interface)``:
        the preprocessed feature columns, the ``playlist_id`` column of the
        transformed frame, and a dict holding the fitted transformer, the
        feature-name lists and the whisker limits needed to preprocess
        other data the same way.
    """
    # Copies, so the module-level lists can never be modified through these names.
    preprocessing_numeric_features = list(numeric_features)
    preprocessing_categorical_features = list(categorical_features)
    preprocessing_one_hot_encoded_features = ["key", "time_signature"]

    removeSamplesWithInvalidKey(data)
    removeSamplesWithInvalidTimeSignature(data)

    preprocessing_correlated_features = find_correlated_features(data[preprocessing_numeric_features], .8)
    print(f'Drop these correlated features: {preprocessing_correlated_features}')

    data.drop(preprocessing_correlated_features, axis=1, inplace=True)
    preprocessing_numeric_features = [e for e in preprocessing_numeric_features if e not in preprocessing_correlated_features]

    # sparse_threshold=0 makes the transformer always return a dense array,
    # which the DataFrame construction below relies on.
    preprocessing_column_transformer = ColumnTransformer([
        ("scaling", StandardScaler(), preprocessing_numeric_features),
        ("one-hot-encoding", OneHotEncoder(), preprocessing_one_hot_encoded_features)
    ], verbose=True, remainder='passthrough', sparse_threshold=0)

    transformed = preprocessing_column_transformer.fit_transform(data)

    # Columns that went through neither transformer, in their original order.
    feature_remainder = [e for e in data.columns if e not in preprocessing_numeric_features and e not in preprocessing_one_hot_encoded_features]

    # Replace the encoded source columns by the generated indicator names.
    preprocessing_categorical_features = [e for e in preprocessing_categorical_features if e not in preprocessing_one_hot_encoded_features]
    encoded_feature_names = list(
        preprocessing_column_transformer.named_transformers_["one-hot-encoding"].get_feature_names_out(preprocessing_one_hot_encoded_features)
    )
    preprocessing_categorical_features.extend(encoded_feature_names)

    # Output column order: scaled numeric, one-hot indicators, passthrough remainder.
    preprocessing_transformed_features = preprocessing_numeric_features.copy()
    preprocessing_transformed_features.extend(encoded_feature_names)
    preprocessing_transformed_features.extend(feature_remainder)

    data = pd.DataFrame(transformed, index=data.index, columns=preprocessing_transformed_features)
    # The stacked array may be object-typed when non-numeric passthrough
    # columns are present; give the scaled columns a float dtype again so
    # the quantile and clip operations below run on numbers.
    data = data.astype({name: float for name in preprocessing_numeric_features})

    preprocessing_features_info = {}
    preprocessing_features_info["max_whisker_value"] = {}
    preprocessing_features_info["min_whisker_value"] = {}

    for feature_name in preprocessing_numeric_features:
        max_whisker_value = getMaxWhiskerValue(data[feature_name])
        min_whisker_value = getMinWhiskerValue(data[feature_name])
        preprocessing_features_info["max_whisker_value"][feature_name] = max_whisker_value
        preprocessing_features_info["min_whisker_value"][feature_name] = min_whisker_value

        # set outliers to min/max whisker
        data[feature_name] = data[feature_name].clip(min_whisker_value, max_whisker_value)

    preprocessing_features = preprocessing_numeric_features + preprocessing_categorical_features

    print("final features", preprocessing_features)

    test_interface = {
        "preprocessing_categorical_features": preprocessing_categorical_features,
        "preprocessing_numeric_features": preprocessing_numeric_features,
        "preprocessing_features" : preprocessing_features,
        "preprocessing_correlated_features": preprocessing_correlated_features,
        "preprocessing_transformed_features": preprocessing_transformed_features,
        "preprocessing_features_info": preprocessing_features_info,
        "preprocessing_column_transformer": preprocessing_column_transformer
    }

    return data[preprocessing_features], data.playlist_id, test_interface

**Perform all preprocessing steps on the test data**

In [None]:
def preprocessTestData(data: pd.DataFrame, test_interface: dict) -> pd.DataFrame:
    """Apply the preprocessing fitted on the training data to ``data``.

    Nothing is fitted here: the correlated features to drop, the fitted
    column transformer, the output column names and the whisker limits are
    all read from ``test_interface``.

    Side effect: the correlated feature columns are dropped from ``data``
    in place.

    Returns:
        The preprocessed feature columns, indexed like ``data`` so the rows
        can still be matched with their labels.
    """
    removeSamplesWithInvalidKey(data)
    removeSamplesWithInvalidTimeSignature(data)

    data.drop(test_interface["preprocessing_correlated_features"], axis=1, inplace=True)

    numeric_names = test_interface["preprocessing_numeric_features"]

    transformed = test_interface["preprocessing_column_transformer"].transform(data)
    if hasattr(transformed, "toarray"):
        # The transformer returned a sparse matrix; densify it for the DataFrame.
        transformed = transformed.toarray()

    # Keep the original index, as the training preprocessing does. Without
    # it the frame gets a fresh 0..n-1 index and no longer lines up with the
    # labels of the test set.
    data = pd.DataFrame(transformed, index=data.index, columns=test_interface["preprocessing_transformed_features"])
    # Scaled columns may come back object-typed; make them float for clipping.
    data = data.astype({name: float for name in numeric_names})

    whisker_info = test_interface["preprocessing_features_info"]
    for feature_name in numeric_names:
        max_whisker_value = whisker_info["max_whisker_value"][feature_name]
        min_whisker_value = whisker_info["min_whisker_value"][feature_name]

        # set outliers to min/max whisker
        data[feature_name] = data[feature_name].clip(min_whisker_value, max_whisker_value)

    return data[test_interface["preprocessing_features"]]

## Train/test split

In [None]:
# Hold out 33% of the rows as test set. The split is stratified on genre and
# seeded, so it is repeatable for an unchanged raw_data.
x_train, x_test, y_train, y_test = train_test_split(
    raw_data,
    raw_data.genre,
    test_size=0.33,
    stratify=raw_data.genre,
    random_state=1,
)

## Perform Preprocessing on Training & Test Set

In [None]:
# Fit the preprocessing on the training split. Returns the preprocessed
# features, the playlist ids and the dict needed to preprocess other data.
x_train_preprocessed, x_train_playlists, test_interface  = preprocessTrainingData(x_train)

In [None]:
# Apply the preprocessing fitted on the training split to the test split.
x_test_preprocessed = preprocessTestData(x_test, test_interface)

In [None]:
# Show floats with three decimals, then print summary statistics of both
# preprocessed sets (cast to float first so every column is summarised numerically).
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(x_train_preprocessed.astype(float).describe())
print(x_test_preprocessed.astype(float).describe())

In [None]:
# Sanity check: the column maxima of the preprocessed test set next to the
# upper whisker limits that were derived from the training set.
print(x_test_preprocessed.max())
print(test_interface["preprocessing_features_info"]["max_whisker_value"])

In [None]:
# Bundle of everything the following cells work with.
# The feature lists are taken from test_interface because the preprocessed
# frames no longer contain the original "key" / "time_signature" columns
# (they are replaced by one-hot columns) nor any dropped correlated feature;
# indexing the frames with the original lists would fail.
data = {
    "x_train": x_train_preprocessed,
    "x_playlists": x_train_playlists,
    # Preprocessed test features, matching the representation of x_train.
    "x_test": x_test_preprocessed,
    # Labels restricted to the rows present in the preprocessed training frame.
    "y_train": y_train.loc[x_train_preprocessed.index],
    "y_test": y_test,
    "features": test_interface["preprocessing_features"],
    "numeric_features": test_interface["preprocessing_numeric_features"],
    "categorical_features": test_interface["preprocessing_categorical_features"],
    "target": "genre"
}

## Visualization

### Clean Dataset

In [None]:
# Show the first rows of the combined raw data.
raw_data.head()

In [None]:
# List the column names of the raw data.
raw_data.columns

In [None]:
# List the column names of the raw data (same output as the previous cell).
raw_data.columns

In [None]:
# Number of rows and columns of the raw data.
raw_data.shape

In [None]:
# Print the column dtypes and non-null counts of the raw data.
raw_data.info()

In [None]:
# Fit a new StandardScaler on the numeric training columns, write the
# standardised values back into data["x_train"] and show summary statistics.
# NOTE(review): preprocessTrainingData already standard-scales the numeric
# features and then clips them to the whisker limits, so this is a second
# standardisation, and it is not applied to the test set - confirm it is intended.
scaler = StandardScaler()
data["x_train"][data["numeric_features"]] = scaler.fit_transform(data["x_train"][data["numeric_features"]])
data["x_train"].describe()

### Outlier removal

In [None]:
# Outlier detection using z-scores, computed separately for every genre:
# a sample is kept only if all of its numeric columns lie within
# `threshold` standard deviations of the mean of its genre.
threshold = 3
removed_cnt = 0
train_combined = data["x_train"].copy()
train_combined["genre"] = data["y_train"]

kept_groups = []
for group_name, group_data in train_combined.groupby("genre"):
    # Keyword form: the positional axis argument is not accepted by pandas 2.
    group_data = group_data.drop(columns="genre")

    z_score = group_data.select_dtypes(include='number').apply(stats.zscore)
    # A NaN z-score compares as False, so such rows are removed as well.
    keep_mask = (abs(z_score) < threshold).all(axis=1)
    # assign() builds a new frame instead of writing into a slice of group_data.
    group_filtered = group_data[keep_mask].assign(genre=group_name)

    removed_cnt += (group_data.shape[0] - group_filtered.shape[0])
    kept_groups.append(group_filtered)

# Concatenate once after the loop; the original row index is preserved.
if kept_groups:
    filtered_data = pd.concat(kept_groups, ignore_index=False)
else:
    # No groups at all: keep an empty frame that still has every column.
    filtered_data = train_combined.iloc[0:0]

print(f"Removed samples: {removed_cnt}")

data["y_train"] = filtered_data["genre"]
data["x_train"] = filtered_data.drop("genre", axis=1)

### Statistics

In [None]:
# Per-genre summary statistics of one feature. To compare several features,
# select them with a list: ...describe()[["feature1", "feature2"]]
# The duration column is named "duration_ms" in the feature list; there is no
# "duration_s" column.
data["x_train"].groupby(data["y_train"]).describe()["duration_ms"]

In [None]:
# Summary statistics of the training features after outlier removal.
data["x_train"].describe()

### Plots

In [None]:
# Bar chart of the number of training samples per genre, in ascending order.
sorted_list = data["y_train"].value_counts().sort_values()
labels = sorted_list.index.tolist()
values = sorted_list.tolist()

plt.bar(labels, values)
plt.title("genre")
plt.ylabel("number of samples")
plt.show()

In [None]:
# One boxplot per feature, grouped by genre. The labels are attached to a
# copy of the training features so seaborn can group by the "genre" column.
train_combined = data["x_train"].copy()
train_combined["genre"] = data["y_train"]
for feature in data["features"]:
    sns.boxplot(x="genre", y=feature, data=train_combined)
    plt.show()

In [None]:
# Pairwise correlation matrix of the training features.
data["x_train"].corr()

In [None]:
# Annotated heatmap of the correlation matrix on a fixed -1..1 colour scale.
fig = plt.figure()
heatmap = sns.heatmap(data["x_train"].corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)
# Enlarge the figure after drawing.
fig.set_size_inches(15.5, 10.5, forward=True)

In [None]:
# Pairwise scatter plots of the columns of train_combined, coloured by genre.
sns.pairplot(data=train_combined, hue="genre")

## Model Selection

### Chi-square Test

In [None]:
# Chi-square statistic of every categorical feature against the genre label,
# plotted from highest to lowest. The p-values are computed but not used.
chi2_statistics, chi2_p_scores = chi2(data["x_train"][data["categorical_features"]], data["y_train"])
chi2_scores = pd.Series(chi2_statistics, index=data["categorical_features"])
chi2_scores.sort_values(ascending=False).plot.bar()

### ANOVA F-test

In [None]:
# ANOVA F-statistic of every numeric feature against the genre label,
# plotted from highest to lowest. The p-values are computed but not used.
anova_statistics, anova_p_scores = f_classif(data["x_train"][data["numeric_features"]], data["y_train"])
anova_scores = pd.Series(anova_statistics, index=data["numeric_features"])
anova_scores.sort_values(ascending=False).plot.bar()

### Mutual Information

In [None]:
# Mutual information between every feature and the genre label. The column
# positions of the categorical features are passed as discrete features.
# The scores are labelled with data["features"], which therefore has to list
# the columns of data["x_train"] in the same order.
discrete_features_indices = [list(data["x_train"].columns).index(x) for x in data["categorical_features"]]
mutual_statistics = mutual_info_classif(data["x_train"], data["y_train"], discrete_features=discrete_features_indices)
mutual_scores = pd.Series(mutual_statistics, index=data["features"])
mutual_scores.sort_values(ascending=False).plot.bar()

In [None]:
# Mutual information per column of the cleaned training features.
# The raw split (x_train) must not be used here: it was created from the full
# raw_data and therefore still contains the genre target and other
# unprocessed columns. The discrete-feature positions are taken from the same
# frame that is passed to mutual_info_classif, using the categorical feature
# names produced by the preprocessing.
discrete_features_indices = [
    list(data["x_train"].columns).index(x)
    for x in test_interface["preprocessing_categorical_features"]
]
mutual_statistics = mutual_info_classif(data["x_train"], data["y_train"], discrete_features=discrete_features_indices)
mutual_scores = pd.Series(mutual_statistics, index=data["x_train"].columns)
mutual_scores.sort_values(ascending=False).plot.bar()

### Tree-based feature importances

In [None]:
# Feature importances (mean decrease in impurity) of a random forest.
# The forest is fitted on the cleaned training features and their labels.
# The raw split (x_train) was created from the full raw_data and still
# contains the genre column, i.e. the target itself, so it is not used here.
# random_state makes the importances repeatable between runs.
clf = RandomForestClassifier(n_estimators=1000, random_state=1)
clf = clf.fit(data["x_train"], data["y_train"])

forest_importances = pd.Series(clf.feature_importances_, index=data["x_train"].columns)
fig, ax = plt.subplots()
forest_importances.sort_values(ascending=False).plot.bar(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()