# Music Genre Prediction

In [1]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

In [70]:
df = pd.read_csv("../data/music_mental_health/mxmh_survey_results.csv")
df.head()

Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.


In [71]:
df.describe()

Unnamed: 0,Age,Hours per day,BPM,Anxiety,Depression,Insomnia,OCD
count,735.0,736.0,629.0,736.0,736.0,736.0,736.0
mean,25.206803,3.572758,1589948.0,5.837636,4.796196,3.738451,2.637228
std,12.05497,3.028199,39872610.0,2.793054,3.02887,3.088689,2.842017
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,2.0,100.0,4.0,2.0,1.0,0.0
50%,21.0,3.0,120.0,6.0,5.0,3.0,2.0
75%,28.0,5.0,144.0,8.0,7.0,6.0,5.0
max,89.0,24.0,1000000000.0,10.0,10.0,10.0,10.0


Some things that are more obvious that we won't need so we can drop them now. Keep in mind that all permissions values are that of respondents agreeing to their data being public. We will also drop insomnia and OCD because I don't have a way to measure these for user inputs.

In [72]:
df = df.drop(columns = ['Timestamp', 'Primary streaming service', 'BPM', 'OCD', 'Insomnia', 'Permissions'], axis = 1) 
df.head()

Unnamed: 0,Age,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,Frequency [Classical],Frequency [Country],...,Frequency [Lofi],Frequency [Metal],Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Music effects
0,18.0,3.0,Yes,Yes,Yes,Latin,Yes,Yes,Rarely,Never,...,Rarely,Never,Very frequently,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,
1,63.0,1.5,Yes,No,No,Rock,Yes,No,Sometimes,Never,...,Rarely,Never,Sometimes,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,
2,18.0,4.0,No,No,No,Video game music,No,Yes,Never,Never,...,Sometimes,Sometimes,Rarely,Never,Rarely,Rarely,Very frequently,7.0,7.0,No effect
3,61.0,2.5,Yes,No,Yes,Jazz,Yes,Yes,Sometimes,Never,...,Sometimes,Never,Sometimes,Sometimes,Never,Never,Never,9.0,7.0,Improve
4,18.0,4.0,Yes,No,No,R&B,Yes,No,Never,Never,...,Sometimes,Never,Sometimes,Very frequently,Very frequently,Never,Rarely,7.0,2.0,Improve


In [73]:
df.shape

(736, 27)

In [74]:
Counter(df['Fav genre'])

Counter({'Latin': 3,
         'Rock': 188,
         'Video game music': 44,
         'Jazz': 20,
         'R&B': 35,
         'K pop': 26,
         'Country': 25,
         'EDM': 37,
         'Hip hop': 35,
         'Pop': 114,
         'Rap': 22,
         'Classical': 53,
         'Metal': 88,
         'Folk': 30,
         'Lofi': 10,
         'Gospel': 6})

In [75]:
df_improve = df[df['Music effects'] == 'Improve']
df_improve.head()

Unnamed: 0,Age,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,Frequency [Classical],Frequency [Country],...,Frequency [Lofi],Frequency [Metal],Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Music effects
3,61.0,2.5,Yes,No,Yes,Jazz,Yes,Yes,Sometimes,Never,...,Sometimes,Never,Sometimes,Sometimes,Never,Never,Never,9.0,7.0,Improve
4,18.0,4.0,Yes,No,No,R&B,Yes,No,Never,Never,...,Sometimes,Never,Sometimes,Very frequently,Very frequently,Never,Rarely,7.0,2.0,Improve
5,18.0,5.0,Yes,Yes,Yes,Jazz,Yes,Yes,Rarely,Sometimes,...,Very frequently,Rarely,Very frequently,Very frequently,Very frequently,Very frequently,Never,8.0,8.0,Improve
6,18.0,3.0,Yes,Yes,No,Video game music,Yes,Yes,Sometimes,Never,...,Rarely,Rarely,Rarely,Rarely,Never,Never,Sometimes,4.0,8.0,Improve
7,21.0,1.0,Yes,No,No,K pop,Yes,Yes,Never,Never,...,Sometimes,Never,Sometimes,Sometimes,Rarely,Never,Rarely,5.0,3.0,Improve


In [95]:
df_clean = df_improve.copy()

In [96]:
df_clean['while_working'] = df_improve['While working'].apply(lambda x: 1 if x == 'Yes' else 0)
df_clean['instrumentalist'] = df_improve['Instrumentalist'].apply(lambda x: 1 if x == 'Yes' else 0)
df_clean['composer'] = df_improve['Composer'].apply(lambda x: 1 if x == 'Yes' else 0)
df_clean['fav_genre'] = df_improve['Fav genre'].apply(lambda x: 1 if x == 'Yes' else 0)
df_clean['exploratory'] = df_improve['Exploratory'].apply(lambda x: 1 if x == 'Yes' else 0)
df_clean['foreign_languages'] = df_improve['Foreign languages'].apply(lambda x: 1 if x == 'Yes' else 0)

In [97]:
df_clean.drop(columns = ['Hours per day', 'While working', 'Instrumentalist', 'Composer', 'Exploratory', 'Foreign languages'], 
              axis = 1, inplace = True)

In [98]:
df_clean.head()

Unnamed: 0,Age,Fav genre,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],...,Frequency [Video game music],Anxiety,Depression,Music effects,while_working,instrumentalist,composer,fav_genre,exploratory,foreign_languages
3,61.0,Jazz,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,Sometimes,...,Never,9.0,7.0,Improve,1,0,1,0,1,1
4,18.0,R&B,Never,Never,Rarely,Never,Rarely,Very frequently,Never,Very frequently,...,Rarely,7.0,2.0,Improve,1,0,0,0,1,0
5,18.0,Jazz,Rarely,Sometimes,Never,Never,Never,Sometimes,Very frequently,Very frequently,...,Never,8.0,8.0,Improve,1,1,1,0,1,1
6,18.0,Video game music,Sometimes,Never,Rarely,Sometimes,Rarely,Rarely,Sometimes,Never,...,Sometimes,4.0,8.0,Improve,1,1,0,0,1,1
7,21.0,K pop,Never,Never,Rarely,Never,Never,Very frequently,Rarely,Very frequently,...,Rarely,5.0,3.0,Improve,1,0,0,0,1,1


In [99]:
df_clean['age'] = df_improve['Age']
df_clean['fav_genre'] = df_improve['Fav genre']

df_clean.drop(columns = ['Age', 'Fav genre', 'Music effects'], 
              axis = 1, inplace = True)
                         
df_clean.head()

Unnamed: 0,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],Frequency [Latin],Frequency [Lofi],...,Frequency [Video game music],Anxiety,Depression,while_working,instrumentalist,composer,fav_genre,exploratory,foreign_languages,age
3,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,Sometimes,Very frequently,Sometimes,...,Never,9.0,7.0,1,0,1,Jazz,1,1,61.0
4,Never,Never,Rarely,Never,Rarely,Very frequently,Never,Very frequently,Sometimes,Sometimes,...,Rarely,7.0,2.0,1,0,0,R&B,1,0,18.0
5,Rarely,Sometimes,Never,Never,Never,Sometimes,Very frequently,Very frequently,Rarely,Very frequently,...,Never,8.0,8.0,1,1,1,Jazz,1,1,18.0
6,Sometimes,Never,Rarely,Sometimes,Rarely,Rarely,Sometimes,Never,Rarely,Rarely,...,Sometimes,4.0,8.0,1,1,0,Video game music,1,1,18.0
7,Never,Never,Rarely,Never,Never,Very frequently,Rarely,Very frequently,Never,Sometimes,...,Rarely,5.0,3.0,1,0,0,K pop,1,1,21.0


In [106]:
freq_map = {'Sometimes': 2, 'Never': 0, 'Rarely': 1, 'Very frequently': 3}

df_clean['Frequency [Classical]'] = df_improve['Frequency [Classical]'].map(freq_map)


In [114]:
for genre in df_clean.columns[0:16]:
    df_clean[genre] = df_improve[genre].map(freq_map)

In [115]:
df_clean.head()

Unnamed: 0,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],Frequency [Latin],Frequency [Lofi],...,Frequency [Video game music],Anxiety,Depression,while_working,instrumentalist,composer,fav_genre,exploratory,foreign_languages,age
3,2,0,0,1,2,0,3,2,3,2,...,0,9.0,7.0,1,0,1,Jazz,1,1,61.0
4,0,0,1,0,1,3,0,3,2,2,...,1,7.0,2.0,1,0,0,R&B,1,0,18.0
5,1,2,0,0,0,2,3,3,1,3,...,0,8.0,8.0,1,1,1,Jazz,1,1,18.0
6,2,0,1,2,1,1,2,0,1,1,...,2,4.0,8.0,1,1,0,Video game music,1,1,18.0
7,0,0,1,0,0,3,1,3,0,2,...,1,5.0,3.0,1,0,0,K pop,1,1,21.0


In [118]:
Counter(df_clean['fav_genre'])

Counter({'Jazz': 16,
         'R&B': 26,
         'Video game music': 26,
         'K pop': 19,
         'Rock': 126,
         'EDM': 30,
         'Hip hop': 31,
         'Country': 20,
         'Rap': 17,
         'Pop': 85,
         'Metal': 67,
         'Classical': 39,
         'Folk': 23,
         'Lofi': 10,
         'Gospel': 6,
         'Latin': 1})

There is only one category with Latin so let's get rid of it to make things easier.

In [122]:
df_clean = df_clean[df_clean['fav_genre'] != 'Latin']

This is still too many genres with too little data, since we are looking at emotion in this problem, let's group them into the categories: 

Calm/Relaxing: Jazz, Classical, Lofi

Energetic/Upbeat: Pop, Rock, EDM, Hip hop, Rap, Metal

Soulful/Reflective: R&B, Country, Folk, Gospel

Diverse/Varied: Video game music, K pop

In [123]:
# a map to do this
genre_mapping = {
    'Jazz': 'Calm/Relaxing',
    'R&B': 'Soulful/Reflective',
    'Video game music': 'Diverse/Varied',
    'K pop': 'Diverse/Varied',
    'Rock': 'Energetic/Upbeat',
    'EDM': 'Energetic/Upbeat',
    'Hip hop': 'Energetic/Upbeat',
    'Country': 'Soulful/Reflective',
    'Rap': 'Energetic/Upbeat',
    'Pop': 'Energetic/Upbeat',
    'Metal': 'Energetic/Upbeat',
    'Classical': 'Calm/Relaxing',
    'Folk': 'Soulful/Reflective',
    'Lofi': 'Calm/Relaxing',
    'Gospel': 'Soulful/Reflective'
}

In [125]:
df_clean['fav_genre'] = df_clean['fav_genre'].map(genre_mapping)

In [126]:
df_clean.head()

Unnamed: 0,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],Frequency [Latin],Frequency [Lofi],...,Frequency [Video game music],Anxiety,Depression,while_working,instrumentalist,composer,fav_genre,exploratory,foreign_languages,age
3,2,0,0,1,2,0,3,2,3,2,...,0,9.0,7.0,1,0,1,Calm/Relaxing,1,1,61.0
4,0,0,1,0,1,3,0,3,2,2,...,1,7.0,2.0,1,0,0,Soulful/Reflective,1,0,18.0
5,1,2,0,0,0,2,3,3,1,3,...,0,8.0,8.0,1,1,1,Calm/Relaxing,1,1,18.0
6,2,0,1,2,1,1,2,0,1,1,...,2,4.0,8.0,1,1,0,Diverse/Varied,1,1,18.0
7,0,0,1,0,0,3,1,3,0,2,...,1,5.0,3.0,1,0,0,Diverse/Varied,1,1,21.0


Now let's actually one-hot encode this. Don't want to introduce any order here with label encoding.

In [129]:
df_encoded = pd.get_dummies(df_clean, columns=['fav_genre'])

In [130]:
df_encoded.head()

Unnamed: 0,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],Frequency [Latin],Frequency [Lofi],...,while_working,instrumentalist,composer,exploratory,foreign_languages,age,fav_genre_Calm/Relaxing,fav_genre_Diverse/Varied,fav_genre_Energetic/Upbeat,fav_genre_Soulful/Reflective
3,2,0,0,1,2,0,3,2,3,2,...,1,0,1,1,1,61.0,1,0,0,0
4,0,0,1,0,1,3,0,3,2,2,...,1,0,0,1,0,18.0,0,0,0,1
5,1,2,0,0,0,2,3,3,1,3,...,1,1,1,1,1,18.0,1,0,0,0
6,2,0,1,2,1,1,2,0,1,1,...,1,1,0,1,1,18.0,0,1,0,0
7,0,0,1,0,0,3,1,3,0,2,...,1,0,0,1,1,21.0,0,1,0,0


In [138]:
#forgot to drop this column since we aren't considering Latin music
df_encoded.drop(columns = ['Frequency [Latin]'], axis = 1, inplace = True)

Apparently there is a NaN value, as you can see here. Let's deal with that.

In [151]:
df_encoded.isna().sum()

Frequency [Classical]           0
Frequency [Country]             0
Frequency [EDM]                 0
Frequency [Folk]                0
Frequency [Gospel]              0
Frequency [Hip hop]             0
Frequency [Jazz]                0
Frequency [K pop]               0
Frequency [Lofi]                0
Frequency [Metal]               0
Frequency [Pop]                 0
Frequency [R&B]                 0
Frequency [Rap]                 0
Frequency [Rock]                0
Frequency [Video game music]    0
Anxiety                         0
Depression                      0
while_working                   0
instrumentalist                 0
composer                        0
exploratory                     0
foreign_languages               0
age                             1
fav_genre_Calm/Relaxing         0
fav_genre_Diverse/Varied        0
fav_genre_Energetic/Upbeat      0
fav_genre_Soulful/Reflective    0
dtype: int64

In [157]:
df_encoded = df_encoded[df_encoded['age'].notna()]

## Random Forest

Now let's model this.

In [158]:
y = df_encoded.iloc[:, :15]
X = df_encoded.iloc[:, 15:]

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [171]:
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=50, random_state=42))

In [172]:
model.fit(X_train, y_train)

In [198]:
predictions = model.predict(X_test)

In [199]:
mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1.3024777624617039


In [175]:
r2 = r2_score(y_test, predictions)
print(f"R2: {r2}")

R2: -0.0595512391751889


This is pretty bad, let's try XGBoost, it tends to do well. Also, let's run grid search since run times here are not long.

## XGBoost Regresser

In [182]:
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50, random_state=42)
model = MultiOutputRegressor(xgb_regressor)

#model.fit(X_train, y_train)
#predictions = model.predict(X_test)
#r2 = r2_score(y_test, predictions)
#print(f"R-squared (R2) Score: {r2}")

R-squared (R2) Score: -0.27968907607174476


In [196]:
param_grid = {
    'estimator__n_estimators': [100, 200, 300],         # Number of trees
    #'estimator__max_depth': [3, 6, 9, 12],             # Maximum depth of each tree
    'estimator__learning_rate': [0.001, 0.01, 0.1],      # Learning rate
    #'estimator__subsample': [0.6, 0.8, 1.0],           # Subsample ratio
    'estimator__min_child_weight': [1, 3, 5]           # Minimum sum of instance weight (hessian) needed in a child
}

In [197]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

In [201]:
best_model = grid_search.best_estimator_

predictions = best_model.predict(X_test)

In [202]:
r2 = r2_score(y_test, predictions)
print(f"Best Model - R-squared (R2) Score: {r2}")

Best Model - R-squared (R2) Score: 0.032755886130277974


In [191]:
print("Best Hyperparameters:")
print(grid_search.best_params_)

Best Hyperparameters:
{'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 200}


These aren't good. The dataset is too small with too many things to predict. We need to group some things to make this work.

In [256]:
column_lst = list(df_encoded.columns) + ['helpful_genre']

df2 = pd.DataFrame(columns = column_lst)
df2.head()

Unnamed: 0,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],Frequency [Lofi],Frequency [Metal],...,instrumentalist,composer,exploratory,foreign_languages,age,fav_genre_Calm/Relaxing,fav_genre_Diverse/Varied,fav_genre_Energetic/Upbeat,fav_genre_Soulful/Reflective,helpful_genre


Remember this mapping:

genre_mapping = {
    'Jazz': 'Calm/Relaxing',
    'R&B': 'Soulful/Reflective',
    'Video game music': 'Diverse/Varied',
    'K pop': 'Diverse/Varied',
    'Rock': 'Energetic/Upbeat',
    'EDM': 'Energetic/Upbeat',
    'Hip hop': 'Energetic/Upbeat',
    'Country': 'Soulful/Reflective',
    'Rap': 'Energetic/Upbeat',
    'Pop': 'Energetic/Upbeat',
    'Metal': 'Energetic/Upbeat',
    'Classical': 'Calm/Relaxing',
    'Folk': 'Soulful/Reflective',
    'Lofi': 'Calm/Relaxing',
    'Gospel': 'Soulful/Reflective'
}

In [224]:
reversed_genre_map = {}
for key, value in genre_mapping.items():
    if value not in reversed_genre_map:
        reversed_genre_map[value] = [key]
    else:
        reversed_genre_map[value].append(key)

# Print the reversed dictionary
print(reversed_genre_map)

{'Calm/Relaxing': ['Jazz', 'Classical', 'Lofi'], 'Soulful/Reflective': ['R&B', 'Country', 'Folk', 'Gospel'], 'Diverse/Varied': ['Video game music', 'K pop'], 'Energetic/Upbeat': ['Rock', 'EDM', 'Hip hop', 'Rap', 'Pop', 'Metal']}


In [233]:
#fixing the names here manually
reversed_genre_map = {'Calm/Relaxing': ['Frequency [Jazz]', 'Frequency [Classical]', 'Frequency [Lofi]'], 
                      'Soulful/Reflective': ['Frequency [R&B]', 'Frequency [Country]', 
                                             'Frequency [Folk]', 'Frequency [Gospel]'], 
                      'Diverse/Varied': ['Frequency [Video game music]', 'Frequency [K pop]'], 
                      'Energetic/Upbeat': ['Frequency [Rock]', 'Frequency [EDM]', 'Frequency [Hip hop]', 
                                           'Frequency [Rap]', 'Frequency [Pop]', 'Frequency [Metal]']}

In [257]:
for index, row in df_encoded.iterrows():
    for key in reversed_genre_map.keys():
        genre_group = list(reversed_genre_map[key])
        
        values_of_genre = row[genre_group].values
        if 3 in values_of_genre:
            row['helpful_genre'] = key
            row_df = pd.DataFrame(row).T
            df2 = pd.concat([df2, row_df])
            #print(row)

    #break
    
df2

Unnamed: 0,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],Frequency [Lofi],Frequency [Metal],...,instrumentalist,composer,exploratory,foreign_languages,age,fav_genre_Calm/Relaxing,fav_genre_Diverse/Varied,fav_genre_Energetic/Upbeat,fav_genre_Soulful/Reflective,helpful_genre
3,2.0,0.0,0.0,1.0,2.0,0.0,3.0,2.0,2.0,0.0,...,0.0,1.0,1.0,1.0,61.0,1.0,0.0,0.0,0.0,Calm/Relaxing
4,0.0,0.0,1.0,0.0,1.0,3.0,0.0,3.0,2.0,0.0,...,0.0,0.0,1.0,0.0,18.0,0.0,0.0,0.0,1.0,Soulful/Reflective
4,0.0,0.0,1.0,0.0,1.0,3.0,0.0,3.0,2.0,0.0,...,0.0,0.0,1.0,0.0,18.0,0.0,0.0,0.0,1.0,Diverse/Varied
4,0.0,0.0,1.0,0.0,1.0,3.0,0.0,3.0,2.0,0.0,...,0.0,0.0,1.0,0.0,18.0,0.0,0.0,0.0,1.0,Energetic/Upbeat
5,1.0,2.0,0.0,0.0,0.0,2.0,3.0,3.0,3.0,1.0,...,1.0,1.0,1.0,1.0,18.0,1.0,0.0,0.0,0.0,Calm/Relaxing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733,1.0,2.0,2.0,1.0,1.0,3.0,1.0,1.0,2.0,1.0,...,0.0,1.0,1.0,0.0,19.0,0.0,0.0,1.0,0.0,Energetic/Upbeat
734,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,19.0,1.0,0.0,0.0,0.0,Calm/Relaxing
735,2.0,1.0,3.0,2.0,1.0,3.0,3.0,2.0,1.0,0.0,...,0.0,0.0,1.0,1.0,29.0,0.0,0.0,1.0,0.0,Calm/Relaxing
735,2.0,1.0,3.0,2.0,1.0,3.0,3.0,2.0,1.0,0.0,...,0.0,0.0,1.0,1.0,29.0,0.0,0.0,1.0,0.0,Soulful/Reflective


It will be easier to just predict the following groups of music. We will need to drop some columns now though, particulary the one about favorite genre since that seems to obvious to ask for, and we don't want to rely on that either.

In [261]:
Counter(df2['helpful_genre'])

Counter({'Calm/Relaxing': 152,
         'Soulful/Reflective': 178,
         'Diverse/Varied': 135,
         'Energetic/Upbeat': 432})

In [262]:
df2.columns

Index(['Frequency [Classical]', 'Frequency [Country]', 'Frequency [EDM]',
       'Frequency [Folk]', 'Frequency [Gospel]', 'Frequency [Hip hop]',
       'Frequency [Jazz]', 'Frequency [K pop]', 'Frequency [Lofi]',
       'Frequency [Metal]', 'Frequency [Pop]', 'Frequency [R&B]',
       'Frequency [Rap]', 'Frequency [Rock]', 'Frequency [Video game music]',
       'Anxiety', 'Depression', 'while_working', 'instrumentalist', 'composer',
       'exploratory', 'foreign_languages', 'age', 'fav_genre_Calm/Relaxing',
       'fav_genre_Diverse/Varied', 'fav_genre_Energetic/Upbeat',
       'fav_genre_Soulful/Reflective', 'helpful_genre'],
      dtype='object')

So we will want to predict the music genre group from some questions about the user and also their depression and anxiety levels.

In [264]:
X = df2[['Anxiety', 'Depression', 'while_working', 'instrumentalist', 'composer',
       'exploratory', 'foreign_languages', 'age']]
y = df2['helpful_genre']

## Modeling Version 2

Now let's model this dataset.

In [312]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

X = X.apply(pd.to_numeric, errors = 'coerce')

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [313]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [314]:
model = RandomForestRegressor(n_estimators=50, random_state=42)

model.fit(X_train, y_train)

In [315]:
y_pred = model.predict(X_test).astype(int)

accuracy_score(y_pred, y_test)

0.29444444444444445

## XGBoost Classifier

This is no good, let's try XGBoost.

In [369]:
param_grid = {
    'n_estimators': [200, 250],         # Number of trees
    'max_depth': [2, 3],             # Maximum depth of each tree
    'learning_rate': [0.01, 0.1],      # Learning rate
    #'subsample': [0.7, 0.8, 0.9],           # Subsample ratio
    #'gamma': [0, 0.1, 0.2],
    #'min_child_weight': [4, 5, 6]           # Minimum sum of instance weight (hessian) needed in a child
}

In [370]:
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', num_class=4, seed=42, tree_method='gpu_hist', max_depth = 2)
#xgb_clf.fit(X_train_numeric, y_train)


In [371]:
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='f1_weighted', n_jobs=-1)

# Perform the grid search
grid_search.fit(X_train, y_train)

In [372]:
best_model = grid_search.best_estimator_
print(best_model)

predictions = best_model.predict(X_test)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=250, n_jobs=None, num_class=4,
              num_parallel_tree=None, objective='multi:softmax', ...)


In [373]:
from sklearn.metrics import f1_score, balanced_accuracy_score

f1_score(predictions, y_test, average = 'weighted')
balanced_accuracy_score(predictions, y_test)

0.17884615384615385

In [374]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.15      0.07      0.09        30
           1       0.00      0.00      0.00        29
           2       0.46      0.86      0.60        84
           3       0.10      0.03      0.04        37

    accuracy                           0.42       180
   macro avg       0.18      0.24      0.18       180
weighted avg       0.26      0.42      0.30       180



In [375]:
#y_pred = xgb_clf.predict(X_test)

In [378]:
y_pred_original = label_encoder.inverse_transform(predictions)
#y_pred_original