In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedGroupKFold
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('itamplify_train.csv')

train, test = train_test_split(data, 
                                stratify=data['track_genre'],
                                test_size=0.2,
                                random_state=42)

train, val = train_test_split(train,
                                stratify=train['track_genre'],
                                test_size=0.2,
                                random_state=42)


In [3]:
print("---train---")
print(f"observaciones train: {train.size}")
print(train['track_genre'].value_counts(normalize=True))


---train---
observaciones train: 111069
samba          0.117792
techno         0.105124
psych-rock     0.103989
folk           0.092645
disco          0.083570
funk           0.076007
blues          0.072982
hip-hop        0.069200
heavy-metal    0.050482
reggaeton      0.044999
alternative    0.043297
afrobeat       0.041407
country        0.035356
jazz           0.032709
classical      0.030441
Name: track_genre, dtype: float64


In [4]:
print("---val---")
print(f"observaciones train: {val.size}")
print(val['track_genre'].value_counts(normalize=True))


---val---
observaciones train: 27783
samba          0.117914
techno         0.105064
psych-rock     0.103553
folk           0.092971
disco          0.083900
funk           0.076342
blues          0.072562
hip-hop        0.069539
heavy-metal    0.050642
reggaeton      0.045351
alternative    0.043084
afrobeat       0.040816
country        0.035525
jazz           0.032502
classical      0.030234
Name: track_genre, dtype: float64


In [5]:
print("---test---")
print(f"observaciones test: {test.size}")
print(test['track_genre'].value_counts(normalize=True))

---test---
observaciones test: 34734
samba          0.117896
techno         0.105200
psych-rock     0.103990
folk           0.092503
disco          0.084039
funk           0.076179
blues          0.072551
hip-hop        0.069528
heavy-metal    0.050786
reggaeton      0.044740
alternative    0.043531
afrobeat       0.041112
country        0.035067
jazz           0.032648
classical      0.030230
Name: track_genre, dtype: float64


In [6]:
def encode_genres(data):
    """
    Converts the 'track_genre' column to one-hot encoded format.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the 'track_genre' column.

    Returns:
    - pd.DataFrame: Modified DataFrame with 'track_genre' replaced by one-hot encoded columns.
    """
    # Create one-hot encoded variables
    genre_dummies = pd.get_dummies(data['track_genre'], prefix='genre')
    
    # Drop the original 'track_genre' column
    data = data.drop('track_genre', axis=1)
    
    # Concatenate the original DataFrame and the new one-hot encoded DataFrame
    data_encoded = pd.concat([data, genre_dummies], axis=1)
    
    return data_encoded

def drop_columns(data):
    """
    Drops columns that are not useful for the model.

    Parameters:
    - data (pd.DataFrame): DataFrame containing the columns to be dropped.

    Returns:
    - pd.DataFrame: Modified DataFrame with the columns dropped.
    """
    # Drop columns that are not useful for the model

    columns_to_drop = ['track_name', 'track_id', 'artists', 'album_name', 'X']
    data = data.drop(columns_to_drop, axis=1)
    
    return data

In [7]:
train = encode_genres(drop_columns(train))
val = encode_genres(drop_columns(val))
test = encode_genres(drop_columns(test))

In [9]:
data.columns

Index(['X', 'track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')