In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [10]:
# Read the raw songs table from disk.
# The path is relative to the directory the notebook kernel runs in.
songs_data = pd.read_csv(filepath_or_buffer="../data/songs_data.csv")



In [4]:
# Preview the first five rows to sanity-check the load
songs_data.head(n=5)

Unnamed: 0,track_id,track_name,track_artist,track_popularity,playlist_name,playlist_id,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [11]:
# Columns left out of the modelling frame
columns_to_drop = [
    'track_name',
    'track_artist',
    'track_popularity',
    'playlist_name',
    'playlist_id',
    'key',
    'instrumentalness',
]
# drop() returns a new frame, so songs_data itself is left untouched
df_cleaned = songs_data.drop(columns_to_drop, axis="columns")

In [12]:
# Turn the text 'genre' labels into integer codes.
# The fitted encoder is kept so the codes can be mapped back to genre names later.
label_encoder = LabelEncoder().fit(df_cleaned['genre'])
df_cleaned['genre_encoded'] = label_encoder.transform(df_cleaned['genre'])


In [13]:
# The text 'genre' column is now redundant with 'genre_encoded', so leave it
# out of the encoded frame (df_cleaned keeps both columns).
df_encoded = df_cleaned.drop('genre', axis="columns")

In [14]:
# Define features and target variable
# 'track_id' is an identifier and 'genre_encoded' is the target, so neither
# belongs in the feature matrix.
non_feature_columns = ['track_id', 'genre_encoded']

# The preview of songs_data shows an 'Unnamed: 0' column holding a running row
# number (presumably the index written out when the CSV was saved - confirm).
# Row position is not an audio attribute, and the previewed rows are grouped by
# playlist/genre, so keeping it as a feature could leak the label. Exclude it
# when it is present; the guard keeps this cell working if the CSV is
# regenerated without that column.
if 'Unnamed: 0' in df_encoded.columns:
    non_feature_columns.append('Unnamed: 0')

X = df_encoded.drop(columns=non_feature_columns)
y = df_encoded['genre_encoded']

In [15]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle (and so the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [16]:
# Create a Random Forest with default hyperparameters and a fixed seed,
# then fit it on the training split only.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X=X_train, y=y_train)

In [17]:
# Predict encoded genre labels for the held-out test rows
y_pred = rf_classifier.predict(X=X_test)


In [18]:
# Evaluate the model's performance
# Accuracy: fraction of test rows whose encoded genre was predicted correctly.
accuracy = accuracy_score(y_test, y_pred)

# LabelEncoder numbers the genres 0..n-1 in the order of `classes_`. Passing
# those codes as `labels` ties every report row to the right genre name and
# avoids a length mismatch with `target_names` if a genre happens to be absent
# from both y_test and y_pred.
genre_codes = list(range(len(label_encoder.classes_)))
classification_rep = classification_report(
    y_test,
    y_pred,
    labels=genre_codes,
    target_names=label_encoder.classes_,
)


In [19]:
# Show the overall accuracy, then the per-genre precision/recall/F1 table.
print(f"Model Accuracy: {accuracy}")
# Two positional arguments: print() puts a single space between the header
# (which ends in a newline) and the report text.
print("Classification Report:\n", classification_rep)

Model Accuracy: 0.541038525963149
Classification Report:
               precision    recall  f1-score   support

         edm       0.64      0.66      0.65      1218
       latin       0.50      0.41      0.45      1033
         pop       0.35      0.31      0.33      1081
         r&b       0.47      0.46      0.46      1031
         rap       0.56      0.63      0.59      1168
        rock       0.67      0.75      0.71      1036

    accuracy                           0.54      6567
   macro avg       0.53      0.54      0.53      6567
weighted avg       0.53      0.54      0.54      6567

