Data Loading

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense

# Step 1: Data Preprocessing
# Load the two CSV files

data = pd.read_csv("song_artist.csv")

Data Preprocessing

In [2]:
#create a new code popularity based on the weighted sum
weights = {
    'acousticness': 0.1,
    'danceability': 0.2,
    'energy': 0.3,
    'instrumentalness': 0.1,
    'liveness': 0.1,
    'speechiness': 0.2
}

In [3]:
data['popularity'] = (data['acousticness'] * weights['acousticness'] +
                      data['danceability'] * weights['danceability'] +
                      data['energy'] * weights['energy'] +
                      data['instrumentalness'] * weights['instrumentalness'] +
                      data['liveness'] * weights['liveness'] +
                      data['speechiness'] * weights['speechiness'])

# Normalize the popularity score to a range between 0 and 100 (optional)
data['popularity'] = (data['popularity'] - data['popularity'].min()) / (data['popularity'].max() - data['popularity'].min()) * 100

In [4]:
data.dtypes

id                           int64
name                        object
artists                     object
acousticness               float64
danceability               float64
energy                     float64
instrumentalness           float64
liveness                   float64
speechiness                float64
acousticness_artist        float64
danceability_artist        float64
energy_artist              float64
instrumentalness_artist    float64
liveness_artist            float64
speechiness_artist         float64
popularity                 float64
dtype: object

In [5]:
data.head()

Unnamed: 0,id,name,artists,acousticness,danceability,energy,instrumentalness,liveness,speechiness,acousticness_artist,danceability_artist,energy_artist,instrumentalness_artist,liveness_artist,speechiness_artist,popularity
0,0,death bed (coffee for your head) (feat. beabad...,Powfu,0.731,0.726,0.431,0.0,0.696,0.135,0.39325,0.8095,0.40875,6.2e-05,0.234088,0.131225,58.46273
1,1,THE SCOTTS,THE SCOTTS,0.233,0.716,0.537,0.0,0.157,0.0514,0.3075,0.7065,0.521,0.0,0.159,0.05205,46.535912
2,2,The Box,Roddy Ricch,0.104,0.896,0.586,0.0,0.79,0.0559,0.184209,0.784984,0.627953,1.3e-05,0.167566,0.229681,59.960492
3,3,ily (i love you baby) (feat. Emilee),Surf Mesa,0.0686,0.674,0.774,0.00188,0.393,0.0892,0.0432,0.680667,0.709333,0.000627,0.193,0.058133,56.750175
4,4,Supalonely,BENEE,0.305,0.863,0.631,3e-05,0.123,0.0534,0.345733,0.757444,0.574333,1e-05,0.145644,0.041956,54.670028


Data Split for training and testing

In [6]:
y = data['popularity']
X = data.drop(columns=['name', 'artists', 'id', 'popularity'])


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Model Training

In [7]:

# Step 2: Model Training
# Define and train a deep learning model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))  # Linear activation for regression task
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X_test_scaled, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2ca54a44990>

Display 5 songs and get user feedback

In [8]:
sorted_data = data.sort_values(by='popularity', ascending=False)

# Display the top 5 songs with highest popularity
top_songs = sorted_data.head(5)
print("Top 5 Songs with Highest Popularity:")
for i, song in top_songs.iterrows():
    print(f"{i+1}. {song['name']} by {song['artists']}")

Top 5 Songs with Highest Popularity:
57589. The Improv Fairy Tale by Mitch Hedberg
55834. Pissed Off by Dane Cook
79913. The World on Fire by Jim Morrison
52090. Free Dog by Bill Burr
56160. You know who's a great lady? by John Mulaney


In [9]:
user_feedback = {}
for i, song in top_songs.iterrows():
    feedback = input(f"On a scale of 1 to 10, how much do you like '{song['name']}' by {song['artists']}? ")
    user_feedback[song['name']]=(float(feedback))
print(user_feedback)

{'The Improv Fairy Tale': 6.0, 'Pissed Off': 7.0, 'The World on Fire': 7.0, 'Free Dog': 5.0, "You know who's a great lady?": 6.0}


In [13]:
#dataset creation
selected_rows = []

# Iterate over the user feedback dictionary
for song_name, rating in user_feedback.items():
    # Retrieve rows from the main dataset where the song name matches
    rows = data[data['name'] == song_name]
    # Add the selected row (if found) to the list
    if not rows.empty:
        selected_rows.append(rows.iloc[0])                  # Assuming there's only one row for each song name

# Create a new DataFrame from the selected rows
feedback = pd.DataFrame(selected_rows)                       # this is the array of inputs

# Drop irrelevant columns
feedback = feedback.drop(columns=['name', 'artists', 'id', 'popularity'])
feedback = pd.DataFrame(feedback.mean())                                    #take the mean of values obtained
feedback

Unnamed: 0,0
acousticness,0.8882
danceability,0.4982
energy,0.9178
instrumentalness,0.147401
liveness,0.9578
speechiness,0.8862
acousticness_artist,0.800256
danceability_artist,0.539555
energy_artist,0.764855
instrumentalness_artist,0.02068


In [27]:
print(len(X_train.columns))
len(feedback)

12


TypeError: object of type 'numpy.float64' has no len()