In [1]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Reading data from a CSV file
data = pd.read_csv('../data.csv')

data["popularity"] = data["popularity"].apply(lambda pop: pop/100)
data["tempo"] = data["tempo"].apply(lambda tpo: tpo/100)
data["loudness"] = data["loudness"].apply(lambda ldn: ldn/40)
data["popularity"] = data["popularity"].fillna(data["popularity"].median())
data["genres"] = data["genres"].fillna("")
data["genres"] = data["genres"].str.split(".")

df = pd.DataFrame(data)

df = df.drop(columns=['songname','artist'])

# Define input features and target variables
features = df[['time', 'dayofweek', 'month', 'temp']]
targets = df.drop(columns=['time', 'dayofweek', 'month', 'temp', 'genres'])

classifier_target = df['genres']

mlb = MultiLabelBinarizer()
classifier_target = mlb.fit_transform(classifier_target)

In [2]:
from sklearn.preprocessing import LabelEncoder
from src.python_files.num_predict import train_gradient_boosting_regressor
from src.python_files.cat_predict import train_random_forest_classifier

model = train_random_forest_classifier(features, classifier_target)[0]

genres_arr = []
for item in df['genres']:
    genres_arr.append(item[0])
print(genres_arr)
le = LabelEncoder()
genres_encoded = le.fit_transform(genres_arr)
features.insert(4, 'genres', genres_encoded)

multi_output_gbr = train_gradient_boosting_regressor(features, targets)[0]

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'classic japanese jazz', 'future funk', 'future funk', 'future funk', 'chill r&b', 'canadian hip hop', 'k-rap', 'k-rap', 'hip hop', 'indie r&b', 'alternative r&b', 'countrygaze', 'art pop', 'canadian hip hop', 'canadian hip hop', 'canadian hip hop', 'canadian hip hop', 'rap', 'rap', 'pop rap', 'rap', 'rap', 'rap', 'rap', 'rap', 'rap', 'rap', 'afrobeats', '', '', 'future funk', 'house', 'stutter house', 'asian american hip hop', '', '', 'meme', '', 'canadian hip hop', 'chill r&b', 'pop', 'indie rock', 'chill r&b', 'bedroom pop', 'bedroom pop', 'alternative pop rock', 'bedroom pop', '', 'deep new americana', '', 'indie pop', 'australian indie rock', 'uk alternative hip hop', 'focus beats', 'uk alternative hip hop', 'canadian contemporary r&b', 'bedroom pop', 'bedroom pop', 'indie hip hop', 'bedroom pop', 'indie pop', 'focus beats', 'indie pop', 'focus beats', 'uk alternative hip hop', 'indie pop', 'm

In [3]:
from src.python_files.record import get_weather_info, get_hour_info, get_weekday_info, get_month_info
import nest_asyncio
import asyncio

nest_asyncio.apply()

temp = float(asyncio.run(get_weather_info()))
time = float(get_hour_info())
day = float(get_weekday_info())
month = float(get_month_info())

In [4]:
import numpy as np
from rapidfuzz import fuzz

# time, dow, month, temp
cat_predict_value = pd.DataFrame([[time, day, month, temp]], columns=["time", "dayofweek", "month", "temp"])

class_predict = model.predict(cat_predict_value)

probabilities = model.predict_proba(cat_predict_value)

top_probs_idx = []
# idx, value
top_prob = [0, 0]

for prob_idx in range(len(probabilities)):
    if len(probabilities[prob_idx][0]) == 2 and probabilities[prob_idx][0][1] > .1:
        top_probs_idx.append(prob_idx)
        if top_prob[1] < probabilities[prob_idx][0][1]:
            top_prob[1] = probabilities[prob_idx][0][1]
            top_prob[0] = prob_idx
        

comparison_cats = []

for idx in top_probs_idx:
    print(mlb.classes_[idx])
    print(fuzz.ratio(mlb.classes_[top_prob[0]], mlb.classes_[idx]))
    if fuzz.token_set_ratio(mlb.classes_[top_prob[0]], mlb.classes_[idx]) > 50:
        comparison_cats.append(mlb.classes_[idx])
        
num_predict_cats = le.transform([comparison_cats[0]])
    
num_predict_value = pd.DataFrame([[time, day, month, temp, num_predict_cats]], columns=["time", "dayofweek", "month", "temp", "genres"])    

# Make predictions
y_pred = multi_output_gbr.predict(num_predict_value)
 
print(comparison_cats)
print(y_pred)
print((df.drop(columns=['time', 'dayofweek', 'month', 'temp', 'genres'])).keys())


 bedroom pop
19.047619047619047
5th wave emo
19.047619047619047
contemporary r&b
48.0
indie hip hop
54.54545454545454
indie pop
66.66666666666667
indie r&b
100.0
jersey club
30.000000000000004
lo-fi vgm
22.22222222222222
modern power pop
31.999999999999996
norwegian pop
27.27272727272727
pixel
28.57142857142857
pov: indie
52.63157894736843
r&b
50.0
rap
16.666666666666664
urban contemporary
22.22222222222222
vgm remix
22.22222222222222
['indie hip hop', 'indie pop', 'indie r&b', 'pov: indie', 'r&b']
[[ 5.03338936e-01  6.03946676e-01  4.59789130e-01 -2.50111908e-01
   7.02182072e-02  4.50195124e-01  5.45427630e-02  1.87382122e-01
   5.06613875e-01  1.10136780e+00  1.41021672e+05]]
Index(['popularity', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


In [5]:
dataset = pd.read_csv('../dataset.csv')

predicted_cols=['popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

ignore_cols = ["duration_ms", "popularity", "loudness"]

prediction_weighting = [
    1, #popularity
    1, #danceability
    1.5, #energy
    0, #loudness
    1, #speechiness
    .9, #acousticness
    .9, #instrumentalness
    .1, #liveness
    1.5, #valence
    1.75, #tempo
    1 #duration ms
]

# data = dataset
# 
# dataset["popularity"] = dataset["popularity"].apply(lambda pop: pop/100)
# dataset["tempo"] = dataset["tempo"].apply(lambda tpo: tpo/100)
# dataset["loudness"] = dataset["loudness"].apply(lambda ldn: ldn/40)

song_similarity_dict = {}

for i in range(len(data["songname"])):
    if data["songname"][i] not in song_similarity_dict.keys():
        accuracy_score = 0
        intersection = False
        popularity_thresh = False
        for col_idx in range(len(predicted_cols)):
            if predicted_cols[col_idx] not in ignore_cols:
                # need to figure out how to do this for best results
                accuracy_score += abs(pow(y_pred[0][col_idx] - data[predicted_cols[col_idx]][i], 2)) * prediction_weighting[col_idx]
        for genre in data["genres"][i]:
            for comparison_genre in comparison_cats: 
                if fuzz.token_set_ratio(genre, comparison_genre) >= 50:
                    intersection = True      
        if not intersection:
            accuracy_score += .5
        if data["popularity"][i] > y_pred[0][predicted_cols.index("popularity")] - 10:
            popularity_thresh = True
        song_artists = data["artist"][i]
            
        song_similarity_dict[data["songname"][i]] = {"accuracy" : accuracy_score, "intersection" : intersection, "popularity" : popularity_thresh, "artists" : song_artists, "data_idx" : i}
    

In [6]:
drop_score = 1
playlist_length = 50

best_names = []
best_artists = []

for x in range(playlist_length):
    best_score = 10000
    best_song_name = ""
    
    for songname in song_similarity_dict.keys():
        
        song = song_similarity_dict[songname]

        if song["accuracy"] < best_score and songname not in best_names and song["popularity"] - 10:
            best_score = song["accuracy"]
            best_song_name = songname
            best_artists_name = song["artists"]
        
    if best_score >= drop_score:
        break
    
    print(best_song_name + " score: " + str(best_score) + " by " + song_similarity_dict[best_song_name]["artists"])

    best_names.append(best_song_name)
    best_artists.append(song_similarity_dict[best_song_name]["artists"])
    
print(comparison_cats)
print(best_names)
print(best_artists)

I've Been In Love score: 0.03218308946150075 by Jungle.Channel Tres
Underdressed at the Symphony score: 0.03658847676717733 by  Faye Webster
Worth It. score: 0.04113036701983322 by RAYE
Bruise score: 0.04636738060034423 by BETWEEN FRIENDS
That's Life score: 0.06591694573808218 by Still Woozy
Tu Corazón Es Mío... score: 0.07137216982999987 by Kali Uchis
People Watching score: 0.0846923315988788 by  Conan Gray
Never Felt So Alone score: 0.09609763254612674 by  Labrinth
cardigan score: 0.10122723151849722 by Taylor Swift
Smith & Westin score: 0.10376808404759885 by Masego.TeaMarrr
Done With You - Live score: 0.10546483961225617 by  Omar Apollo
Reckless & Sweet score: 0.10551421893196135 by Amaarae
Passionfruit score: 0.1094922628188529 by Drake
Window score: 0.1149732357076695 by Still Woozy
Dead Weight score: 0.12119428718653809 by Jack Stauber
Lose Control score: 0.13546924007807265 by Amaria
Tadow score: 0.13729185061371707 by Masego.FKJ
Reflections score: 0.13761588952832837 by  The N

In [7]:
import os
from dotenv import load_dotenv
from src.python_files.create_playlist import update_playlist

load_dotenv()

playlist_id = os.getenv('PLAYLIST_ID')
update_playlist(best_names, best_artists, playlist_id)

# update_playlist(best_names, best_artists)

In [8]:
from src.python_files.create_playlist import get_recs

get_recs(best_names[0], best_artists[0])

[{'album': {'album_type': 'ALBUM',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/59oA5WbbQvomJz2BuRG071'},
     'href': 'https://api.spotify.com/v1/artists/59oA5WbbQvomJz2BuRG071',
     'id': '59oA5WbbQvomJz2BuRG071',
     'name': 'Jungle',
     'type': 'artist',
     'uri': 'spotify:artist:59oA5WbbQvomJz2BuRG071'}],
   'available_markets': ['AR',
    'AU',
    'AT',
    'BE',
    'BO',
    'BR',
    'BG',
    'CA',
    'CL',
    'CO',
    'CR',
    'CY',
    'CZ',
    'DK',
    'DO',
    'DE',
    'EC',
    'EE',
    'SV',
    'FI',
    'FR',
    'GR',
    'GT',
    'HN',
    'HK',
    'HU',
    'IS',
    'IE',
    'IT',
    'LV',
    'LT',
    'LU',
    'MY',
    'MT',
    'MX',
    'NL',
    'NZ',
    'NI',
    'NO',
    'PA',
    'PY',
    'PE',
    'PH',
    'PL',
    'PT',
    'SG',
    'SK',
    'ES',
    'SE',
    'CH',
    'TW',
    'TR',
    'UY',
    'US',
    'GB',
    'AD',
    'LI',
    'MC',
    'ID',
    'TH',
    'VN',
    'RO',
    'IL'

In [9]:
from tabulate import tabulate

RED = "\033[91m"
GREEN = "\033[92m"
RESET = "\033[0m"

pred_act = [["track", "accuracy"], ["", "0"]]

for col_idx in range(len(predicted_cols)):
    if predicted_cols[col_idx] not in ignore_cols:
        pred_act[0].append(predicted_cols[col_idx])
        pred_act[1].append(str(round(y_pred[0][col_idx], 3)))
    
outlier_dist = .07
    
for name in best_names:
    r = song_similarity_dict[name]["data_idx"]
    temp_arr = [round(song_similarity_dict[name]["accuracy"], 3)]
    for c in predicted_cols:
        if c not in ignore_cols:
            cr_data = data[c][r]
            if abs(float(pred_act[1][len(temp_arr) + 1]) - cr_data) < outlier_dist:
                temp_arr.append(str(data[c][r]))
            elif float(pred_act[1][len(temp_arr) + 1]) - cr_data > outlier_dist:
                temp_arr.append(RED + str(data[c][r]) + RESET)
            else:
                temp_arr.append(GREEN + str(data[c][r]) + RESET)
    temp_arr.insert(0, name[:20])
    pred_act.append(temp_arr)
        
print(tabulate(pred_act, headers='firstrow', tablefmt='plain'))
        

track                   accuracy    danceability    energy    speechiness    acousticness    instrumentalness    liveness    valence    tempo
                           0               0.604     0.46          0.07            0.45              0.055         0.187       0.507  1.101
I've Been In Love          0.032           [92m0.679[0m     0.506         0.0671          0.421             0.00216       [92m0.434[0m       [92m0.588[0m  1.14967
Underdressed at the        0.037           0.598     [91m0.372[0m         0.0274          0.479             6.54e-06      [91m0.116[0m       [91m0.432[0m  [92m1.1802[0m
Worth It.                  0.041           [92m0.736[0m     0.483         0.0682          0.505             1.39e-05      [92m0.351[0m       [92m0.606[0m  1.09932
Bruise                     0.046           [92m0.687[0m     0.407         0.0941          [91m0.292[0m             0             [91m0.0845[0m      0.485  1.03478
That's Life                0.066  