In [None]:
from src.python_files.manual_clustering import get_clustering_genres
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Load the listening history; the slice leaves out the final ten rows of the CSV.
data = pd.read_csv('../data.csv')[:-10]

# Bring the numeric columns onto smaller scales by dividing by a fixed constant.
for column, divisor in (("popularity", 100), ("tempo", 100), ("loudness", 40)):
    data[column] = data[column] / divisor

# Missing popularity values take the (already rescaled) column median.
data["popularity"] = data["popularity"].fillna(data["popularity"].median())

# Missing genre strings become "", then each string is split on "." into a list.
data["genres"] = data["genres"].fillna("").str.split(".")

# Working copy without the identifying columns.
df = pd.DataFrame(data).drop(columns=['songname', 'artist', 'id'])

# Model inputs are the listening-context columns; everything else except the
# genre lists is a regression target.
context_columns = ['time', 'dayofweek', 'month', 'temp']
features = df[context_columns]
targets = df.drop(columns=context_columns + ['genres'])

# spotify_genre_seeds = get_genres()['genres']

# Seed labels that the raw genre strings are collapsed onto.
spotify_genre_seeds = get_clustering_genres()

# Replace each song's raw genres with the seed labels found as substrings of
# them. A seed is listed once per raw genre that contains it, so repeats are
# possible when several raw genres share a seed.
data['genres'] = data['genres'].apply(
    lambda song_genres: [
        seed
        for song_genre in song_genres
        for seed in spotify_genre_seeds
        if seed in song_genre
    ]
)

In [None]:
from src.python_files.num_predict import train_gradient_boosting_regressor
from src.python_files.cat_predict import train_random_forest_classifier

# Turn each song's genre list into a row of 0/1 indicators, one column per
# genre label seen in the data (1 = the song carries that genre).
mlb = MultiLabelBinarizer()
classifier_target = pd.DataFrame(
    mlb.fit_transform(data['genres']),
    columns=mlb.classes_,
)

# Genre model: listening context (time, day of week, month, temperature) -> genre indicators.
model = train_random_forest_classifier(features, classifier_target)[0]

# Characteristics model: listening context + genre indicators -> song attributes.
# The widened input frame gets its own name instead of overwriting `features`:
# rebinding `features` meant that re-running this cell trained the genre model
# on inputs that already contained the genre columns it is supposed to predict.
regressor_features = pd.concat([features, classifier_target], axis=1)
multi_output_gbr = train_gradient_boosting_regressor(regressor_features, targets)[0]

In [None]:
from src.python_files.record import get_weather_info, get_hour_info, get_weekday_info, get_month_info
import nest_asyncio
import asyncio

# Patch asyncio so asyncio.run() can be called from inside the notebook's
# already-running event loop.
nest_asyncio.apply()

# Snapshot of the current listening context, each value coerced to float.
# The weather lookup is a coroutine and is driven to completion here.
temp = float(asyncio.run(get_weather_info()))
time, day, month = (
    float(read_value())
    for read_value in (get_hour_info, get_weekday_info, get_month_info)
)

In [None]:
import numpy as np

# time, dow, month, temp
current_context = pd.DataFrame(
    [[time, day, month, temp]],
    columns=["time", "dayofweek", "month", "temp"],
)
cat_predict_value = current_context

# Hard class predictions for the current context (kept for inspection).
class_predict = model.predict(cat_predict_value)

# Per-genre class probabilities. Indexed below as probabilities[genre][sample][class],
# i.e. presumably the list-of-arrays layout of a multi-output scikit-learn
# classifier -- confirm against train_random_forest_classifier.
probabilities = model.predict_proba(cat_predict_value)

genre_prediction = []
# [genre index, probability] of the most likely genre above the cutoff.
# Stays at [0, 0] when no genre clears the cutoff, in which case the first
# genre column is used as the "top" genre further down.
top_prob = [0, 0]

# Minimum probability for a genre to count as predicted.
prob_cutoff = .125

for prob_idx, genre_probs in enumerate(probabilities):
    sample_probs = genre_probs[0]
    if len(sample_probs) == 2:
        # Two entries means both "absent" and "present" were seen in training;
        # index 1 is taken as the probability of "present".
        present_prob = sample_probs[1]
        is_predicted = present_prob > prob_cutoff
        if is_predicted and present_prob > top_prob[1]:
            top_prob = [prob_idx, present_prob]
        # Flag the genre only when it clears the cutoff. Previously a 1 was
        # appended for every two-class genre regardless of its probability,
        # so nearly every genre ended up marked as predicted.
        genre_prediction.append(1 if is_predicted else 0)
    else:
        # Only one class was seen for this genre, so there is no usable
        # "present" probability; treat it as not predicted.
        genre_prediction.append(0)

genre_prediction = np.array(genre_prediction).reshape(1, -1)

genre_df = pd.DataFrame(genre_prediction, columns=mlb.classes_)


# uses the predicted genres along with the context features to predict song characteristics
num_predict_value = pd.concat([current_context, genre_df], axis=1)

# Make predictions
y_pred = multi_output_gbr.predict(num_predict_value)

# Adds some noise in an attempt to make playlists more diversified.
# Unseeded, so repeated runs give different playlists.
noise = np.random.normal(0, scale=0.1, size=y_pred.shape)
y_pred = y_pred + noise

genre_prediction_text = list(mlb.inverse_transform(genre_prediction)[0])
top_predicted_genre = genre_df.columns[top_prob[0]]

print(genre_prediction_text)


In [None]:
from src.python_files.manual_clustering import weight_genres, cluster_songs_by_genre
from sklearn.preprocessing import MinMaxScaler

# Larger song catalogue. It is loaded here but not referenced by the rest of
# the code visible in this notebook, which keeps scoring songs from `data`.
dataset = pd.read_csv('../dataset.csv')

# Song attributes that are compared against the model's predictions.
predicted_cols = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

# Keep only the attribute columns, rescale each to 0-1, and wrap the result
# back into a DataFrame (column labels become plain integers at this point).
cleaned_df = pd.DataFrame(
    MinMaxScaler().fit_transform(
        df.drop(columns=['genres', 'temp', 'time', 'dayofweek', 'month', 'duration_ms', 'popularity'])
    )
)

# Independent NumPy copy of the same values for faster operations.
cleaned_np = cleaned_df.to_numpy(copy=True)

# Group songs by genre, derive per-genre weights, then pick the weights that
# belong to the top predicted genre.
genre_song_dict = cluster_songs_by_genre(data)
weighted_genres = weight_genres(genre_song_dict, cleaned_np, cleaned_df)
prediction_weighting = weighted_genres[top_predicted_genre]

# Filled in later: song name -> similarity details.
song_similarity_dict = {}

def calc_similarity_ratings(data, row_idx=None):
    """
    Scores how far one song's attributes are from the predicted attributes.

    For every column in ``predicted_cols`` the gap between the prediction and
    the song's value is weighted by ``prediction_weighting`` and summed. When
    both values lie on the same side of 0.5 the absolute gap is used,
    otherwise the squared gap. A lower total means a closer match.

    :param data: DataFrame holding the ``predicted_cols`` columns plus
        ``popularity`` and ``artist``.
    :param row_idx: Index label of the song to score. When omitted, the
        notebook-level loop variable ``i`` is used, which is how the function
        originally located its row.
    :return: Tuple of (accuracy score, whether popularity is above 0.5,
        the song's artist value).
    """
    if row_idx is None:
        row_idx = i

    # y_pred holds one value per regressor output. When its width matches
    # `targets`, look predictions up by column name so the comparison does not
    # rely on `predicted_cols` happening to be in the regressor's output order
    # (`targets` also contains columns such as popularity and duration_ms).
    # Assumes the regressor emits outputs in `targets` column order -- confirm
    # against train_gradient_boosting_regressor. Otherwise fall back to the
    # original positional pairing.
    if len(y_pred[0]) == len(targets.columns):
        predicted_by_col = dict(zip(targets.columns, y_pred[0]))
    else:
        predicted_by_col = dict(zip(predicted_cols, y_pred[0]))

    accuracy_score = 0
    for col_idx, col in enumerate(predicted_cols):
        predicted = predicted_by_col[col]
        actual = data[col][row_idx]
        # NOTE(review): weights are still paired by position; presumably they
        # follow the column order of the scaled attribute frame -- verify.
        weight = prediction_weighting[col_idx]
        gap = abs(predicted - actual)

        # need to figure out how to do this for best results
        same_side = (predicted > .5 and actual > .5) or (predicted < .5 and actual < .5)
        if same_side:
            accuracy_score += gap * weight
        else:
            accuracy_score += gap ** 2 * weight

    popularity_thresh = bool(data["popularity"][row_idx] > .5)
    song_artists = data["artist"][row_idx]

    return accuracy_score, popularity_thresh, song_artists

# makes a dict of songs and their similarity ratings; the first row seen for a
# song name wins, later rows with the same name are skipped
for i in range(len(data["songname"])):
    current_songname = data["songname"][i]
    if current_songname not in song_similarity_dict:
        accuracy_score, popularity_thresh, song_artists = calc_similarity_ratings(data, i)
        song_similarity_dict[current_songname] = {"accuracy" : accuracy_score, "popularity" : popularity_thresh, "artists" : song_artists, "data_idx" : i}
    

In [None]:
def get_closest_song(drop_score):
    """
    Finds the not-yet-picked, popular song whose accuracy score is lowest.

    Scans ``song_similarity_dict`` and skips songs whose name is already in
    ``best_names`` or whose ``"popularity"`` flag is false. A lower accuracy
    score means a closer match to the prediction.

    :param drop_score: scores at or above this value are considered too far
        from the prediction to be worth adding
    :return: ``(song name, artists)`` of the closest song, or
        ``("DROP", "DROP")`` when no candidate is left or the closest
        candidate's score is at or above ``drop_score``
    """
    best_score = float("inf")
    best_song_name = None
    best_artists_name = ""

    for songname, song in song_similarity_dict.items():
        # Skip songs that were already chosen or that miss the popularity bar.
        if songname in best_names or not song["popularity"]:
            continue

        if song["accuracy"] < best_score:
            best_score = song["accuracy"]
            best_song_name = songname
            best_artists_name = song["artists"]

    # An exhausted candidate pool is reported as DROP as well; previously a
    # large drop_score with no candidates returned empty strings, which the
    # caller would then try to use as a song name.
    if best_song_name is None or best_score >= drop_score:
        return "DROP", "DROP"
    return best_song_name, best_artists_name
    

In [None]:
# Target playlist size, and the score at which a candidate is considered too
# poor a match to keep filling the playlist.
playlist_length = 50
drop_score = 1

# Parallel lists: chosen song names and their artists.
best_names = []
best_artists = []

# Closest match before anything has been picked.
top_song, top_artist = get_closest_song(drop_score)

# Repeatedly take the closest remaining song; get_closest_song skips names
# already in best_names, so each pass yields a new song until it reports DROP.
for x in range(playlist_length):
    best_song_name, best_artists_name = get_closest_song(drop_score)

    if best_song_name != "DROP":
        best_names.append(best_song_name)
        best_artists.append(song_similarity_dict[best_song_name]["artists"])
        continue

    break

print(best_names)

In [None]:
import os
from dotenv import load_dotenv
from src.python_files.spotify_helper import update_playlist

# Load environment variables via python-dotenv before reading PLAYLIST_ID.
load_dotenv()

# Push the selected songs to the Spotify playlist named by PLAYLIST_ID.
# The value is None when the variable is not set.
playlist_id = os.environ.get('PLAYLIST_ID')
update_playlist(best_names, best_artists, playlist_id)

# update_playlist(best_names, best_artists)

In [None]:
from tabulate import tabulate

# Prints a table comparing the predicted attributes (first data row) with the
# attributes of every song chosen for the playlist.

# ANSI colour codes used to flag values that sit far from the prediction.
RED = "\033[91m"
GREEN = "\033[92m"
RESET = "\033[0m"

# Look predictions up by column name when y_pred has one value per `targets`
# column, so each value is shown under the right header even if
# `predicted_cols` is not in the regressor's output order. Assumes the
# regressor emits outputs in `targets` column order -- confirm against
# train_gradient_boosting_regressor. Otherwise pair by position as before.
if len(y_pred[0]) == len(targets.columns):
    predicted_by_col = dict(zip(targets.columns, y_pred[0]))
else:
    predicted_by_col = dict(zip(predicted_cols, y_pred[0]))

# Predictions rounded for display; the same rounded values drive the colouring.
rounded_predictions = [round(predicted_by_col[col], 3) for col in predicted_cols]

pred_act = [
    ["track", "accuracy", *predicted_cols],
    ["", "0", *(str(value) for value in rounded_predictions)],
]

# A song value further than this from the prediction is coloured.
outlier_dist = .07

for name in best_names:
    r = song_similarity_dict[name]["data_idx"]
    temp_arr = [name[:20], round(song_similarity_dict[name]["accuracy"], 3)]
    for c, predicted in zip(predicted_cols, rounded_predictions):
        cr_data = data[c][r]
        difference = predicted - cr_data
        if abs(difference) < outlier_dist:
            # Close to the prediction: plain text.
            temp_arr.append(str(cr_data))
        elif difference > outlier_dist:
            # Song value is well below the prediction.
            temp_arr.append(RED + str(cr_data) + RESET)
        else:
            # Song value is well above the prediction.
            temp_arr.append(GREEN + str(cr_data) + RESET)
    pred_act.append(temp_arr)

print(tabulate(pred_act, headers='firstrow', tablefmt='plain'))