In [41]:
import seaborn as st 
import pandas as pd
import chromadb

def refresh():
    return pd.read_csv('top_50_2023.csv')
df = refresh()
df

Unnamed: 0,artist_name,track_name,is_explicit,album_release_date,genres,danceability,valence,energy,loudness,acousticness,instrumentalness,liveness,speechiness,key,tempo,mode,duration_ms,time_signature,popularity
0,Miley Cyrus,Flowers,False,2023-08-18,['pop'],0.706,0.632,0.691,-4.775,0.0584,7e-05,0.0232,0.0633,0,118.048,1,200600,4,94
1,SZA,Kill Bill,False,2022-12-08,"['pop', 'r&b', 'rap']",0.644,0.418,0.735,-5.747,0.0521,0.144,0.161,0.0391,8,88.98,1,153947,4,86
2,Harry Styles,As It Was,False,2022-05-20,['pop'],0.52,0.662,0.731,-5.338,0.342,0.00101,0.311,0.0557,6,173.93,0,167303,4,95
3,Jung Kook,Seven (feat. Latto) (Explicit Ver.),True,2023-11-03,['k-pop'],0.79,0.872,0.831,-4.185,0.312,0.0,0.0797,0.044,11,124.987,1,183551,4,90
4,Eslabon Armado,Ella Baila Sola,False,2023-04-28,"['corrido', 'corridos tumbados', 'sad sierreno...",0.668,0.834,0.758,-5.176,0.483,1.9e-05,0.0837,0.0332,5,147.989,0,165671,3,86
5,Taylor Swift,Cruel Summer,False,2019-08-23,['pop'],0.552,0.564,0.702,-5.707,0.117,2.1e-05,0.105,0.157,9,169.994,1,178427,4,99
6,Metro Boomin,Creepin' (with The Weeknd & 21 Savage),True,2022-12-02,['rap'],0.715,0.172,0.62,-6.005,0.417,0.0,0.0822,0.0484,1,97.95,0,221520,4,91
7,Rema,Calm Down (with Selena Gomez),False,2023-04-27,"['afrobeats', 'nigerian pop']",0.799,0.811,0.802,-5.196,0.429,0.00128,0.171,0.0371,11,107.008,1,239318,4,90
8,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",False,2023-01-11,"['argentine hip hop', 'pop venezolano', 'trap ...",0.778,0.498,0.632,-5.6,0.274,0.0,0.0915,0.0493,2,122.104,0,218289,4,85
9,Taylor Swift,Anti-Hero,False,2022-10-21,['pop'],0.637,0.533,0.643,-6.571,0.13,2e-06,0.142,0.0519,4,97.008,1,200690,4,92


## Pop-Extraction
Let's filter the data such that we are only including pop and pop-adjacent genres. Since it is impossible for us to this manually, we will use the chromadb to filter the data for us, through semantic analysis.

In [42]:
client = chromadb.Client()
collection = client.create_collection('genres')
collection.add(
    documents=['pop', 'ballad', 'jazz', 'country', 'traditional'],
    ids=['1', '2', '3', '4', '5']
)

"Finished adding documents to collection"

UniqueConstraintError: Collection genres already exists

"['pop']"

In [85]:
# Let's use the collection to filter the data
import ast 

def get_genres(genre_string: str) -> list:
    return ast.literal_eval(genre_string)

def process_genres(entry):
    genres = get_genres(entry)
    
    def is_valid(genres):
        results = collection.query(query_texts=genres, n_results=1)['documents']
        valid = False
        for result in results:
            if result[0] == 'pop':
                valid = True
                
        return valid
    return is_valid(genres)

df['is_pop'] = df.genres.apply(process_genres)
df.head(15)

Unnamed: 0,artist_name,track_name,is_explicit,album_release_date,genres,danceability,valence,energy,loudness,acousticness,instrumentalness,liveness,speechiness,key,tempo,mode,duration_ms,time_signature,popularity,is_pop
0,Miley Cyrus,Flowers,False,2023-08-18,['pop'],0.706,0.632,0.691,-4.775,0.0584,7e-05,0.0232,0.0633,0,118.048,1,200600,4,94,True
1,SZA,Kill Bill,False,2022-12-08,"['pop', 'r&b', 'rap']",0.644,0.418,0.735,-5.747,0.0521,0.144,0.161,0.0391,8,88.98,1,153947,4,86,True
2,Harry Styles,As It Was,False,2022-05-20,['pop'],0.52,0.662,0.731,-5.338,0.342,0.00101,0.311,0.0557,6,173.93,0,167303,4,95,True
3,Jung Kook,Seven (feat. Latto) (Explicit Ver.),True,2023-11-03,['k-pop'],0.79,0.872,0.831,-4.185,0.312,0.0,0.0797,0.044,11,124.987,1,183551,4,90,True
4,Eslabon Armado,Ella Baila Sola,False,2023-04-28,"['corrido', 'corridos tumbados', 'sad sierreno...",0.668,0.834,0.758,-5.176,0.483,1.9e-05,0.0837,0.0332,5,147.989,0,165671,3,86,False
5,Taylor Swift,Cruel Summer,False,2019-08-23,['pop'],0.552,0.564,0.702,-5.707,0.117,2.1e-05,0.105,0.157,9,169.994,1,178427,4,99,True
6,Metro Boomin,Creepin' (with The Weeknd & 21 Savage),True,2022-12-02,['rap'],0.715,0.172,0.62,-6.005,0.417,0.0,0.0822,0.0484,1,97.95,0,221520,4,91,False
7,Rema,Calm Down (with Selena Gomez),False,2023-04-27,"['afrobeats', 'nigerian pop']",0.799,0.811,0.802,-5.196,0.429,0.00128,0.171,0.0371,11,107.008,1,239318,4,90,True
8,Bizarrap,"Shakira: Bzrp Music Sessions, Vol. 53",False,2023-01-11,"['argentine hip hop', 'pop venezolano', 'trap ...",0.778,0.498,0.632,-5.6,0.274,0.0,0.0915,0.0493,2,122.104,0,218289,4,85,True
9,Taylor Swift,Anti-Hero,False,2022-10-21,['pop'],0.637,0.533,0.643,-6.571,0.13,2e-06,0.142,0.0519,4,97.008,1,200690,4,92,True


In [11]:
# Let's create a correlation matrix using the non-discrete columns inside the original dataframe
non_discrete_columns =('Medu', 'Fedu', 'traveltime', 'studytime', 'failures','famrel', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'score')
nd_df = pd.DataFrame(df, columns=non_discrete_columns)
nd_df.corr()

Unnamed: 0,Medu,Fedu,traveltime,studytime,failures,famrel,goout,Dalc,Walc,health,absences,score
Medu,1.0,0.626971,-0.17144,0.069349,-0.240133,-0.003967,0.069967,0.020341,-0.051924,-0.051914,0.102713,0.226116
Fedu,0.626971,1.0,-0.157267,-0.004909,-0.253591,-0.001369,0.049061,0.002427,-0.017925,0.009615,0.026982,0.176404
traveltime,-0.17144,-0.157267,1.0,-0.103097,0.092865,-0.016533,0.028046,0.135853,0.132782,0.008754,-0.014265,-0.134732
studytime,0.069349,-0.004909,-0.103097,1.0,-0.175752,0.039454,-0.060629,-0.193763,-0.254491,-0.078549,-0.060321,0.139799
failures,-0.240133,-0.253591,0.092865,-0.175752,1.0,-0.04421,0.12098,0.135167,0.143935,0.068448,0.061946,-0.376513
famrel,-0.003967,-0.001369,-0.016533,0.039454,-0.04421,1.0,0.064404,-0.077706,-0.113279,0.094008,-0.044431,0.021418
goout,0.069967,0.049061,0.028046,-0.060629,0.12098,0.064404,1.0,0.266512,0.415407,-0.012968,0.046053,-0.150704
Dalc,0.020341,0.002427,0.135853,-0.193763,0.135167,-0.077706,0.266512,1.0,0.646612,0.07613,0.112675,-0.067616
Walc,-0.051924,-0.017925,0.132782,-0.254491,0.143935,-0.113279,0.415407,0.646612,1.0,0.09464,0.134889,-0.086102
health,-0.051914,0.009615,0.008754,-0.078549,0.068448,0.094008,-0.012968,0.07613,0.09464,1.0,-0.031783,-0.084251
