In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np

In [2]:
# loading the Spotify dataset
data = pd.read_csv(filepath_or_buffer="spotify_final_dataset.csv")

In [3]:
# previewing the first five rows of the Spotify dataset
data.head(n=5)

Unnamed: 0,Position,Artist Name,Song Name,Days,Top 10 (xTimes),Peak Position,Peak Position (xTimes),Peak Streams,Total Streams
0,1,Post Malone,Sunflower SpiderMan: Into the SpiderVerse,1506,302.0,1,(x29),2118242,883369738
1,2,Juice WRLD,Lucid Dreams,1673,178.0,1,(x20),2127668,864832399
2,3,Lil Uzi Vert,XO TOUR Llif3,1853,212.0,1,(x4),1660502,781153024
3,4,J. Cole,No Role Modelz,2547,6.0,7,0,659366,734857487
4,5,Post Malone,rockstar,1223,186.0,1,(x124),2905678,718865961


In [4]:
# reading the hot 100 songs dataset into its own dataframe
hot_100_songs = pd.read_csv(filepath_or_buffer='hot_100_songs.csv')

In [5]:
# previewing the first five rows of the hot 100 songs dataset
hot_100_songs.head(n=5)

Unnamed: 0.1,Unnamed: 0,Artist Name,Title of Song
0,0,Brenda Lee,Rockin' Around The Christmas Tree
1,1,Mariah Carey,All I Want For Christmas Is You
2,2,Bobby Helms,Jingle Bell Rock
3,3,Wham!,Last Christmas
4,4,Burl Ives,A Holly Jolly Christmas


In [6]:
# dropping the "Unnamed: 0" column
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError once the column is gone.
hot_100_songs = hot_100_songs.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Keeping only the artist and song-name columns, the two needed for our analysis
data = data.loc[:, ["Artist Name", "Song Name"]]

In [8]:
# giving "Song Name" the same label the hot 100 dataset uses ("Title of Song")
# so both datasets share identical column names
data = data.rename({'Song Name': 'Title of Song'}, axis="columns")

In [9]:
# dropping all null values in our dataset
# dropna() returns a new dataframe rather than modifying `data`, so the result is
# assigned back; otherwise the rows with nulls stay in `data` for every later cell.
data = data.dropna()
data

Unnamed: 0,Artist Name,Title of Song
0,Post Malone,Sunflower SpiderMan: Into the SpiderVerse
1,Juice WRLD,Lucid Dreams
2,Lil Uzi Vert,XO TOUR Llif3
3,J. Cole,No Role Modelz
4,Post Malone,rockstar
...,...,...
11079,The Band Perry,If I Die Young
11080,Justin Timberlake,Not a Bad Thing
11081,Mike WiLL Made,It 23
11082,The Vamps,Somebody To You


In [10]:
def duplicate_remover(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a new dataframe with fully duplicated rows removed.

    Rows are compared across all columns; for each group of identical rows the
    first occurrence is kept, with its original index label. The input dataframe
    is left unmodified.
    """
    # drop_duplicates builds and returns a new dataframe, so `df` itself is untouched
    return df.drop_duplicates(keep="first")

# removing fully duplicated rows from the Spotify data and displaying the result
data = duplicate_remover(df=data)
data

Unnamed: 0,Artist Name,Title of Song
0,Post Malone,Sunflower SpiderMan: Into the SpiderVerse
1,Juice WRLD,Lucid Dreams
2,Lil Uzi Vert,XO TOUR Llif3
3,J. Cole,No Role Modelz
4,Post Malone,rockstar
...,...,...
11079,The Band Perry,If I Die Young
11080,Justin Timberlake,Not a Bad Thing
11081,Mike WiLL Made,It 23
11082,The Vamps,Somebody To You


In [11]:
# adding lower-cased copies of the artist and title columns to both dataframes
# so the two datasets can be compared case-insensitively
for frame in (data, hot_100_songs):
    frame['Artists_lower'] = frame['Artist Name'].str.lower()
    frame['Title_lower'] = frame['Title of Song'].str.lower()

In [12]:
# filtering our data to ensure that no song from hot_100_songs is contained in the spotify dataset
#
# A row is treated as "hot" only when its (artist, title) PAIR matches a hot-100 row.
# Checking the two columns with independent isin() calls would also drop songs whose
# artist is on the hot list and whose title merely coincides with a different
# artist's hot-100 title.
hot_pairs = hot_100_songs[['Artists_lower', 'Title_lower']].drop_duplicates()

# The left merge keeps every Spotify row in its original order; indicator=True adds a
# '_merge' column that is 'both' when the pair exists in hot_pairs and 'left_only'
# otherwise. hot_pairs is de-duplicated first so the merge cannot multiply rows.
flagged = data.merge(hot_pairs, on=['Artists_lower', 'Title_lower'], how='left', indicator=True)

filtered_data = (
    flagged[flagged['_merge'] == 'left_only']
    .drop(['Artists_lower', 'Title_lower', '_merge'], axis=1)
    .reset_index(drop=True)
)
filtered_data

Unnamed: 0,Artist Name,Title of Song
0,Post Malone,Sunflower SpiderMan: Into the SpiderVerse
1,Juice WRLD,Lucid Dreams
2,Lil Uzi Vert,XO TOUR Llif3
3,J. Cole,No Role Modelz
4,Post Malone,rockstar
...,...,...
10940,The Band Perry,If I Die Young
10941,Justin Timberlake,Not a Bad Thing
10942,Mike WiLL Made,It 23
10943,The Vamps,Somebody To You


In [15]:
# selecting a sample of 2500 songs from the dataset randomly
# random_state fixes the random draw, so re-running the notebook yields the same
# 2500 rows instead of a different sample each time.
data = filtered_data.sample(n=2500, random_state=42)

In [16]:
# resetting the index
# drop=True discards the old row labels instead of inserting them as an extra
# "index" column (same form as the reset_index call used for filtered_data).
data = data.reset_index(drop=True)

In [18]:
# saving our finalised data with 2500 "not hot songs"
# index=True is the to_csv default, spelled out here: the row index is written
# to the file as a leading column with an empty header.
data.to_csv("not_hot_songs.csv", index=True)