# Import libraries

In [39]:
import pandas as pd

# Data preprocessing

## Code to cast,remove duplicates and append data

### Read the data from the csv files

In [40]:
data1 = pd.read_csv("data/data_170k.csv")
data2 = pd.read_csv("data/data_114k.csv")
data3 = pd.read_csv("data/data_169k.csv")

print(data1.shape)
print(data2.shape)
print(data3.shape)

(170653, 19)
(114000, 19)
(169909, 19)


### Concatenate data1, data2 and data3

In [41]:
# Concatenate data1, data2 and data3
data_appended = pd.concat([data1, data2, data3])
print(data_appended.shape)

(454562, 19)


### Cast the 'explicit' column to a 1 and 0 value

In [42]:
# Changes all true and false value to 1 and 0 value
data_appended['explicit'] = data_appended['explicit'].apply(lambda x: 1 if x == True else 0)
# Print the all the type of value of the column 'explicit'
print(data_appended['explicit'].unique())

[0 1]


### Remove duplicates

### Remove duplicates based on the 'id' column

In [43]:
data_appended.drop_duplicates(subset=['id'], keep='first', inplace=True)
print(data_appended.shape)

(271839, 19)


### Remove rows with NaN values in the 'artists' column

In [44]:
data_appended = data_appended[~data_appended['artists'].apply(lambda x: isinstance(x, float))]
print(data_appended.shape)

(271837, 19)


### Cast the 'artists' column to a list of name(s)

In [45]:
# Iterate through the 'artists' column and ensure the format is ['artist1', 'artist2', ...]
def format_artists(artists):
    if isinstance(artists, str) and not artists.startswith('['):
        # Split the string by ';' and cast in the format ['artist1', 'artist2', ...]
        return [artist.strip() for artist in artists.split(';')]
    else:
        return artists

# Print some exemple of the 'artists' column before the cast
print("Before Cast:")
print(data_appended.iloc[0].to_dict()['artists'])
print(data_appended.iloc[107281].to_dict()['artists'])
print(data_appended.iloc[198057].to_dict()['artists'])
print(data_appended.iloc[212638].to_dict()['artists'])

data_appended['artists'] = data_appended['artists'].apply(format_artists)

# Print the same exemple of the 'artists' column after the cast
print("\nAfter Cast:")
print(data_appended.iloc[0].to_dict()['artists'])
print(data_appended.iloc[107281].to_dict()['artists'])
print(data_appended.iloc[198057].to_dict()['artists'])
print(data_appended.iloc[212638].to_dict()['artists'])

Before Cast:
['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']
['Ghost']
Unodavid
Wisin & Yandel;Chris Brown;T-Pain

After Cast:
['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']
['Ghost']
['Unodavid']
['Wisin & Yandel', 'Chris Brown', 'T-Pain']


### Append the data to a list

In [46]:
data_appended.to_csv("data/data_appended.csv", index=False)

# Create the list of dictionaries

In [47]:
music_list = []
for index, row in data_appended.iterrows():
    features_info = row.to_dict()
    music_list.append(features_info)

print(music_list[:5])

[{'valence': 0.0594, 'year': 1921, 'acousticness': 0.982, 'artists': "['Sergei Rachmaninoff', 'James Levine', 'Berliner Philharmoniker']", 'danceability': 0.279, 'duration_ms': 831667.0, 'energy': 0.211, 'explicit': 0, 'id': '4BJqT0PrAfrxzMOxytFOIz', 'instrumentalness': 0.878, 'key': 10.0, 'liveness': 0.665, 'loudness': -20.096, 'mode': 1.0, 'name': 'Piano Concerto No. 3 in D Minor, Op. 30: III. Finale. Alla breve', 'popularity': 4.0, 'speechiness': 0.0366, 'tempo': 80.954, 'genre': 'unknown'}, {'valence': 0.963, 'year': 1921, 'acousticness': 0.732, 'artists': "['Dennis Day']", 'danceability': 0.8190000000000001, 'duration_ms': 180533.0, 'energy': 0.341, 'explicit': 0, 'id': '7xPhfUan2yNtyFG0cUWkt8', 'instrumentalness': 0.0, 'key': 7.0, 'liveness': 0.16, 'loudness': -12.441, 'mode': 1.0, 'name': 'Clancy Lowered the Boom', 'popularity': 5.0, 'speechiness': 0.415, 'tempo': 60.93600000000001, 'genre': 'unknown'}, {'valence': 0.0394, 'year': 1921, 'acousticness': 0.961, 'artists': "['KHP K