# Import libraries

In [None]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Data preprocessing

### Read the data from the csv files

In [None]:
# Load the three raw CSV exports in a fixed order, so data1/data2/data3 keep
# pointing at the 170k, 114k and 169k files respectively.
data1, data2, data3 = [
    pd.read_csv(csv_path)
    for csv_path in (
        "data/data_170k.csv",
        "data/data_114k.csv",
        "data/data_169k.csv",
    )
]
# Print the (rows, columns) shape of each frame, one per line.
print(*(frame.shape for frame in (data1, data2, data3)), sep="\n")

### Concatenate data1, data2 and data3

In [None]:
# Stack data1, data2 and data3 row-wise into a single table.
# ignore_index=True gives the result a fresh 0..n-1 row index. Without it
# each source frame keeps its own row labels, so the same label appears
# several times in the result and label-based selection becomes ambiguous.
data_appended = pd.concat([data1, data2, data3], ignore_index=True)
print(data_appended.shape)

### Cast the 'explicit' column to a 1 and 0 value

In [None]:
# Normalise 'explicit' to integer flags: a value equal to True becomes 1,
# anything else becomes 0.
data_appended['explicit'] = data_appended['explicit'].apply(
    lambda flag: int(flag == True)  # noqa: E712 - equality (not identity) is intended
)
# Show which distinct values remain in the column after the conversion.
print(data_appended['explicit'].unique())

### Remove duplicates

### Remove duplicates based on the 'id' column

In [None]:
# De-duplicate in place on the 'id' column; the default keep='first' retains
# the first row seen for every id.
data_appended.drop_duplicates(subset='id', inplace=True)
print(data_appended.shape)

### Remove rows with NaN values in the 'artists' column

In [None]:
# Drop every row whose 'artists' value is missing.
# notna() tests for missing values directly instead of inferring them from
# the Python type of the cell. .copy() detaches the filtered result from the
# parent frame, so columns assigned to data_appended later on do not trigger
# pandas' SettingWithCopyWarning.
data_appended = data_appended[data_appended['artists'].notna()].copy()
print(data_appended.shape)

### Change Genre to a list of genres

In [None]:
# Preview the 'artists' and 'genre' columns as they currently are.
print(data_appended.loc[:, ['artists', 'genre']].head())

# Table providing an 'artists' column and a 'genres' column.
data_w_genres = pd.read_csv("data/data_w_genres.csv")

# Lookup: 'artists' value -> matching 'genres' value
# (when a key repeats, the last row wins).
artist_genre = {
    name: genre_cell
    for name, genre_cell in zip(data_w_genres['artists'], data_w_genres['genres'])
}


def _artist_names(raw_artists):
    """Return the artist names held in one 'artists' cell as a list.

    Accepts a real list/tuple, the string form of a Python list
    (``"['A', 'B']"``) or a ``;``-separated string (``"A;B"``).
    Any other value yields an empty list.
    """
    if isinstance(raw_artists, (list, tuple)):
        return [str(name) for name in raw_artists]
    if not isinstance(raw_artists, str):
        return []

    text = raw_artists.strip()
    if text.startswith('[') and text.endswith(']'):
        # literal_eval understands the quoting, so names containing commas or
        # apostrophes come out intact (plain splitting would break them).
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(name) for name in parsed]
        # Not a valid literal: strip the brackets and quotes by hand.
        pieces = text[1:-1].split(',')
        return [piece.strip().strip('\'"') for piece in pieces if piece.strip()]

    return [name.strip() for name in text.split(';') if name.strip()]


def add_genre(row):
    """Collect the genre entries of every artist credited on *row*.

    The row itself is not modified.

    Args:
        row: A mapping-like record (for example a DataFrame row) that has an
            'artists' entry.

    Returns:
        The distinct ``artist_genre`` values of the row's artists, joined
        with ``', '`` in sorted order, or an empty list when none of the
        artists is a key of ``artist_genre``.
    """
    # NOTE(review): the result is a str when genres are found and a list when
    # not; that mixed return type is kept unchanged to preserve behaviour.
    matched = {
        artist_genre[name]
        for name in _artist_names(row['artists'])
        if name in artist_genre
    }
    # sorted() makes the joined text reproducible; raw set order is arbitrary.
    return ', '.join(sorted(matched)) if matched else []


# Rebuild the 'genre' column by running add_genre once per row.
data_appended['genre'] = data_appended.apply(add_genre, axis='columns')

# Preview the result next to the artists it was derived from.
print(data_appended.loc[:, ['artists', 'genre']].head())

### Cast the 'artists' column to a list of name(s)

In [None]:
# Normalise one 'artists' cell into a real Python list: ['artist1', 'artist2', ...]
def format_artists(artists):
    """Convert one 'artists' value into a list of artist names.

    Args:
        artists: A cell of the 'artists' column: a ``;``-separated string
            (``"A;B"``), the string form of a list (``"['A', 'B']"``), or
            any other object.

    Returns:
        A list of names when the input is a string that can be parsed.
        The input itself, unchanged, when it is not a string or when it
        starts with ``[`` but is not a valid list literal.
    """
    if not isinstance(artists, str):
        return artists

    if artists.startswith('['):
        # Turn the text form of a list into an actual list, so that later
        # explode() calls see one entry per artist instead of one string.
        try:
            parsed = ast.literal_eval(artists)
        except (ValueError, SyntaxError, TypeError):
            return artists
        return list(parsed) if isinstance(parsed, (list, tuple)) else artists

    # Split the string by ';' and trim the whitespace around every name.
    return [artist.strip() for artist in artists.split(';')]


# Row positions used to spot-check the conversion.
sample_positions = (0, 107281, 198057, 212638)

# Print some examples of the 'artists' column before the cast
print("Before Cast:")
for position in sample_positions:
    print(data_appended.iloc[position].to_dict()['artists'])

data_appended['artists'] = data_appended['artists'].apply(format_artists)

# Print the same examples of the 'artists' column after the cast
print("\nAfter Cast:")
for position in sample_positions:
    print(data_appended.iloc[position].to_dict()['artists'])

### Calculate the number of different artists

In [None]:
# Expand list-valued cells to one row per entry, then keep the distinct values.
one_artist_per_row = data_appended['artists'].explode()
artists = one_artist_per_row.unique()
print(f"Number of different artists: {len(artists)}")

### Save the data to a CSV file

In [None]:
# Write the current table to disk; index=False leaves the row index out of the file.
data_appended.to_csv(path_or_buf="data/data_appended.csv", index=False)

# Create the list of dictionaries

In [None]:
# One plain dict per row of data_appended: column name -> value.
music_list = [track.to_dict() for _, track in data_appended.iterrows()]
# Show the first five entries as a sanity check.
print(music_list[:5])

## Visualize the data

In [None]:
# Artist with the most music in each year (rows whose year is 0 are left out)
valid_year_mask = data_appended['year'] != 0
filtered_data = data_appended[valid_year_mask]
# List-valued 'artists' cells are expanded to one row per artist.
exploded_data = filtered_data.explode('artists')
artist_year_counts = (
    exploded_data
    .groupby(['year', 'artists'])
    .size()
    .reset_index(name='Count')
)
# Row label of the highest count within every year.
top_rows = artist_year_counts.groupby('year')['Count'].idxmax()
top_artist_per_year = artist_year_counts.loc[top_rows]
# Legend label combining the artist and the year.
top_artist_per_year['artist_year'] = top_artist_per_year.apply(
    lambda row: f"{row['artists']} ({row['year']})", axis=1
)
plt.figure(figsize=(30, 10))
sns.barplot(x='year', y='Count', hue='artist_year', data=top_artist_per_year)
plt.title('Artist with the most music per year')
plt.xlabel('Year')
plt.ylabel('Number of musics')
# Legend is anchored below the axes and laid out in four columns.
plt.legend(title='Artistes', bbox_to_anchor=(0, -0.1), loc='upper left', ncol=4)
plt.xticks(rotation=90)

# Number of songs per year (rows whose year is 0 are left out)
has_valid_year = data_appended['year'] != 0
filtered_data = data_appended[has_valid_year]
tracks_per_year = filtered_data.groupby('year').size()
music_count_df = tracks_per_year.reset_index(name='Count')
print("Total of musics with a valid year", music_count_df['Count'].sum())
plt.figure(figsize=(10, 5))
sns.lineplot(x='year', y='Count', data=music_count_df)
plt.title('Number of musics by valid year')
plt.xlabel('Year')
plt.ylabel('Number of musics')

#-- Subplot 1--
fig1, axes1 = plt.subplots(2, 2, figsize=(16, 8))

# One histogram per panel: (target axes, values, panel title, x-axis label).
histogram_panels = (
    # Duration of the songs (milliseconds divided by 60000 -> minutes)
    (axes1[0, 0], data_appended['duration_ms'] / 60000,
     'Distribution of the duration of the musics', 'Duration (minutes)'),
    # Popularity of the songs
    (axes1[0, 1], data_appended['popularity'],
     'Distribution of the popularity of the musics', 'Popularity'),
    # Energy of the songs
    (axes1[1, 0], data_appended['energy'],
     'Distribution of the energy of the musics', 'Energy'),
    # Tempo of the songs
    (axes1[1, 1], data_appended['tempo'],
     'Distribution of the tempo of the musics', 'Tempo'),
)

# Every panel uses the same histogram settings and the same y-axis label.
for panel_ax, panel_values, panel_title, panel_xlabel in histogram_panels:
    sns.histplot(panel_values, bins=30, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Number of musics')

plt.tight_layout()
#-- Subplot 1--

#-- Subplot 2--
fig2, axes2 = plt.subplots(1, 3, figsize=(16, 8))

# Artists with the most music
artist_counts = data_appended['artists'].explode().value_counts().head(10)
sns.barplot(x=artist_counts.values, y=artist_counts.index, ax=axes2[0])
axes2[0].set_title('Top 10 Artists with the most music')
axes2[0].set_xlabel('Number of musics')
axes2[0].set_ylabel('Artist')

# Number of musics by genre, excluding rows without any genre.
# Comparing the string form catches both the text '[]' and an actual empty
# list, which is what add_genre returns when no artist is matched.
filtered_data = data_appended[data_appended['genre'].astype(str) != '[]']
# NOTE(review): every non-empty 'genre' value is a single joined string, so
# explode() leaves it whole and the counts are per distinct string rather
# than per individual genre - confirm this is the intended granularity.
genre_counts = filtered_data['genre'].explode().value_counts().head(10)
sns.barplot(x=genre_counts.values, y=genre_counts.index, ax=axes2[1])
axes2[1].set_title('Number of musics by genre')
axes2[1].set_xlabel('Number of musics')
axes2[1].set_ylabel('Genre')

# Explicit content
explicit_counts = data_appended['explicit'].value_counts()
sns.barplot(x=explicit_counts.index, y=explicit_counts.values, ax=axes2[2])
# These labels belong to the third panel (index 2). Setting them on
# axes2[1] would overwrite the genre panel and leave this one unlabeled.
axes2[2].set_title('Distribution of explicit content')
axes2[2].set_xlabel('Explicit (True/False)')
axes2[2].set_ylabel('Number of musics')

plt.tight_layout()
#-- Subplot 2--

# Display the graph
plt.show()