In [None]:
import pandas as pd
import datetime
import functions as f

In [None]:
# Load the cleaned Spotify top-songs dataset; the path is relative to the
# directory the notebook is run from.
url = 'top_spotify_songs_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Load the cleaned music-festivals dataset; the path is relative to the
# directory the notebook is run from.
url2 = 'music_festivals_cleaned.csv'
df2 = pd.read_csv(filepath_or_buffer=url2)

In [None]:
#ID generator
# Each call asks the project helper `f.generate_id` to build the named ID column
# (third argument) from the listed key column(s) (second argument).
# NOTE(review): the return values are discarded, so the helper presumably adds
# the column to the passed frame in place — confirm in functions.py.
f.generate_id(df,['album_name'],'album_id')
f.generate_id(df2,['age_group','visitor_demographics'],'demographic_id')
# country_id is generated separately for each frame here, from different key
# columns; the region-table cell below re-maps both frames to one shared mapping.
f.generate_id(df2,['country','currency'],'country_id')
f.generate_id(df,['country'],'country_id')
f.generate_id(df2,['genre'],'genre_id')
f.generate_id(df,['snapshot_date'],'date_id')

In [None]:
#DataFrame creations for table exporting

# Region lookup table: one row per distinct (country_id, country, currency)
# combination found in the festivals data.
region_df = df2[['country_id','country','currency']].drop_duplicates()

# country name -> country_id as defined by the festivals data. If a country
# occurs with more than one country_id, to_dict() keeps the last one seen.
country_id_mapping = (
    region_df[['country_id', 'country']]
    .drop_duplicates()
    .set_index('country')['country_id']
    .to_dict()
)

# Re-key both frames through the same mapping so their country_id values agree.
df['country_id'] = df['country'].map(country_id_mapping)
df2['country_id'] = df2['country'].map(country_id_mapping)

# Keep only songs whose country also occurs in the festivals data. map() yields
# NaN for unmapped countries, so filter on country_id alone: a bare dropna()
# would also discard rows that merely have a missing value in an unrelated column.
df = df.dropna(subset=['country_id'])
df


In [None]:
# Display the Spotify songs frame.
df

In [None]:
# Display the festivals frame.
df2

In [None]:
# Keep only the chart entries that held the top daily position (daily_rank == 1).
is_number_one = df['daily_rank'] == 1
rank_1 = df[is_number_one]
rank_1.head()

In [None]:
# For every country, take the most frequent 'artists' value among its #1 entries.
# mode() can return several values on a tie; iloc[0] keeps the first of them.
most_common_artists_rank_1 = (
    rank_1
    .groupby('country')['artists']
    .agg(lambda artist_values: artist_values.mode().iloc[0])
)

In [None]:
# Display the most common #1 'artists' value per country.
most_common_artists_rank_1

In [None]:
# Break the ', '-separated 'artists' string into one column per credited artist
# (column 0 holds the first-listed artist), then count the distinct names that
# occur in each position.
df_artist = df['artists'].str.split(pat=', ', expand=True)
df_artist.nunique()

In [None]:
# Prepend the first three split artist columns to the song data: the
# first-listed artist plus up to two further credited artists. All remaining
# split columns are left out.
artist_column_names = {0: 'main_artist', 1: 'feat_1', 2: 'feat_2'}

# Select the wanted columns by position inside df_artist instead of dropping a
# hard-coded positional range (3:10) from the concatenated frame. That range is
# only right when the split produced exactly ten columns: with fewer it removes
# real data columns, with more it leaves stray numbered columns behind.
leading_artists = df_artist.iloc[:, :3].rename(columns=artist_column_names)

# Dropping artist columns left over from a previous run keeps this cell
# re-runnable without stacking duplicate columns.
df = pd.concat(
    [
        leading_artists,
        df.drop(columns=list(artist_column_names.values()), errors='ignore'),
    ],
    axis=1,
)

In [None]:
# Display the Spotify frame with the artist columns prepended.
df

In [None]:
# Ten most frequent 'main_artist' values. Computed in this cell so that it also
# works on a fresh kernel run from top to bottom, instead of relying on a
# variable that is only assigned further down the notebook.
top_artists = df['main_artist'].value_counts().head(10)
top_artists

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the ten artists that occur most often as 'main_artist'.
top_artists = df['main_artist'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))

# Reverse the ranking so the most frequent artist is drawn as the top bar.
top_artists_reversed = top_artists.iloc[::-1]
bars = ax.barh(top_artists_reversed.index, top_artists_reversed.values, color='#1DB954')

# Annotate every bar with its count in thousands (e.g. '1.2k'), placed slightly
# past the end of the bar (2% of the longest bar).
for bar in bars:
    bar_length = bar.get_width()
    ax.text(
        bar_length + max(top_artists.values) * 0.02,
        bar.get_y() + bar.get_height() / 2,
        f"{bar_length/1000:.1f}k",
        va='center', ha='left', color='white', fontsize=12,
    )

# Strip the frame and the x ticks; the bar annotations carry the values.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_xticks([])
ax.set_yticks(range(len(top_artists)))  # one tick per artist

# Dark-green theme with white text.
ax.set_ylabel("Artists", fontsize=12, color='white')
ax.set_title("Top 10 Popular Artists Streamed", fontsize=14, color='white')
for surface in (ax, fig.patch):
    surface.set_facecolor('#013220')
plt.yticks(color='white')
# plt.savefig('top_artists.png')
plt.show()


In [None]:
# Ten most frequent song titles ('name'). Computed in this cell so that it also
# works on a fresh kernel run from top to bottom, instead of relying on a
# variable that is only assigned further down the notebook.
top_tracks = df['name'].value_counts().head(10)
top_tracks

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the ten song titles that occur most often in the data.
top_tracks = df['name'].value_counts().head(10)

# Title -> 'artists' value of the first row carrying that title.
track_artist_mapping = df.groupby('name')['artists'].first()

# Two-line tick labels: the title on top, its artists in parentheses below.
track_labels = []
for track in top_tracks.index:
    track_labels.append(f'{track}\n({track_artist_mapping[track]})')

fig, ax = plt.subplots(figsize=(10, 6))

# Reverse both sequences so the most frequent title is drawn as the top bar.
bars = ax.barh(list(reversed(track_labels)), top_tracks.iloc[::-1].values, color='#1DB954')

# Annotate every bar with its count in thousands (e.g. '1.2k'), placed slightly
# past the end of the bar (2% of the longest bar).
for bar in bars:
    bar_length = bar.get_width()
    ax.text(
        bar_length + max(top_tracks.values) * 0.02,
        bar.get_y() + bar.get_height() / 2,
        f"{bar_length/1000:.1f}k",
        va='center', ha='left', color='white', fontsize=12,
    )

# Strip the frame and the x ticks; the bar annotations carry the values.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_xticks([])
ax.set_yticks(range(len(top_tracks)))  # one tick per track

# Dark-green theme with white text.
ax.set_ylabel("Tracks", fontsize=12, color='white')
ax.set_title("Top 10 Popular Tracks Streamed", fontsize=14, color='white')
for surface in (ax, fig.patch):
    surface.set_facecolor('#013220')
plt.yticks(color='white')

# plt.savefig('top_tracks.png')
plt.show()


In [None]:
# Ten song titles that occur most often among the daily #1 entries.
# Note: this rebinds top_tracks, which earlier held the counts over all entries.
top_tracks = rank_1['name'].value_counts().head(10)

# One row per distinct (name, danceability, energy) combination for those titles.
is_top_track = rank_1['name'].isin(top_tracks.index)
top_tracks_df = rank_1.loc[is_top_track, ['name', 'danceability', 'energy']].drop_duplicates()

top_tracks_df


In [None]:
# Per-country mean of every numeric column; 'country' stays a regular column.
df_grouped = df.groupby(by='country', as_index=False).mean(numeric_only=True)
df_grouped.head()

In [None]:
import plotly.express as px

# Per-country mean of every numeric column; the map below colors by 'danceability'.
df_grouped = df.groupby('country', as_index=False).mean(numeric_only=True)

# Green color scale built around Spotify green (#1DB954): low values are light,
# high values are dark.
spotify_green_scale = [
    [0, "#98ff98"],    # light mint green for low values
    [0.5, "#1DB954"],  # Spotify green for mid values
    [1, "#0a5c2f"],    # dark green for high values
]

# Choropleth keyed on full country names.
fig = px.choropleth(
    data_frame=df_grouped,
    locations='country',
    locationmode='country names',
    color='danceability',
    title='Danceability by Country',
    color_continuous_scale=spotify_green_scale,
    projection='natural earth',
)

# Dark theme for the map itself.
geo_style = {
    'bgcolor': "#013220",       # dark green behind the map
    'showcoastlines': False,    # no coastline strokes
    'showland': True,
    'landcolor': "#333333",     # dark grey for land without data
    'showocean': True,
    'oceancolor': "#011D13",    # very dark green ocean
    'lakecolor': "#011D13",     # lakes match the ocean
    'projection_type': "natural earth",
}

fig.update_layout(
    width=800,
    height=600,
    title_font_size=18,
    geo=geo_style,
    paper_bgcolor='#013220',  # dark green around the map
    font_color='white',       # readable text on the dark background
)

fig.show()


In [None]:
import plotly.express as px

# Per-country mean of every numeric column; the map below colors by 'energy'.
df_grouped = df.groupby('country', as_index=False).mean(numeric_only=True)

# Green color scale built around Spotify green (#1DB954): low values are light,
# high values are dark.
spotify_green_scale = [
    [0, "#98ff98"],    # light mint green for low values
    [0.5, "#1DB954"],  # Spotify green for mid values
    [1, "#0a5c2f"],    # dark green for high values
]

# Choropleth keyed on full country names.
fig = px.choropleth(
    data_frame=df_grouped,
    locations='country',
    locationmode='country names',
    color='energy',
    title='Energy by Country',
    color_continuous_scale=spotify_green_scale,
    projection='natural earth',
)

# Dark theme for the map itself.
geo_style = {
    'bgcolor': "#013220",       # dark green behind the map
    'showcoastlines': False,    # no coastline strokes
    'showland': True,
    'landcolor': "#333333",     # dark grey for land without data
    'showocean': True,
    'oceancolor': "#011D13",    # very dark green ocean
    'lakecolor': "#011D13",     # lakes match the ocean
    'projection_type': "natural earth",
}

fig.update_layout(
    width=800,
    height=600,
    title_font_size=18,
    geo=geo_style,
    paper_bgcolor='#013220',  # dark green around the map
    font_color='white',       # readable text on the dark background
)

fig.show()


In [None]:
# Reduce the #1 entries to their distinct (name, danceability, energy) rows.
# Note: this rebinds rank_1, so the other columns are no longer available on it
# after this cell has run.
rank_1 = rank_1.loc[:, ["name", "danceability", "energy"]].drop_duplicates()
rank_1

In [None]:
# Distinct (name, danceability, energy) rows over all chart entries in df.
rank_50 = df.loc[:, ["name", "danceability", "energy"]].drop_duplicates()
rank_50

In [None]:
# Display the festivals frame.
df2

In [None]:
# Economic impact per attendee for every festival row, rounded to 0 decimals.
df2['avg_ecom_per_attendance'] = df2['economic_impact'].div(df2['attendance']).round(0)

# Rank 1 = highest impact per attendee.
df2['rank'] = df2['avg_ecom_per_attendance'].rank(ascending=False)

# Festivals ordered from best to worst rank.
df2_sorted = df2.sort_values(by='rank', ascending=True)
df2_sorted

In [None]:
# Number of festival rows per country, most frequent first.
country_counts = df2.loc[:, 'country'].value_counts()
country_counts

In [None]:
# Per-country summary of the festival data.
country_groups = df2_sorted.groupby('country')

# Row count plus summed attendance and economic impact for each country.
aggregated_df = country_groups.agg(
    festival_count=('country', 'size'),
    attendance=('attendance', 'sum'),
    economic_impact=('economic_impact', 'sum'),
).reset_index()

# Country-level economic impact per attendee, rounded to 0 decimals.
aggregated_df['avg_ecom_per_attendance'] = (
    aggregated_df['economic_impact'].div(aggregated_df['attendance']).round(0)
)

# Rank 1 = highest impact per attendee.
aggregated_df['rank'] = aggregated_df['avg_ecom_per_attendance'].rank(ascending=False)

# Genre of the single best-attended festival row in each country
# (idxmax picks one row per country, not the genre with the largest total).
best_attended_rows = country_groups['attendance'].idxmax()
genre_df = df2_sorted.loc[best_attended_rows, ['country', 'genre']]

# Attach that genre to the country summary.
aggregated_df = aggregated_df.merge(genre_df, on='country', how='left')

# Countries ordered from best to worst rank.
aggregated_df_sorted = aggregated_df.sort_values(by='rank', ascending=True)

aggregated_df_sorted

In [None]:
import matplotlib.pyplot as plt

# Countries whose 'genre' value mentions Pop or Rock (case-insensitive);
# rows with a missing genre are excluded.
is_pop_or_rock = aggregated_df_sorted['genre'].str.contains('Pop|Rock', case=False, na=False)
pop_rock_df = aggregated_df_sorted[is_pop_or_rock]

fig, ax = plt.subplots(figsize=(10, 6))

# Reverse the row order so the first country in the table is drawn as the top bar.
pop_rock_reversed = pop_rock_df.iloc[::-1]
bars = ax.barh(pop_rock_reversed['country'], pop_rock_reversed['economic_impact'], color='#1DB954')

# Annotate every bar with its value in millions (e.g. '1.2M'), placed slightly
# past the end of the bar (2% of the longest bar).
for bar in bars:
    bar_length = bar.get_width()
    ax.text(
        bar_length + max(pop_rock_df['economic_impact']) * 0.02,
        bar.get_y() + bar.get_height() / 2,
        f"{bar_length/1e6:.1f}M",
        va='center', ha='left', color='white', fontsize=12,
    )

# Strip the frame and the x ticks; the bar annotations carry the values.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_xticks([])
ax.set_yticks(range(len(pop_rock_df)))  # one tick per country

# Dark-green theme with white text.
ax.set_ylabel("Country", fontsize=12, color='white')
ax.set_title("Economic Impact of Pop and Rock Festivals by Country", fontsize=14, color='white')
for surface in (ax, fig.patch):
    surface.set_facecolor('#013220')
plt.yticks(color='white')

plt.show()


In [None]:
# Re-order the country summary by total economic impact, largest first.
aggregated_df_sorted = aggregated_df_sorted.sort_values("economic_impact", ascending=False)
aggregated_df_sorted

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# One donut chart per country for the (up to) ten countries with the largest
# summed economic impact, showing how that impact splits across genres.
# nlargest() does the ranking, so the row order of df2_sorted does not matter.
top_countries = df2_sorted.groupby("country")["economic_impact"].sum().nlargest(10).index

# Festival rows belonging to those countries.
filtered_df = df2_sorted[df2_sorted["country"].isin(top_countries)]

# 5 x 2 grid of large charts on a dark background.
fig, axes = plt.subplots(5, 2, figsize=(16, 40))
fig.patch.set_facecolor("#011D13")
donut_axes = axes.flatten()

for ax, country in zip(donut_axes, top_countries):
    # Filter once per country and reuse the rows below.
    country_rows = filtered_df[filtered_df["country"] == country]

    # Economic impact per genre, largest first.
    country_data = country_rows.groupby("genre")["economic_impact"].sum()
    country_data_sorted = country_data.sort_values(ascending=False)

    # The largest genre gets Spotify green; the rest come from a seaborn dark
    # palette based on #127033.
    num_genres = len(country_data_sorted)
    green_palette = ["#1DB954"] + list(sns.dark_palette("#127033", n_colors=num_genres-1))

    # Only the three largest genres keep their text label.
    wedge_labels = [label if i < 3 else "" for i, label in enumerate(country_data_sorted.index)]

    # Up to three genres: print every percentage. More than three: print the
    # percentage of every wedge except those below 1%.
    if num_genres <= 3:
        autopct = "%1.0f%%"
    else:
        autopct = lambda pct: "" if pct < 1 else f"{pct:.0f}%"

    wedges, texts, autotexts = ax.pie(
        country_data_sorted,
        labels=wedge_labels,
        autopct=autopct,
        colors=green_palette,
        startangle=140,
        wedgeprops={'edgecolor': 'white'},
        textprops={'fontsize': 16, 'color': 'white'},
    )

    # A background-colored disc over the middle turns the pie into a donut.
    centre_circle = plt.Circle((0, 0), 0.70, fc="#011D13")
    ax.add_patch(centre_circle)

    # Figures shown in the donut hole.
    total_impact = country_data_sorted.sum()
    total_festivals = country_rows["festival_id"].nunique()

    # NOTE(review): every country other than the United Kingdom is labeled in
    # euros — confirm this matches the 'currency' column of the data.
    currency_symbol = "£" if country == "United Kingdom" else "€"

    # Abbreviate the total as millions (M) or thousands (K) where large enough.
    if total_impact >= 1_000_000:
        total_impact_formatted = f"{currency_symbol}{total_impact / 1_000_000:.1f}M"
    elif total_impact >= 1_000:
        total_impact_formatted = f"{currency_symbol}{total_impact / 1_000:.1f}K"
    else:
        total_impact_formatted = f"{currency_symbol}{total_impact:.1f}"

    ax.text(0, 0, f"{total_impact_formatted}\n{total_festivals} festivals",
            horizontalalignment='center', verticalalignment='center',
            fontsize=16, fontweight='bold', color='white')

    ax.set_title(country, fontsize=18, fontweight="bold", color='white')

# With fewer than ten countries some grid cells receive no chart; hide them so
# they are not drawn as empty axes on the dark figure background.
for unused_ax in donut_axes[len(top_countries):]:
    unused_ax.set_visible(False)

# plt.savefig('ecom_genre_per_country.png')
# Adjust layout and show
plt.tight_layout()
# plt.show()


In [None]:
# Display the most common #1 'artists' value per country.
most_common_artists_rank_1

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# One donut chart per country for the (up to) ten countries with the largest
# summed economic impact, showing how that impact splits across genres. The
# donut hole lists total impact, festival count and total attendance.
# nlargest() does the ranking, so the row order of df2_sorted does not matter.
top_countries = df2_sorted.groupby("country")["economic_impact"].sum().nlargest(10).index

# Festival rows belonging to those countries.
filtered_df = df2_sorted[df2_sorted["country"].isin(top_countries)]

# 5 x 2 grid of large charts on a dark background.
fig, axes = plt.subplots(5, 2, figsize=(16, 40))
fig.patch.set_facecolor("#011D13")
donut_axes = axes.flatten()

for ax, country in zip(donut_axes, top_countries):
    # Filter once per country and reuse the rows below.
    country_rows = filtered_df[filtered_df["country"] == country]

    # Economic impact per genre, largest first.
    country_data = country_rows.groupby("genre")["economic_impact"].sum()
    country_data_sorted = country_data.sort_values(ascending=False)

    # The largest genre gets Spotify green; the rest come from a seaborn dark
    # palette based on #127033.
    num_genres = len(country_data_sorted)
    green_palette = ["#1DB954"] + list(sns.dark_palette("#127033", n_colors=num_genres-1))

    # Only the three largest genres keep their text label.
    wedge_labels = [label if i < 3 else "" for i, label in enumerate(country_data_sorted.index)]

    # Up to three genres: print every percentage. More than three: print the
    # percentage of every wedge except those below 1%.
    if num_genres <= 3:
        autopct = "%1.0f%%"
    else:
        autopct = lambda pct: "" if pct < 1 else f"{pct:.0f}%"

    wedges, texts, autotexts = ax.pie(
        country_data_sorted,
        labels=wedge_labels,
        autopct=autopct,
        colors=green_palette,
        startangle=140,
        wedgeprops={'edgecolor': 'white'},
        textprops={'fontsize': 16, 'color': 'white'},
    )

    # A background-colored disc over the middle turns the pie into a donut.
    centre_circle = plt.Circle((0, 0), 0.70, fc="#011D13")
    ax.add_patch(centre_circle)

    # Figures shown in the donut hole.
    total_impact = country_data_sorted.sum()
    total_festivals = country_rows["festival_id"].nunique()
    total_attendance = country_rows["attendance"].sum()

    # NOTE(review): every country other than the United Kingdom is labeled in
    # euros — confirm this matches the 'currency' column of the data.
    currency_symbol = "£" if country == "United Kingdom" else "€"

    # Abbreviate the impact as millions (M) or thousands (K) where large enough.
    if total_impact >= 1_000_000:
        total_impact_formatted = f"{currency_symbol}{total_impact / 1_000_000:.1f}M"
    elif total_impact >= 1_000:
        total_impact_formatted = f"{currency_symbol}{total_impact / 1_000:.1f}K"
    else:
        total_impact_formatted = f"{currency_symbol}{total_impact:.1f}"

    # Same abbreviation scheme for attendance, without a currency symbol.
    if total_attendance >= 1_000_000:
        total_attendance_formatted = f"{total_attendance / 1_000_000:.1f}M"
    elif total_attendance >= 1_000:
        total_attendance_formatted = f"{total_attendance / 1_000:.1f}K"
    else:
        total_attendance_formatted = f"{total_attendance}"

    ax.text(0, 0, f"{total_impact_formatted}\n{total_festivals} festivals\n{total_attendance_formatted} attendees",
            horizontalalignment='center', verticalalignment='center',
            fontsize=16, fontweight='bold', color='white')

    ax.set_title(country, fontsize=18, fontweight="bold", color='white')

# With fewer than ten countries some grid cells receive no chart; hide them so
# they are not drawn as empty axes on the dark figure background.
for unused_ax in donut_axes[len(top_countries):]:
    unused_ax.set_visible(False)

# Adjust layout and show
plt.tight_layout()
plt.show()
