Billboard Top 100 Data Analysis Project

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from 'billboard.csv', resolved relative to the working directory.
data = pd.read_csv('billboard.csv')

# Inspect the dataset: show the first rows to confirm the file parsed as expected.
print(data.head())

# DataFrame.info() writes its summary directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit an extra "None" line.
data.info()


Data Cleaning

In [None]:
# Report the number of missing values per column before any rows are removed.
print(data.isnull().sum())


# Drop every row that has a missing value in any column.
# The explicit .copy() makes the cleaned frame an independent object: later cells
# add columns to `data`, and on pandas versions without copy-on-write, assigning
# into the result of a row filter can raise SettingWithCopyWarning.
data = data.dropna().copy()

Exploratory Data Analysis 

In [None]:
# Descriptive statistics of the frame, using DataFrame.describe() defaults.
summary_stats = data.describe()
print(summary_stats)


Q1. Which artists appear most frequently on the Billboard Top 100?

In [None]:
# Count the rows per artist and keep the ten largest counts.
artist_counts = data['artist'].value_counts()
top_artists = artist_counts.head(10)
print("Top 10 Artists:")
print(top_artists)

# Draw the counts as a bar chart on an explicitly created 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
top_artists.plot.bar(ax=ax, color='skyblue')
ax.set_title('Top 10 Artists by Number of Songs in Billboard Top 100')
ax.set_xlabel('Artist')
ax.set_ylabel('Number of Songs')
# Tilt the artist names on the x axis by 45 degrees.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
fig.tight_layout()
plt.show()

Q2. What are the trends in song length over the years?

In [None]:
# Parse the 'date' column and store its calendar year as a new 'year' column.
chart_dates = pd.to_datetime(data['date'])
data['year'] = chart_dates.dt.year

# Mean of the 'duration' column within each year.
duration_by_year = data.groupby('year')['duration']
avg_song_length = duration_by_year.mean()
print("Average Song Length by Year:")
print(avg_song_length)

# Line chart of the yearly means with a marker at every year.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_song_length, marker='o', linestyle='-', color='green')
ax.set_title('Average Song Length Over Years')
ax.set_xlabel('Year')
ax.set_ylabel('Average Song Length (seconds)')
ax.grid()
fig.tight_layout()
plt.show()

Q3. Which genres dominate the Billboard Top 100?

In [None]:
# The genre breakdown is only produced when the frame has a 'genre' column.
if 'genre' not in data.columns:
    print("The dataset does not contain a 'genre' column.")
else:
    # Number of rows per genre, most frequent first.
    genre_distribution = data['genre'].value_counts()
    print("Genre Distribution:")
    print(genre_distribution)

    # Pie chart of the counts, each slice labelled with its percentage share.
    fig, ax = plt.subplots(figsize=(10, 6))
    pastel_colors = sns.color_palette('pastel')
    genre_distribution.plot.pie(
        ax=ax,
        autopct='%1.1f%%',
        startangle=140,
        colors=pastel_colors,
    )
    ax.set_title('Genre Distribution in Billboard Top 100')
    # Clear the y-axis label on the pie chart.
    ax.set_ylabel('')
    fig.tight_layout()
    plt.show()


Q4. How does the tempo of songs correlate with their popularity?

In [None]:
# The scatter plot needs both the 'tempo' and the 'popularity' column.
if 'tempo' not in data.columns or 'popularity' not in data.columns:
    print("The dataset does not contain 'tempo' or 'popularity' columns for correlation analysis.")
else:
    # One semi-transparent point per row: tempo on x, popularity on y.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=data,
        x='tempo',
        y='popularity',
        alpha=0.7,
        color='purple',
        ax=ax,
    )
    ax.set_title('Correlation Between Tempo and Popularity')
    ax.set_xlabel('Tempo (BPM)')
    ax.set_ylabel('Popularity')
    fig.tight_layout()
    plt.show()


Q5. Are there seasonal trends in the release of top-charting songs?

In [None]:
# Store the calendar month (1-12) of each row, derived from the 'date' column.
data['month'] = pd.to_datetime(data['date']).dt.month

# Count rows per month. value_counts() only lists months that actually occur, so
# the result is reindexed over all twelve months: a month with no rows becomes an
# explicit 0 instead of silently disappearing from the table and the bar chart.
# Reindexing over range(1, 13) also leaves the months in calendar order.
monthly_counts = (
    data['month']
    .value_counts()
    .reindex(range(1, 13), fill_value=0)
)
print("Monthly Distribution of Top-Charting Songs:")
print(monthly_counts)

# Bar chart with one bar per calendar month.
plt.figure(figsize=(10, 6))
monthly_counts.plot(kind='bar', color='orange')
plt.title('Monthly Distribution of Top-Charting Songs')
plt.xlabel('Month')
plt.ylabel('Number of Songs')
plt.tight_layout()
plt.show()

Q6. Which year had the highest average popularity score for songs?

In [None]:
# Guard the required columns the same way the other popularity cells do, so a
# dataset without them yields a message instead of a KeyError. The 'year' column
# is not created in this cell; it must already exist on `data`.
if 'year' in data.columns and 'popularity' in data.columns:
    # Mean popularity within each year.
    avg_popularity_per_year = data.groupby('year')['popularity'].mean()

    # idxmax() cannot pick a winner when there are no non-missing yearly means
    # (for example when the frame is empty), so report that case explicitly.
    if avg_popularity_per_year.dropna().empty:
        print("Cannot determine the most popular year as there are no popularity values to compare.")
    else:
        most_popular_year = avg_popularity_per_year.idxmax()
        print(f'The year with the highest average popularity score: {most_popular_year} with a score of {avg_popularity_per_year[most_popular_year]:.2f}')
else:
    print("Cannot determine the most popular year as 'year' or 'popularity' columns are missing.")

Q7. What is the distribution of song durations?

In [None]:
# Histogram of the 'duration' column split into 20 bins.
fig, ax = plt.subplots(figsize=(10, 6))
song_durations = data['duration']
song_durations.plot.hist(ax=ax, bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Song Durations')
ax.set_xlabel('Duration (seconds)')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

Q8. How many unique artists appear each year?

In [None]:
# Number of distinct 'artist' values within each 'year' group.
artists_by_year = data.groupby('year')['artist']
unique_artists_per_year = artists_by_year.nunique()
print("Number of Unique Artists Per Year:")
print(unique_artists_per_year)

# Line chart of the yearly counts with a marker at every year.
fig, ax = plt.subplots(figsize=(10, 6))
unique_artists_per_year.plot.line(ax=ax, marker='o', color='red')
ax.set_title('Unique Artists Per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Unique Artists')
ax.grid()
fig.tight_layout()
plt.show()

Q9. What is the relationship between release year and genre diversity?

In [None]:
# Genre diversity is only computed when the frame has a 'genre' column.
if 'genre' not in data.columns:
    print("Cannot determine genre diversity as the 'genre' column is missing.")
else:
    # Number of distinct 'genre' values within each 'year' group.
    genres_by_year = data.groupby('year')['genre']
    unique_genres_per_year = genres_by_year.nunique()
    print("Unique Genres Per Year:")
    print(unique_genres_per_year)

    # Line chart of the yearly counts with a marker at every year.
    fig, ax = plt.subplots(figsize=(10, 6))
    unique_genres_per_year.plot.line(ax=ax, marker='o', color='purple')
    ax.set_title('Unique Genres Per Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Unique Genres')
    ax.grid()
    fig.tight_layout()
    plt.show()


Q10. What is the correlation between song duration and popularity?

In [None]:
# Both columns must be present before a correlation can be computed.
if 'duration' not in data.columns or 'popularity' not in data.columns:
    print("Cannot calculate correlation as 'duration' or 'popularity' columns are missing.")
else:
    # Series.corr() with its default method, printed to two decimals.
    durations = data['duration']
    popularity_scores = data['popularity']
    correlation = durations.corr(popularity_scores)
    print(f'Correlation between song duration and popularity: {correlation:.2f}')
