In [17]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
# Read the music mood dataset into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw_data/data_moods.csv')

In [None]:
# Peek at the first five rows to sanity-check the load
df.head(n=5)

In [None]:
# Display basic info about the dataset: DataFrame.info() prints the index
# range, each column's name, non-null count and dtype, plus memory usage.
df.info()

In [None]:
# Per-column count of missing entries (isna is the canonical spelling of isnull)
df.isna().sum()

In [None]:
# Basic statistics: count, mean, std, min, quartiles and max per column.
# With default arguments describe() summarises only the numeric columns
# when the frame contains any, so text columns are not shown here.
df.describe()

In [None]:
# Tally how many songs carry each mood label (most frequent first)
mood_column = df['mood']
mood_counts = mood_column.value_counts()
mood_counts

In [None]:
# Bar chart of how many songs fall under each mood label.
plt.figure(figsize=(8, 6))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (slated for
# removal in 0.14), so the bars are coloured by mapping `mood` to `hue` too.
# `dodge=False` keeps one full-width bar per mood on older seaborn releases,
# which would otherwise offset the bars for each hue level.
ax = sns.countplot(x='mood', hue='mood', data=df, palette='Set2', dodge=False)
# Older seaborn draws a legend that merely repeats the x tick labels; newer
# releases omit it for a redundant hue. Drop it whenever it is present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Distribution of Moods in Music Data')
plt.xlabel('Mood')
plt.ylabel('Number of Songs')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Heatmap of pairwise correlations (pandas' default, Pearson) among the
# numeric track features.
numeric_features = [
    'popularity', 'danceability', 'acousticness', 'energy', 'instrumentalness',
    'liveness', 'valence', 'loudness', 'speechiness', 'tempo',
]
plt.figure(figsize=(10, 8))
corr_matrix = df[numeric_features].corr()
# Annotate every cell with its coefficient rounded to two decimals.
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Music Features')
plt.show()

In [None]:
# Box plot of the 'energy' feature for each mood.
plt.figure(figsize=(12, 8))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (slated for
# removal in 0.14), so the boxes are coloured by mapping `mood` to `hue` too.
# `dodge=False` keeps one full-width box per mood on older seaborn releases.
ax = sns.boxplot(x='mood', y='energy', hue='mood', data=df, palette='Set2', dodge=False)
# Older seaborn draws a legend that merely repeats the x tick labels; newer
# releases omit it for a redundant hue. Drop it whenever it is present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Energy Distribution by Mood')
plt.xlabel('Mood')
plt.ylabel('Energy')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Box plot of the 'danceability' feature for each mood.
plt.figure(figsize=(12, 8))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (slated for
# removal in 0.14), so the boxes are coloured by mapping `mood` to `hue` too.
# `dodge=False` keeps one full-width box per mood on older seaborn releases.
ax = sns.boxplot(x='mood', y='danceability', hue='mood', data=df, palette='Set2', dodge=False)
# Older seaborn draws a legend that merely repeats the x tick labels; newer
# releases omit it for a redundant hue. Drop it whenever it is present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Danceability Distribution by Mood')
plt.xlabel('Mood')
plt.ylabel('Danceability')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Box plot of the 'popularity' feature for each mood.
plt.figure(figsize=(12, 8))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (slated for
# removal in 0.14), so the boxes are coloured by mapping `mood` to `hue` too.
# `dodge=False` keeps one full-width box per mood on older seaborn releases.
ax = sns.boxplot(x='mood', y='popularity', hue='mood', data=df, palette='Set2', dodge=False)
# Older seaborn draws a legend that merely repeats the x tick labels; newer
# releases omit it for a redundant hue. Drop it whenever it is present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Popularity Distribution by Mood')
plt.xlabel('Mood')
plt.ylabel('Popularity')
plt.xticks(rotation=45)
plt.show()