#  🎯CodeAlpha Task2  -- 🎵 Spotify Music Analysis








In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from scipy import stats
from wordcloud import WordCloud

warnings.filterwarnings('ignore')

# setting up plot styles - i like this one
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ Libraries imported successfully!")

✓ Libraries imported successfully!


In [2]:

# If uploading to Colab, use: '/content/spotify_data.csv'
# If from Google Drive, mount drive first and use: '/content/drive/MyDrive/spotify_data.csv'

file_path = '/content/dataset.csv'  # <-- CHANGE THIS PATH

# Alternatively, upload file directly in Colab
from google.colab import files
print("Upload your Spotify dataset CSV file:")
uploaded = files.upload()
if uploaded:
    file_path = list(uploaded.keys())[0]
    print(f"✓ File uploaded: {file_path}")

Upload your Spotify dataset CSV file:


Saving dataset.csv to dataset.csv
✓ File uploaded: dataset.csv


In [3]:
# Load the data
try:
    df = pd.read_csv(file_path)
    print("✓ Dataset loaded successfully!")
    print(f"\nDataset shape: {df.shape[0]} rows × {df.shape[1]} columns")
except FileNotFoundError:
    print("❌ File not found! Please check the path or upload the file.")
    print("You can download dataset from Kaggle and upload it here.")

✓ Dataset loaded successfully!

Dataset shape: 114000 rows × 21 columns


In [4]:
# first look at the data
print("="*60)
print("FIRST 5 ROWS")
print("="*60)
print(df.head())

FIRST 5 ROWS
   Unnamed: 0                track_id                 artists  \
0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   

                                          album_name  \
0                                             Comedy   
1                                   Ghost (Acoustic)   
2                                     To Begin Again   
3  Crazy Rich Asians (Original Motion Picture Sou...   
4                                            Hold On   

                   track_name  popularity  duration_ms  explicit  \
0                      Comedy          73       230666     False   
1            Ghost - Acoustic          55       149610     False   
2              To Begin Again          57       210826 

In [5]:
# basic info about dataset
print("\n" + "="*60)
print("DATASET INFO")
print("="*60)
df.info()


DATASET INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  livene

In [6]:
# statistical summary
print("\n" + "="*60)
print("STATISTICAL SUMMARY")
print("="*60)
print(df.describe())


STATISTICAL SUMMARY
          Unnamed: 0     popularity   duration_ms   danceability  \
count  114000.000000  114000.000000  1.140000e+05  114000.000000   
mean    56999.500000      33.238535  2.280292e+05       0.566800   
std     32909.109681      22.305078  1.072977e+05       0.173542   
min         0.000000       0.000000  0.000000e+00       0.000000   
25%     28499.750000      17.000000  1.740660e+05       0.456000   
50%     56999.500000      35.000000  2.129060e+05       0.580000   
75%     85499.250000      50.000000  2.615060e+05       0.695000   
max    113999.000000     100.000000  5.237295e+06       0.985000   

              energy            key       loudness           mode  \
count  114000.000000  114000.000000  114000.000000  114000.000000   
mean        0.641383       5.309140      -8.258960       0.637553   
std         0.251529       3.559987       5.029337       0.480709   
min         0.000000       0.000000     -49.531000       0.000000   
25%         0.472000 

In [7]:
# check column names - sometimes they have weird names
print("\n" + "="*60)
print("COLUMN NAMES")
print("="*60)
print(df.columns.tolist())


COLUMN NAMES
['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre']


In [8]:
# check for missing values
print("="*60)
print("MISSING VALUES")
print("="*60)
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': missing.index,
    'Missing Count': missing.values,
    'Percentage': missing_pct.values
})
print(missing_df[missing_df['Missing Count'] > 0])

# if no missing values
if missing.sum() == 0:
    print("✓ No missing values found! Data is clean.")

MISSING VALUES
       Column  Missing Count  Percentage
2     artists              1    0.000877
3  album_name              1    0.000877
4  track_name              1    0.000877


In [9]:
# check for duplicates
duplicates = df.duplicated().sum()
print(f"\n{'='*60}")
print(f"DUPLICATE ROWS: {duplicates}")
print("="*60)

if duplicates > 0:
    print(f"Removing {duplicates} duplicate rows...")
    df = df.drop_duplicates()
    print("✓ Duplicates removed!")
else:
    print("✓ No duplicates found!")


DUPLICATE ROWS: 0
✓ No duplicates found!


In [10]:
# data types check and conversion if needed
print("\n" + "="*60)
print("DATA TYPES")
print("="*60)
print(df.dtypes)

# sometimes year or release date needs conversion
# adjust based on your dataset columns
if 'release_date' in df.columns:
    try:
        df['release_date'] = pd.to_datetime(df['release_date'])
        df['year'] = df['release_date'].dt.year
        print("\n✓ Date columns converted successfully!")
    except:
        print("\n⚠ Could not convert release_date to datetime")


DATA TYPES
Unnamed: 0            int64
track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object


In [11]:
# analyzing popularity
print("="*60)
print("POPULARITY STATISTICS")
print("="*60)
if 'popularity' in df.columns:
    print(f"Mean Popularity: {df['popularity'].mean():.2f}")
    print(f"Median Popularity: {df['popularity'].median():.2f}")
    print(f"Most Popular Song: {df['popularity'].max()}")
    print(f"Least Popular Song: {df['popularity'].min()}")

    # plot popularity distribution
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Distribution of Song Popularity', 'Popularity Box Plot'))

    # Histogram
    fig.add_trace(go.Histogram(x=df['popularity'], nbinsx=50, marker_color='#1DB954'), row=1, col=1)
    fig.update_xaxes(title_text='Popularity Score', row=1, col=1)
    fig.update_yaxes(title_text='Number of Songs', row=1, col=1)

    # Boxplot
    fig.add_trace(go.Box(y=df['popularity'], name='Popularity', marker_color='#1DB954'), row=1, col=2)
    fig.update_yaxes(title_text='Popularity Score', row=1, col=2)

    fig.update_layout(title_text='Popularity Distribution and Box Plot', showlegend=False)

    fig.show()

    print("\n✓ Interactive plots generated for popularity distribution!")

POPULARITY STATISTICS
Mean Popularity: 33.24
Median Popularity: 35.00
Most Popular Song: 100
Least Popular Song: 0



✓ Interactive plots generated for popularity distribution!


In [12]:
# key audio features to analyze
audio_features = ['danceability', 'energy', 'loudness', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# check which features exist in our dataset
available_features = [f for f in audio_features if f in df.columns]

if available_features:
    print("="*60)
    print("AUDIO FEATURES SUMMARY")
    print("="*60)
    print(df[available_features].describe())

    # correlation heatmap - this is important!
    correlation_matrix = df[available_features + ['popularity']].corr()

    fig = go.Figure(data=go.Heatmap(
        z=correlation_matrix.values,
        x=correlation_matrix.columns,
        y=correlation_matrix.index,
        colorscale='RdBu', # Changed colorscale to a valid string
        zmin=-1, zmax=1,
        text=correlation_matrix.values.round(2),
        texttemplate="%{text}",
        hoverinfo='x+y+z'
    ))

    fig.update_layout(
        title='Audio Features Correlation Heatmap',
        xaxis_title='Audio Features',
        yaxis_title='Audio Features',
        margin=dict(l=10, r=10, t=50, b=10)
    )

    fig.show()

    print("\n✓ Interactive correlation heatmap generated!")

AUDIO FEATURES SUMMARY
        danceability         energy       loudness    speechiness  \
count  114000.000000  114000.000000  114000.000000  114000.000000   
mean        0.566800       0.641383      -8.258960       0.084652   
std         0.173542       0.251529       5.029337       0.105732   
min         0.000000       0.000000     -49.531000       0.000000   
25%         0.456000       0.472000     -10.013000       0.035900   
50%         0.580000       0.685000      -7.004000       0.048900   
75%         0.695000       0.854000      -5.003000       0.084500   
max         0.985000       1.000000       4.532000       0.965000   

        acousticness  instrumentalness       liveness        valence  \
count  114000.000000     114000.000000  114000.000000  114000.000000   
mean        0.314910          0.156050       0.213553       0.474068   
std         0.332523          0.309555       0.190378       0.259261   
min         0.000000          0.000000       0.000000       0.00000


✓ Interactive correlation heatmap generated!


In [13]:
# analyzing relationship between features and popularity
if 'popularity' in df.columns and available_features:
    fig = make_subplots(rows=3, cols=3,
                        subplot_titles=[f'{feature.capitalize()} vs Popularity' for feature in available_features],
                        vertical_spacing=0.08, horizontal_spacing=0.08)

    axes = [(i, j) for i in range(1, 4) for j in range(1, 4)]

    for idx, feature in enumerate(available_features):
        row, col = axes[idx]
        fig.add_trace(go.Scattergl(x=df[feature], y=df['popularity'], mode='markers',
                                   marker=dict(color='#1DB954', opacity=0.3, size=5),
                                   name=feature.capitalize()), row=row, col=col)

        # add trend line
        z = np.polyfit(df[feature].dropna(), df['popularity'][df[feature].notna()], 1)
        p = np.poly1d(z)
        fig.add_trace(go.Scattergl(x=df[feature], y=p(df[feature]), mode='lines',
                                   line=dict(color='red', dash='dash'),
                                   name=f'{feature.capitalize()} Trend'), row=row, col=col)


        fig.update_xaxes(title_text=feature.capitalize(), row=row, col=col)
        fig.update_yaxes(title_text='Popularity', row=row, col=col)


    fig.update_layout(title_text='Audio Features vs Popularity', showlegend=False, height=1200)
    fig.show()

    print("✓ Interactive feature analysis plots generated!")

Output hidden; open in https://colab.research.google.com to view.

In [14]:
# analyzing trends over years
if 'year' in df.columns:
    print("="*60)
    print("MUSIC EVOLUTION OVER TIME")
    print("="*60)

    # filter realistic years (sometimes data has errors)
    df_filtered = df[(df['year'] >= 1960) & (df['year'] <= 2024)]

    # group by decade
    df_filtered['decade'] = (df_filtered['year'] // 10) * 10

    decade_stats = df_filtered.groupby('decade')[available_features].mean()

    print("\nAverage Audio Features by Decade:")
    print(decade_stats)

    # plot trends over decades
    fig = make_subplots(rows=3, cols=3,
                        subplot_titles=[f'{feature.capitalize()} Trend Over Decades' for feature in available_features],
                        vertical_spacing=0.08, horizontal_spacing=0.08)

    axes = [(i, j) for i in range(1, 4) for j in range(1, 4)]

    for idx, feature in enumerate(available_features):
        row, col = axes[idx]
        decade_avg = df_filtered.groupby('decade')[feature].mean()
        fig.add_trace(go.Scatter(x=decade_avg.index, y=decade_avg.values, mode='lines+markers',
                                 marker=dict(color='#1DB954', size=8),
                                 line=dict(color='#1DB954'),
                                 name=feature.capitalize()), row=row, col=col)

        fig.update_xaxes(title_text='Decade', row=row, col=col)
        fig.update_yaxes(title_text=feature.capitalize(), row=row, col=col)

    fig.update_layout(title_text='Music Evolution Over Decades', showlegend=False, height=1000)
    fig.show()

    print("\n✓ Interactive decade trend analysis generated!")

In [20]:
# genre analysis
if 'genre' in df.columns or 'track_genre' in df.columns:
    genre_col = 'genre' if 'genre' in df.columns else 'track_genre'

    print("="*60)
    print("GENRE ANALYSIS")
    print("="*60)

    # top genres
    top_genres = df[genre_col].value_counts().head(15)
    print("\nTop 15 Genres:")
    print(top_genres)

    # plot top genres
    fig_genres = px.bar(top_genres, x=top_genres.values, y=top_genres.index, orientation='h',
                        title='Top 15 Music Genres',
                        labels={'x': 'Number of Tracks', 'y': 'Genre'},
                        color=top_genres.values, color_continuous_scale='Viridis')
    fig_genres.update_layout(yaxis={'categoryorder':'total ascending'})
    fig_genres.show()





GENRE ANALYSIS

Top 15 Genres:
track_genre
acoustic         1000
afrobeat         1000
alt-rock         1000
alternative      1000
ambient          1000
anime            1000
black-metal      1000
bluegrass        1000
blues            1000
brazil           1000
breakbeat        1000
british          1000
cantopop         1000
chicago-house    1000
children         1000
Name: count, dtype: int64


In [19]:
 # genre characteristics
genre_features = df.groupby(genre_col)[available_features].mean().head(10)

fig_heatmap = go.Figure(data=go.Heatmap(
        z=genre_features.values,
        x=genre_features.columns,
        y=genre_features.index,
        colorscale='YlOrRd',
        text=genre_features.values.round(2),
        texttemplate="%{text}",
        hoverinfo='x+y+z'
    ))

fig_heatmap.update_layout(
        title='Audio Features by Genre (Top 10)',
        xaxis_title='Audio Features',
        yaxis_title='Genre',
        margin=dict(l=10, r=10, t=50, b=10)
    )
fig_heatmap.show(

print("\n✓ Interactive genre analysis completed!")


✓ Interactive genre analysis completed!


In [21]:
# top artists analysis
if 'artists' in df.columns or 'artist' in df.columns:
    artist_col = 'artists' if 'artists' in df.columns else 'artist'

    print("="*60)
    print("ARTIST ANALYSIS")
    print("="*60)

    # sometimes artists column has multiple artists, let's handle that
    # taking first artist for simplicity
    df['main_artist'] = df[artist_col].astype(str).str.split(',').str[0].str.strip()

    top_artists = df['main_artist'].value_counts().head(20)
    print("\nTop 20 Most Frequent Artists:")
    print(top_artists)

    # plot
    fig = px.bar(top_artists, x=top_artists.values, y=top_artists.index, orientation='h',
                 title='Top 20 Artists by Number of Tracks',
                 labels={'x': 'Number of Tracks', 'y': 'Artist'},
                 color=top_artists.values, color_continuous_scale='Viridis')
    fig.update_layout(yaxis={'categoryorder':'total ascending'})
    fig.show()

    print("\n✓ Interactive artist analysis generated!")

ARTIST ANALYSIS

Top 20 Most Frequent Artists:
main_artist
The Beatles              279
George Jones             271
Stevie Wonder            236
Linkin Park              224
Ella Fitzgerald          222
Prateek Kuhad            217
Feid                     202
Chuck Berry              190
Håkan Hellström          183
OneRepublic              181
The Beach Boys           176
my little airport        171
Elvis Presley            169
Charlie Brown Jr.        169
Red Hot Chili Peppers    159
Bryan Adams              157
Scooter                  155
Daddy Yankee             154
Arctic Monkeys           152
BTS                      151
Name: count, dtype: int64



✓ Interactive artist analysis generated!


In [22]:
# song duration analysis
if 'duration_ms' in df.columns:
    print("="*60)
    print("SONG DURATION ANALYSIS")
    print("="*60)

    # convert to minutes for better understanding
    df['duration_min'] = df['duration_ms'] / (1000 * 60)

    print(f"Average Song Duration: {df['duration_min'].mean():.2f} minutes")
    print(f"Median Song Duration: {df['duration_min'].median():.2f} minutes")
    print(f"Shortest Song: {df['duration_min'].min():.2f} minutes")
    print(f"Longest Song: {df['duration_min'].max():.2f} minutes")

    # filter out outliers for better visualization
    q1 = df['duration_min'].quantile(0.25)
    q3 = df['duration_min'].quantile(0.75)
    iqr = q3 - q1
    df_duration = df[(df['duration_min'] >= q1 - 1.5*iqr) & (df['duration_min'] <= q3 + 1.5*iqr)]

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Song Duration Distribution', 'Duration vs Popularity'))

    # Histogram
    fig.add_trace(go.Histogram(x=df_duration['duration_min'], nbinsx=50, marker_color='#1DB954'), row=1, col=1)
    fig.update_xaxes(title_text='Duration (minutes)', row=1, col=1)
    fig.update_yaxes(title_text='Number of Songs', row=1, col=1)

    # Scatter plot
    fig.add_trace(go.Scattergl(x=df_duration['duration_min'], y=df_duration['popularity'], mode='markers',
                               marker=dict(color='#1DB954', opacity=0.3, size=5)), row=1, col=2)
    fig.update_xaxes(title_text='Duration (minutes)', row=1, col=2)
    fig.update_yaxes(title_text='Popularity', row=1, col=2)


    fig.update_layout(title_text='Song Duration Analysis', showlegend=False)
    fig.show()

    print("\n✓ Interactive duration analysis plots generated!")

Output hidden; open in https://colab.research.google.com to view.

In [23]:
# this is interesting - do danceable songs have high energy?
if 'danceability' in df.columns and 'energy' in df.columns:
    print("="*60)
    print("ENERGY VS DANCEABILITY")
    print("="*60)

    # correlation
    corr = df['energy'].corr(df['danceability'])
    print(f"Correlation between Energy and Danceability: {corr:.3f}")

    # scatter plot with density
    fig = go.Figure(data=go.Densitymapbox(lat=df['danceability'], lon=df['energy'], z=df['popularity'],
                                           radius=10))
    fig.update_layout(mapbox_style="stamen-terrain", mapbox_center_lon=180)
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})


    fig = px.density_heatmap(df, x='danceability', y='energy',
                             title='Energy vs Danceability (Density Heatmap)')
    fig.update_xaxes(title_text='Danceability')
    fig.update_yaxes(title_text='Energy')
    fig.show()

    # categorize songs
    df['song_type'] = 'Average'
    df.loc[(df['energy'] > 0.7) & (df['danceability'] > 0.7), 'song_type'] = 'High Energy Dance'
    df.loc[(df['energy'] < 0.4) & (df['danceability'] < 0.4), 'song_type'] = 'Calm & Slow'
    df.loc[(df['energy'] > 0.7) & (df['danceability'] < 0.4), 'song_type'] = 'Energetic Non-Dance'
    df.loc[(df['energy'] < 0.4) & (df['danceability'] > 0.7), 'song_type'] = 'Chill Dance'

    print("\nSong Type Distribution:")
    print(df['song_type'].value_counts())

    print("\n✓ Interactive Energy vs Danceability analysis complete!")

ENERGY VS DANCEABILITY
Correlation between Energy and Danceability: 0.134



Song Type Distribution:
song_type
Average                83477
High Energy Dance      12475
Energetic Non-Dance     8112
Calm & Slow             7133
Chill Dance             2803
Name: count, dtype: int64

✓ Interactive Energy vs Danceability analysis complete!


In [24]:
# valence represents musical positivity/happiness
if 'valence' in df.columns:
    print("="*60)
    print("MOOD ANALYSIS (VALENCE)")
    print("="*60)

    # categorize by mood
    df['mood'] = pd.cut(df['valence'], bins=[0, 0.33, 0.66, 1.0],
                        labels=['Sad/Dark', 'Neutral', 'Happy/Cheerful'])

    mood_counts = df['mood'].value_counts()
    print("\nMood Distribution:")
    print(mood_counts)

    # pie chart and bar chart
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'xy'}]],
                        subplot_titles=('Song Mood Distribution', 'Average Popularity by Mood'))

    # Pie chart
    fig.add_trace(go.Pie(labels=mood_counts.index, values=mood_counts.values, name='Mood',
                         marker=dict(colors=['#FF6B6B', '#FFA07A', '#90EE90']),
                         pull=[0.05, 0.05, 0.05]), 1, 1)

    # Mood vs popularity bar chart
    mood_pop = df.groupby('mood')['popularity'].mean().sort_values(ascending=False)
    fig.add_trace(go.Bar(x=mood_pop.index, y=mood_pop.values, name='Average Popularity',
                         marker=dict(color=['#FF6B6B', '#FFA07A', '#90EE90'])), 1, 2)
    fig.update_xaxes(title_text='Mood', row=1, col=2)
    fig.update_yaxes(title_text='Average Popularity', row=1, col=2)


    fig.update_layout(title_text='Mood Analysis', showlegend=False)
    fig.show()

    print("\n✓ Interactive mood analysis generated!")

MOOD ANALYSIS (VALENCE)

Mood Distribution:
mood
Neutral           44446
Sad/Dark          38031
Happy/Cheerful    31347
Name: count, dtype: int64



✓ Interactive mood analysis generated!


In [25]:
# beats per minute analysis
if 'tempo' in df.columns:
    print("="*60)
    print("TEMPO (BPM) ANALYSIS")
    print("="*60)

    print(f"Average Tempo: {df['tempo'].mean():.2f} BPM")
    print(f"Median Tempo: {df['tempo'].median():.2f} BPM")

    # categorize tempo
    df['tempo_category'] = pd.cut(df['tempo'],
                                   bins=[0, 90, 120, 150, 250],
                                   labels=['Slow', 'Moderate', 'Fast', 'Very Fast'])

    tempo_counts = df['tempo_category'].value_counts()
    print("\nTempo Distribution:")
    print(tempo_counts)

    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=('Tempo Distribution', 'Songs by Tempo Category', 'Tempo vs Popularity'),
                        column_widths=[0.4, 0.3, 0.3])

    # Histogram
    fig.add_trace(go.Histogram(x=df['tempo'], nbinsx=50, marker_color='#1DB954'), row=1, col=1)
    fig.update_xaxes(title_text='Tempo (BPM)', row=1, col=1)
    fig.update_yaxes(title_text='Number of Songs', row=1, col=1)
    fig.add_vline(x=df['tempo'].mean(), line_dash="dash", line_color="red", row=1, col=1,
                  annotation_text=f'Mean: {df["tempo"].mean():.1f} BPM', annotation_position="top right")


    # Categories bar chart
    fig.add_trace(go.Bar(x=tempo_counts.index, y=tempo_counts.values, marker_color='#1DB954'), row=1, col=2)
    fig.update_xaxes(title_text='Tempo Category', row=1, col=2)
    fig.update_yaxes(title_text='Number of Songs', row=1, col=2)


    # Tempo vs Popularity scatter plot
    fig.add_trace(go.Scattergl(x=df['tempo'], y=df['popularity'], mode='markers',
                               marker=dict(color='#1DB954', opacity=0.3, size=5)), row=1, col=3)
    fig.update_xaxes(title_text='Tempo (BPM)', row=1, col=3)
    fig.update_yaxes(title_text='Popularity', row=1, col=3)


    fig.update_layout(title_text='Tempo Analysis', showlegend=False)
    fig.show()

    print("\n✓ Interactive tempo analysis generated!")

TEMPO (BPM) ANALYSIS
Average Tempo: 122.15 BPM
Median Tempo: 122.02 BPM

Tempo Distribution:
tempo_category
Fast         40753
Moderate     36935
Very Fast    19910
Slow         16245
Name: count, dtype: int64



✓ Interactive tempo analysis generated!


In [26]:
print("="*60)
print("STATISTICAL HYPOTHESIS TESTING")
print("="*60)

# Hypothesis 1: Do high-energy songs have higher popularity?
if 'energy' in df.columns and 'popularity' in df.columns:
    high_energy = df[df['energy'] > 0.7]['popularity']
    low_energy = df[df['energy'] <= 0.7]['popularity']

    t_stat, p_value = stats.ttest_ind(high_energy, low_energy)

    print("\nHypothesis 1: High-energy songs are more popular")
    print(f"High Energy Avg Popularity: {high_energy.mean():.2f}")
    print(f"Low Energy Avg Popularity: {low_energy.mean():.2f}")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")
    if p_value < 0.05:
        print("✓ Result: Significant difference! Energy affects popularity.")
    else:
        print("✗ Result: No significant difference.")

# Hypothesis 2: Are happier songs (high valence) more popular?
if 'valence' in df.columns and 'popularity' in df.columns:
    happy_songs = df[df['valence'] > 0.7]['popularity']
    sad_songs = df[df['valence'] <= 0.3]['popularity']

    t_stat, p_value = stats.ttest_ind(happy_songs, sad_songs)

    print("\nHypothesis 2: Happy songs are more popular than sad songs")
    print(f"Happy Songs Avg Popularity: {happy_songs.mean():.2f}")
    print(f"Sad Songs Avg Popularity: {sad_songs.mean():.2f}")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")
    if p_value < 0.05:
        print("✓ Result: Significant difference! Mood affects popularity.")
    else:
        print("✗ Result: No significant difference.")

# Hypothesis 3: Do danceable songs get more popular?
if 'danceability' in df.columns and 'popularity' in df.columns:
    high_dance = df[df['danceability'] > 0.7]['popularity']
    low_dance = df[df['danceability'] <= 0.7]['popularity']

    t_stat, p_value = stats.ttest_ind(high_dance, low_dance)

    print("\nHypothesis 3: Danceable songs are more popular")
    print(f"High Danceability Avg Popularity: {high_dance.mean():.2f}")
    print(f"Low Danceability Avg Popularity: {low_dance.mean():.2f}")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")
    if p_value < 0.05:
        print("✓ Result: Significant difference! Danceability affects popularity.")
    else:
        print("✗ Result: No significant difference.")

print("\n✓ Statistical analysis complete!")

STATISTICAL HYPOTHESIS TESTING

Hypothesis 1: High-energy songs are more popular
High Energy Avg Popularity: 32.60
Low Energy Avg Popularity: 33.82
T-statistic: -9.2223
P-value: 0.0000
✓ Result: Significant difference! Energy affects popularity.

Hypothesis 2: Happy songs are more popular than sad songs
Happy Songs Avg Popularity: 30.80
Sad Songs Avg Popularity: 33.64
T-statistic: -15.9103
P-value: 0.0000
✓ Result: Significant difference! Mood affects popularity.

Hypothesis 3: Danceable songs are more popular
High Danceability Avg Popularity: 32.60
Low Danceability Avg Popularity: 33.44
T-statistic: -5.4491
P-value: 0.0000
✓ Result: Significant difference! Danceability affects popularity.

✓ Statistical analysis complete!


In [27]:
print("="*70)
print(" "*20 + "KEY INSIGHTS")
print("="*70)

insights = []

# Finding 1: Most popular features
if 'popularity' in df.columns and available_features:
    correlations = df[available_features].corrwith(df['popularity']).sort_values(ascending=False)
    top_feature = correlations.index[0]
    top_corr = correlations.values[0]
    insights.append(f"1. {top_feature.capitalize()} has the strongest correlation ({top_corr:.3f}) with popularity")

# Finding 2: Optimal duration
if 'duration_min' in df.columns and 'popularity' in df.columns:
    popular_songs = df[df['popularity'] > df['popularity'].quantile(0.75)]
    avg_duration = popular_songs['duration_min'].mean()
    insights.append(f"2. Popular songs average {avg_duration:.2f} minutes in duration")

# Finding 3: Energy-Danceability sweet spot
if 'energy' in df.columns and 'danceability' in df.columns:
    sweet_spot = df[(df['energy'] > 0.6) & (df['danceability'] > 0.6)]
    if len(sweet_spot) > 0:
        avg_pop_sweet = sweet_spot['popularity'].mean()
        insights.append(f"3. Songs with high energy AND danceability (>0.6) have avg popularity of {avg_pop_sweet:.2f}")

# Finding 4: Mood preference
if 'mood' in df.columns:
    best_mood = df.groupby('mood')['popularity'].mean().idxmax()
    insights.append(f"4. {best_mood} songs tend to be most popular on average")

# Finding 5: Decade trends
if 'decade' in df.columns:
    recent_decade = df['decade'].max()
    insights.append(f"5. Most recent decade in dataset: {recent_decade}s - music is getting more digital")

# Print all insights
for insight in insights:
    print(f"\n{insight}")

print("\n" + "="*70)

                    KEY INSIGHTS

1. Loudness has the strongest correlation (0.050) with popularity

2. Popular songs average 3.67 minutes in duration

3. Songs with high energy AND danceability (>0.6) have avg popularity of 32.13

4. Neutral songs tend to be most popular on average



In [28]:
# save cleaned dataset with new features
output_filename = 'spotify_analyzed_data.csv'
df.to_csv(output_filename, index=False)
print(f"✓ Analyzed dataset saved as '{output_filename}'")

✓ Analyzed dataset saved as 'spotify_analyzed_data.csv'
