# In [None]:  (Jupyter cell marker left over from the notebook export)

# Time Series Analysis and Forecasting of Spotify Trends

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from sklearn.cluster import KMeans
from scipy.stats import zscore
import warnings
warnings.filterwarnings("ignore")

# --- 1. Load and Preprocess Data ---
df = pd.read_csv("spotify_data.csv")
# Unparseable dates become NaT instead of raising.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
release_year = df['release_date'].dt.year
# NaT rows have a NaN year, which between() rejects, so they are dropped here
# together with anything outside 2000-2024. copy() gives an independent frame
# so the column assignment below does not write into a slice.
df = df[release_year.between(2000, 2024)].copy()
# .dt.year is float when any NaT was present; after filtering no NaN remains,
# so cast to int to get year labels such as 2000 rather than 2000.0.
df['year'] = df['release_date'].dt.year.astype(int)

# --- 2. Genre Popularity Over Time ---
# Count tracks per (year, genre), then pivot genres into columns;
# year/genre pairs with no tracks are filled with 0.
genre_counts_long = df.groupby(['year', 'playlist_genre']).size()
genre_trend = genre_counts_long.unstack(fill_value=0)
# Divide every row by its own total so each year sums to 1.
yearly_totals = genre_trend.sum(axis=1)
genre_trend_percent = genre_trend.div(yearly_totals, axis=0)

fig, ax = plt.subplots(figsize=(14, 6))
genre_trend_percent.plot.area(ax=ax, cmap='tab20')
ax.set_title("Genre Popularity Over Time")
ax.set_ylabel("Proportion")
ax.set_xlabel("Year")
# Anchor the legend's upper-left corner at the axes' upper-right corner.
ax.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
fig.tight_layout()
plt.show()

# --- 3. Music Characteristics Over Time ---
features = ['valence', 'danceability', 'energy', 'tempo']
# One row per year, one column per feature, holding that year's mean.
feature_trends = df.groupby('year')[features].mean()

# Draw a separate line chart for every feature.
for feat in features:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.lineplot(data=feature_trends, x=feature_trends.index, y=feat, ax=ax)
    ax.set(
        title=f"Average {feat} Over Time",
        ylabel=feat.capitalize(),
        xlabel="Year",
    )
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# --- 4. Forecasting (ARIMA) ---
# Number of yearly steps to project past the last observed year.
forecast_steps = 5

for feat in features:
    ts = feature_trends[feat]
    # Augmented Dickey-Fuller test; element 1 of the result is the p-value.
    adf_result = adfuller(ts)
    print(f"{feat} ADF p-value: {adf_result[1]:.4f}")
    model = ARIMA(ts, order=(1, 1, 1))
    model_fit = model.fit()
    pred = model_fit.forecast(steps=forecast_steps)

    # Position the forecast directly after the last year present in the data
    # rather than assuming the series always ends in 2024.
    last_year = int(ts.index.max())
    forecast_years = range(last_year + 1, last_year + 1 + forecast_steps)

    plt.figure(figsize=(10, 5))
    plt.plot(ts, label="Historical")
    # Plot only the forecast values; the x positions are supplied explicitly.
    plt.plot(forecast_years, np.asarray(pred), label="Forecast", linestyle='--')
    plt.title(f"Forecast of {feat}")
    plt.xlabel("Year")
    plt.ylabel(feat.capitalize())
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# --- 5. Identify Musical Eras via Clustering ---
# Standardize every feature column (zscore works along axis 0 by default),
# then group the years into 4 clusters; random_state fixes the seeding.
z_scores = zscore(feature_trends[features])
kmeans = KMeans(n_clusters=4, random_state=42).fit(z_scores)
# Attach each year's cluster label; rows of z_scores are in feature_trends order.
feature_trends['era'] = kmeans.labels_

# Annotated cluster centers for interpretation
era_centers = pd.DataFrame(kmeans.cluster_centers_, columns=features)
era_centers.index.name = 'Era'
print("\nCluster Centers by Feature (standardized):")
print(era_centers)

# Integer year labels so ticks read 2000 rather than 2000.0 when the index is float.
year_labels = feature_trends.index.astype(int)

plt.figure(figsize=(12, 6))
# After the transpose, rows are features and columns are years. Pass the
# feature names explicitly so the y axis does not fall back to positions 0..3.
sns.heatmap(
    z_scores.T,
    cmap='coolwarm',
    annot=True,
    xticklabels=year_labels,
    yticklabels=features,
)
plt.title("Standardized Features Over Time")
plt.xlabel("Year")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 4))
# One point per year, coloured by the cluster it was assigned to.
sns.scatterplot(x=feature_trends.index, y=feature_trends['valence'], hue=feature_trends['era'], palette='Set2')
plt.title("Era Segmentation by Valence")
plt.xlabel("Year")
plt.ylabel("Valence")
plt.grid(True)
plt.tight_layout()
plt.show()

# --- 5b. Genre Distribution by Era ---
# Give every track the era label of its release year (left join keeps all tracks).
df = df.merge(feature_trends['era'], on='year', how='left')
# Count tracks per (era, genre); pairs with no tracks are filled with 0.
genre_era_dist = df.groupby(['era', 'playlist_genre']).size().unstack(fill_value=0)
# Divide each era's row by its total so every era sums to 1.
genre_era_percent = genre_era_dist.divide(genre_era_dist.sum(axis=1), axis=0)

# Create one figure and hand its axes to pandas. Calling DataFrame.plot
# without `ax` opens a figure of its own, which left an empty extra figure.
fig, ax = plt.subplots(figsize=(14, 6))
# Transposed: one bar per genre, stacked by era.
genre_era_percent.T.plot(kind='bar', stacked=True, colormap='tab20', ax=ax)
plt.title("Genre Composition per Musical Era")
plt.ylabel("Proportion")
plt.xlabel("Genre")
plt.legend(title='Era', bbox_to_anchor=(1.0, 1.0))
plt.tight_layout()
plt.show()