# Spotify Music Analysis & Machine Learning Pipeline

## Comprehensive analysis of Spotify track features, covering:

- Exploratory Data Analysis (EDA)
- Feature Engineering
- Popularity Prediction Models
- Music Recommendation System
- Clustering & Pattern Discovery
- Interactive Visualizations

**Dataset includes:** track metadata + audio features (danceability, energy, tempo, valence, etc.)

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os

# Print the full path of every file found under the Kaggle input directory
# so the available datasets are visible in the cell output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence every warning category for the rest of the session to keep cell
# output tidy. NOTE(review): this also hides deprecation/convergence warnings
# raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

## 1. Load and Explore Data

In [None]:
# Load the track-level dataset; the CSV is read from the working directory.
df = pd.read_csv('spotify_analysis_dataset.csv')

# Quick sanity check: dataset dimensions followed by a preview of the first rows.
n_tracks, n_columns = df.shape
print(f"Loaded {n_tracks} tracks")
print(f"Columns: {n_columns}")
print(df.head())

## 2. Data Preprocessing

In [None]:
# Derived convenience columns. Each one is created only when its source
# column exists, so the cell also runs on datasets lacking these fields.

# Track length converted from milliseconds to minutes.
if 'duration_ms' in df.columns:
    duration_ms = df['duration_ms']
    df['duration_min'] = duration_ms / 60000

# Release year; dates that cannot be parsed become NaT and yield a missing year.
if 'release_date' in df.columns:
    parsed_release = pd.to_datetime(df['release_date'], errors='coerce')
    df['release_year'] = parsed_release.dt.year

## 3. Feature Engineering

In [None]:
# Candidate audio descriptors; only those actually present in the dataset are
# kept, so later cells can index df[audio_features] safely.
audio_features = ['danceability','energy','loudness','speechiness',
                  'acousticness','instrumentalness','liveness','valence','tempo']
audio_features = [f for f in audio_features if f in df.columns]

# Composite "mood" feature: a weighted blend of valence, energy and
# danceability. The list above is filtered for missing columns, so apply the
# same guard here instead of raising KeyError when an input is absent.
mood_inputs = ['valence', 'energy', 'danceability']
missing_mood_inputs = [col for col in mood_inputs if col not in df.columns]
if missing_mood_inputs:
    print(f"Skipping mood_score; missing columns: {missing_mood_inputs}")
else:
    df['mood_score'] = df['valence'] * 0.4 + df['energy'] * 0.3 + df['danceability'] * 0.3

## 4. Popularity Prediction

In [None]:
# Build the modelling table from the audio features plus the target, dropping
# rows with a missing value in ANY of them. Dropping on the features alone
# would let NaN popularity values through and make fit()/r2_score fail.
target_col = 'popularity'
model_df = df[audio_features + [target_col]].dropna()
X = model_df[audio_features]
y = model_df[target_col]

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse it on the test split
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random-forest regressor predicting popularity from the audio features.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Report the coefficient of determination on the held-out split.
preds = model.predict(X_test)
print('R2:', r2_score(y_test, preds))

## 5. Clustering

In [None]:
# Standardise the complete-case audio features so that every feature
# contributes on a comparable scale to the distance computations.
cluster_input = df[audio_features].dropna()
cluster_scaler = StandardScaler()
X_cluster = cluster_scaler.fit_transform(cluster_input)

# Group the tracks into five clusters in the full standardised feature space.
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(X_cluster)

# Project onto the first two principal components purely for plotting;
# the clustering itself does not use the reduced representation.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster)

first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
plt.scatter(first_component, second_component, c=labels, cmap='tab10')
plt.title('Music Clusters (PCA)')
plt.show()