1. EDA

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.sql.functions import col

In [0]:
# Notebook-wide plot styling: white grid background, 12x6 inch default figures.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the source table with Spark, then pull the whole table into a
# pandas DataFrame so the plotting cells below can work on it directly.
df_silver = spark.read.table("default.spotify_silver")
pdf = df_silver.toPandas()

# Quick preview of the first rows.
display(pdf.head())

In [0]:
# Histogram (with KDE overlay) of the popularity score, plus a dashed
# vertical marker at the mean.
mean_popularity = pdf['popularity'].mean()

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(pdf['popularity'], bins=30, kde=True, color='skyblue', ax=ax)

# Mean marker; its label feeds the legend.
ax.axvline(
    mean_popularity,
    color='red',
    linestyle='--',
    label=f'Mean: {mean_popularity:.2f}',
)

ax.set_title('Distribution of Track Popularity', fontsize=15)
ax.set_xlabel('Popularity Score (0-100)', fontsize=12)
ax.set_ylabel('Count of Tracks', fontsize=12)
ax.legend()

plt.show()

In [0]:
# Share of tracks per popularity segment, drawn as a pie chart.
# value_counts() orders the segments from most to least frequent, so the
# wedge colours follow that order.
segment_counts = pdf['popularity_segment'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    segment_counts,
    labels=segment_counts.index,
    colors=['skyblue', 'pink', 'lightgreen'],
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Distribution of Popularity Segments', fontsize=15)

plt.show()

In [0]:
# Horizontal boxplot of track duration in minutes.
fig, ax = plt.subplots(figsize=(12, 4))
sns.boxplot(x=pdf['duration_minutes'], color='skyblue', ax=ax)

ax.set_title('Boxplot of Track Duration (Minutes)', fontsize=15)
ax.set_xlabel('Minutes', fontsize=12)

# Only the 0-15 minute range is shown; anything longer falls outside the
# visible axis.
ax.set_xlim(0, 15)

plt.show()

2. Кореляція

In [0]:
# Columns included in the pairwise correlation matrix.
cols_for_corr = [
    'popularity', 'duration_minutes', 'danceability', 'energy',
    'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]

# DataFrame.corr() with its default settings (Pearson correlation).
corr_matrix = pdf[cols_for_corr].corr()

# Annotated heatmap; the colour scale is pinned to the full [-1, 1] range
# so colours are comparable regardless of the values actually present.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    cmap='GnBu',
    annot=True,
    fmt=".2f",
    ax=ax,
)
ax.set_title('Correlation Heatmap of Audio Features', fontsize=16)
plt.show()

3. Аналіз трендів

In [0]:
# --- Top 10 genres ranked by mean popularity ---
top_genres = (
    pdf.groupby('track_genre')['popularity']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

fig_genres, ax_genres = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=top_genres.values,
    y=top_genres.index,
    color='skyblue',
    ax=ax_genres,
)
ax_genres.set_title('Top 10 Genres by Average Popularity', fontsize=15)
ax_genres.set_xlabel('Average Popularity')
plt.show()

# --- Mean popularity of clean (0) vs explicit (1) tracks ---
fig_explicit, ax_explicit = plt.subplots(figsize=(6, 5))
sns.barplot(
    data=pdf,
    x='is_explicit_int',
    y='popularity',
    palette='pastel',
    ax=ax_explicit,
)
ax_explicit.set_title('Average Popularity: Explicit (1) vs Clean (0)', fontsize=15)
# Replace the raw 0/1 tick labels with readable names.
ax_explicit.set_xticks([0, 1])
ax_explicit.set_xticklabels(['Clean', 'Explicit'])
plt.show()

4. Кластеризація K-Means

In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans

# Audio features that span the clustering space.
feature_cols = ['energy', 'valence', 'danceability', 'acousticness']

# Step 1: pack the feature columns into a single vector column.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
df_ml = assembler.transform(df_silver)

# Step 2: scale every feature to unit standard deviation (no mean-centering).
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)
scalerModel = scaler.fit(df_ml)
df_scaled = scalerModel.transform(df_ml)

# Step 3: fit K-Means with three clusters; the fixed seed keeps the
# cluster assignment reproducible between runs.
kmeans = KMeans(k=3, seed=1, featuresCol="scaledFeatures")
model = kmeans.fit(df_scaled)
predictions = model.transform(df_scaled)

# Step 4: bring only the columns needed for plotting back to pandas.
pdf_clusters = predictions.select('energy', 'danceability', 'prediction').toPandas()

# The scatter shows two of the four clustering dimensions, coloured by
# the assigned cluster id.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=pdf_clusters,
    x='energy',
    y='danceability',
    hue='prediction',
    palette='pastel',
    alpha=0.6,
    ax=ax,
)
ax.set_title('K-Means Clustering of Songs (Energy vs Danceability)', fontsize=15)
plt.show()

5. Статистичне тестування гіпотез

H0 (Нульова гіпотеза): Середня популярність пісень із ненормативною лексикою (Explicit) і без неї (Clean) однакова, тобто різниці між групами немає.

H1 (Альтернативна гіпотеза): Середня популярність пісень Explicit і Clean відрізняється (двостороння альтернатива).

In [0]:
from scipy import stats

# Two-sample test of mean popularity: explicit (1) vs clean (0) tracks.
explicit_pop = pdf[pdf['is_explicit_int'] == 1]['popularity']
clean_pop = pdf[pdf['is_explicit_int'] == 0]['popularity']

# Welch's t-test (equal_var=False) is used instead of the default Student's
# test: nothing in this notebook verifies that the two groups share the same
# variance, and Welch's variant does not need that assumption.
# nan_policy='omit' skips missing popularity values; with the default
# ('propagate') a single NaN would yield a NaN p-value, which would then
# silently fall into the "cannot reject H0" branch below.
t_stat, p_value = stats.ttest_ind(
    explicit_pop,
    clean_pop,
    equal_var=False,
    nan_policy='omit',
)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4e}")

# Decision at the 5% significance level (two-sided).
if p_value < 0.05:
    print("Різниця статистично значуща - відкидаємо H0")
else:
    print("Різниця випадкова - не можемо відкинути H0")

6. Що робить пісню хітом?

In [0]:
import pandas as pd
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Predictors used to explain popularity.
feature_cols_rf = ['duration_minutes', 'danceability', 'energy',
                   'loudness', 'speechiness', 'acousticness',
                   'liveness', 'valence', 'tempo']

# Pack the predictors into one vector column; the vector keeps the order of
# feature_cols_rf, which is what lets importances be matched to names below.
assembler_rf = VectorAssembler(inputCols=feature_cols_rf, outputCol="features")
data_rf = assembler_rf.transform(df_silver)

# A fixed seed makes the forest, and therefore the importance ranking,
# reproducible across runs (same seed value as the K-Means cell).
# NOTE: the model is fit on the full table with no train/test split, so the
# importances describe the fitted model and are not a measure of
# out-of-sample predictive skill.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="popularity",
    numTrees=20,
    seed=1,
)
model_rf = rf.fit(data_rf)

# Pair each feature name with its importance and sort descending.
feature_importance = pd.DataFrame({
    'Feature': feature_cols_rf,
    'Importance': model_rf.featureImportances.toArray()
}).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance, color='skyblue')
plt.title('What drives Popularity? (Feature Importance)', fontsize=15)
plt.show()