# PySpark ML

In [1]:
from google.colab import drive
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Build the Spark session used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("BigData- Tarea4y5")
    .getOrCreate()
)

In [9]:
# Mount Google Drive so the dataset stored there is reachable at /content/drive.
# (`drive` is imported in the notebook's import cell.)
drive.mount('/content/drive')

# Data source location
ruta = '/content/drive/MyDrive/SpotifyFeatures.csv'

# Reader options:
#   header=True      -> take column names from the first line
#   inferSchema=True -> let Spark infer column types from the data
#   multiLine=True   -> allow a quoted field to span several lines
#   escape='"'       -> a doubled quote inside a quoted field is a literal quote
df = spark.read.csv(
    ruta,
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# First look at the loaded data: row count, inferred schema, first five rows.
filas_originales = df.count()
print("Filas originales:", filas_originales)

df.printSchema()
df.show(n=5)

Filas originales: 232725
root
 |-- genre: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- danceability: double (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- energy: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- key: string (nullable = true)
 |-- liveness: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- valence: double (nullable = true)

+-----+-----------------+--------------------+--------------------+----------+------------+------------+-----------+------+----------------+---+--------+--------+-----+-----------+-------+-------+
|genre|      artist_name|          track_name|            track_id|popularity|a

In [11]:
# Numeric columns used by the model.
columnas_modelo = [
    # audio-feature predictors
    "danceability", "energy", "loudness",
    "speechiness", "acousticness", "instrumentalness",
    "liveness", "valence", "tempo",
    # regression target (kept as the last entry)
    "popularity",
]


In [12]:
# Cast every model column to double in ONE projection.
# Calling withColumn inside a loop adds a projection to the query plan on each
# iteration; a single select produces the same frame with one projection.
# Columns outside columnas_modelo pass through untouched and the original
# column order is preserved.

# Fail early on names that are absent from df, so no column is silently
# skipped by the membership test below (names are matched exactly).
faltantes = [c for c in columnas_modelo if c not in df.columns]
if faltantes:
    raise ValueError(f"Columnas no encontradas en df: {faltantes}")

df = df.select(
    [
        col(c).cast("double").alias(c) if c in columnas_modelo else col(c)
        for c in df.columns
    ]
)

In [13]:
# Null cleanup: keep only the model columns and discard every row in which
# any of them is missing (how="any" is dropna's default, spelled out here).
data = df.select(*columnas_modelo).dropna(how="any")

filas_limpias = data.count()
print("Filas después de limpieza:", filas_limpias)

Filas después de limpieza: 232725


In [14]:
# Vector Assembler: pack every predictor column into one "features" vector.
# The target is excluded by NAME rather than by list position, so the result
# does not depend on where "popularity" sits inside columnas_modelo.
columna_objetivo = "popularity"
assembler = VectorAssembler(
    inputCols=[c for c in columnas_modelo if c != columna_objetivo],
    outputCol="features",
)

# Keep only what the regressor consumes: the feature vector and the target,
# with the target renamed to "label".
final_data = assembler.transform(data).select(
    "features",
    col(columna_objetivo).alias("label"),
)

final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.389,0.91,-1.82...|  0.0|
|[0.59,0.737,-5.55...|  1.0|
|[0.663,0.131,-13....|  3.0|
|[0.24,0.326,-12.1...|  0.0|
|[0.331,0.225,-21....|  4.0|
+--------------------+-----+
only showing top 5 rows


In [15]:
# Train/test split: weights 0.8 / 0.2 with a fixed seed.
particiones = final_data.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = particiones

# Report the size of each partition.
for etiqueta, particion in (
    ("Datos de entrenamiento:", train_data),
    ("Datos de prueba:", test_data),
):
    print(etiqueta, particion.count())

Datos de entrenamiento: 185919
Datos de prueba: 46806


In [16]:
# Model training: fit a linear regression on the training split.
# featuresCol / labelCol are written out explicitly; the values are the
# estimator's defaults, so the fitted model is the same.
lr = LinearRegression(featuresCol="features", labelCol="label")
model = lr.fit(train_data)

In [17]:
# Results: show the fitted intercept and the coefficient vector.
parametros = (
    ("Intercept:", model.intercept),
    ("Coeficientes:", model.coefficients),
)
for nombre, valor in parametros:
    print(nombre, valor)

Intercept: 55.79158358566945
Coeficientes: [17.72606142169144,-5.325037785249398,0.7086103720179455,-8.25175130838267,-11.895814043580847,-4.2466683657395325,-9.687773740861937,-13.36263201511825,-0.004328407742543541]


In [18]:
# Predictions: score the test split with the fitted model.
predictions = model.transform(test_data)

# Preview the true label next to the predicted value for the first ten rows.
comparacion = predictions.select("label", "prediction")
comparacion.show(n=10)

+-----+------------------+
|label|        prediction|
+-----+------------------+
| 32.0| 30.62894183614558|
| 47.0|28.128575871119935|
| 38.0| 28.22374483617497|
| 28.0|22.486433205560886|
| 42.0|34.883498070128496|
| 26.0|26.052282029578457|
| 32.0|13.101413155721133|
| 34.0| 33.71129716208667|
| 32.0|13.585591342120296|
| 31.0|20.629014175821737|
+-----+------------------+
only showing top 10 rows


In [19]:
# Metrics: evaluate the fitted model on the test split.
evaluation = model.evaluate(test_data)

rmse, r2 = evaluation.rootMeanSquaredError, evaluation.r2
print("RMSE:", rmse)
print("R2:", r2)

RMSE: 15.922543660242061
R2: 0.23182524936273685
