#RDDs



In [1]:
!pip install pyspark
from pyspark.sql import SparkSession



In [2]:
spark = SparkSession.builder \
  .appName("BigData- Tarea2") \
  .getOrCreate()
sc = spark.sparkContext

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Información de origen de datos
ruta = '/content/drive/MyDrive/SpotifyFeatures.csv'
df = spark.read.csv(ruta, header=True, inferSchema=True)

In [5]:
print("Columnas del dataset:")
df.printSchema()

print("Primeros registros:")
df.show(5)

Columnas del dataset:
root
 |-- genre: string (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- instrumentalness: string (nullable = true)
 |-- key: string (nullable = true)
 |-- liveness: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- tempo: string (nullable = true)
 |-- valence: string (nullable = true)

Primeros registros:
+-----+-----------------+--------------------+--------------------+----------+------------+------------+-----------+------+----------------+---+--------+--------+-----+-----------+-------+-------+
|genre|      artist_name|          track_name|            track_

In [6]:
# Converción del DataFrame a RDD
rdd_spotify = df.rdd
print("Número de registros en el RDD:", rdd_spotify.count())

Número de registros en el RDD: 232725


In [7]:
rdd_spotify.take(5)

[Row(genre='Movie', artist_name='Henri Salvador', track_name="C'est beau de faire un Show", track_id='0BRjO6ga9RKCKjfDqeFgWV', popularity='0', acousticness='0.611', danceability='0.389', duration_ms='99373', energy='0.91', instrumentalness='0', key='C#', liveness='0.346', loudness='-1.828', mode='Major', speechiness='0.0525', tempo='166.969', valence='0.814'),
 Row(genre='Movie', artist_name='Martin & les fées', track_name="Perdu d'avance (par Gad Elmaleh)", track_id='0BjC1NfoEOOusryehmNudP', popularity='1', acousticness='0.246', danceability='0.59', duration_ms='137373', energy='0.737', instrumentalness='0', key='F#', liveness='0.151', loudness='-5.559', mode='Minor', speechiness='0.0868', tempo='174.003', valence='0.816'),
 Row(genre='Movie', artist_name='Joseph Williams', track_name="Don't Let Me Be Lonely Tonight", track_id='0CoSDzoNIKCRs124s9uTVy', popularity='3', acousticness='0.952', danceability='0.663', duration_ms='170267', energy='0.131', instrumentalness='0', key='C', liven

In [8]:
# Operaciones con RDD

In [9]:
# Estadísticas básicas de popularidad
import re

def is_float(value):
    try:
        float(value)
        return True
    except ValueError:
        return False
popularity_rdd = rdd_spotify.filter(lambda row: is_float(row['popularity'])) \
                            .map(lambda row: float(row['popularity']))


print("Popularidad mínima:", popularity_rdd.min())
print("Popularidad máxima:", popularity_rdd.max())
print("Popularidad promedio:", popularity_rdd.mean())

Popularidad mínima: 0.0
Popularidad máxima: 100.0
Popularidad promedio: 41.23781203604616


In [12]:
# Promedio de duración de canciones (en minutos)
def is_float(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

duration_rdd = rdd_spotify.filter(lambda row: is_float(row['duration_ms'])) \
                            .map(lambda row: float(row['duration_ms']) / 60000)

print("Duración promedio (minutos):", duration_rdd.mean())

Duración promedio (minutos): 3.89967433689643


In [16]:
# Filtrar canciones con alta energía (>0.8)
def is_float(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

high_energy = rdd_spotify.filter(lambda row: is_float(row['energy']) and float(row['energy']) > 0.8)
print("Canciones con alta energía:", high_energy.count())
high_energy.take(10)

Canciones con alta energía: 54716


[Row(genre='Movie', artist_name='Henri Salvador', track_name="C'est beau de faire un Show", track_id='0BRjO6ga9RKCKjfDqeFgWV', popularity='0', acousticness='0.611', danceability='0.389', duration_ms='99373', energy='0.91', instrumentalness='0', key='C#', liveness='0.346', loudness='-1.828', mode='Major', speechiness='0.0525', tempo='166.969', valence='0.814'),
 Row(genre='Movie', artist_name='Bernard Minet', track_name='Ultra Man 80', track_id='0x8xSaoSfQkOYUnG1nbga0', popularity='3', acousticness='0.488', danceability='0.744', duration_ms='178107', energy='0.953', instrumentalness='0', key='E', liveness='0.453', loudness='-4.986', mode='Major', speechiness='0.037', tempo='129.959', valence='0.926'),
 Row(genre='Movie', artist_name='Henri Salvador', track_name='Monsieur Boum Boum', track_id='14K25Ks5fdHjHfpIYOTc4y', popularity='8', acousticness='0.689', danceability='0.704', duration_ms='161773', energy='0.804', instrumentalness='0.0422', key='C', liveness='0.18', loudness='-6.699', mo

In [17]:
columns = df.columns
spotify_df_from_rdd = spark.createDataFrame(rdd_spotify, columns)
spotify_df_from_rdd.show(5)

+-----+-----------------+--------------------+--------------------+----------+------------+------------+-----------+------+----------------+---+--------+--------+-----+-----------+-------+-------+
|genre|      artist_name|          track_name|            track_id|popularity|acousticness|danceability|duration_ms|energy|instrumentalness|key|liveness|loudness| mode|speechiness|  tempo|valence|
+-----+-----------------+--------------------+--------------------+----------+------------+------------+-----------+------+----------------+---+--------+--------+-----+-----------+-------+-------+
|Movie|   Henri Salvador|C'est beau de fai...|0BRjO6ga9RKCKjfDq...|         0|       0.611|       0.389|      99373|  0.91|               0| C#|   0.346|  -1.828|Major|     0.0525|166.969|  0.814|
|Movie|Martin & les fées|Perdu d'avance (p...|0BjC1NfoEOOusryeh...|         1|       0.246|        0.59|     137373| 0.737|               0| F#|   0.151|  -5.559|Minor|     0.0868|174.003|  0.816|
|Movie|  Joseph