In [1]:
import os
import librosa
import numpy as np
import sagemaker_pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField,StringType, FloatType

In [2]:
from pyspark import SparkContext, SparkConf
from sagemaker_pyspark import classpath_jars
from pyspark.sql.functions import create_map, struct
from pydub import AudioSegment


In [3]:
os.environ["PYTHONIOENCODING"] = "utf-8"

In [4]:
RAW = 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\'
LANDED = 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\landed\\'

In [5]:
classpath = ":".join(sagemaker_pyspark.classpath_jars())

builder = SparkSession.builder.appName("MUSIC SPARK")
builder.config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
builder.config("spark.speculation", "false")
builder.config("spark.sql.parquet.compression.codec", "gzip")
builder.config("spark.debug.maxToStringFields", "100")
builder.config("spark.driver.extraClassPath", classpath)
builder.config("spark.driver.memory", "1g")
builder.config("spark.driver.cores", "1")
builder.config("spark.executor-memory", "20g")
builder.config("spark.executor.cores", "4")

builder.master("local[*]")

spark = builder.getOrCreate()
spark

In [6]:
def audio_to_wav(file):
    dst = file.replace('.mp3',".wav")
    sound = AudioSegment.from_mp3(file)
    sound.export(dst, format="wav")
    return dst

In [7]:
def extract_music_features(file):
    songname = file.split('/')[0::-1][0]
    y, sr = librosa.load(file, mono=True, duration=30)
    chroma_stft = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
    spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    
    chroma_stft = np.array2string(chroma_stft, precision=4, separator=',',suppress_small=True)
    spec_cent = np.array2string(spec_cent, precision=4, separator=',',suppress_small=True)
    spec_bw = np.array2string(spec_bw, precision=4, separator=',',suppress_small=True)
    rolloff = np.array2string(rolloff, precision=4, separator=',',suppress_small=True)
    zcr = np.array2string(zcr, precision=4, separator=',',suppress_small=True)
    
    
    to_append = f'{songname};{chroma_stft};{spec_cent};{spec_bw};{rolloff};{zcr}'    
    return to_append

In [8]:
all_music = [f'{RAW}{file}' for file in os.listdir(RAW) if '.mp3' in file]

In [9]:
all_music

['F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\Aux Fox - Ellie Goulding - Flux (Aux Fox Remix).mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\Cardi B - Money.mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\Flipp Dinero - Leave Me Alone (Prod. by Young Forever x Cast Beats).mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\HIGH ON MUSIC - Danrell x Småland - Hostage.mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\Kodak Black - ZEZE (feat. Travis Scott and Offset).mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\Megan Thee Stallion - Cash Shit feat. DaBaby.mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\nymano - jazz and rain.mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\PRDSEOHNO - OHNO - Lil Mama (prod. Fallen Roses and B Dom).mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\wūsh - late nights with you.mp3',
 'F:\\Sistema\\Downloads\\bigdatamusic_datas\\raw\\Young Nero - Beyond (Prod. Scott Storch).mp3',
 'F:\\S

In [10]:
pipe_rdd_csv = spark.sparkContext.parallelize(all_music).map(audio_to_wav).map(extract_music_features)


In [49]:
"""
musics_infos = []

for music in all_music:
    #print(music)
    music_as_wav = audio_to_wav(music)
    #print(music_as_wav)
    info_from_music = extract_music_features(music_as_wav)
    print(info_from_music)
    musics_infos.append(info_from_music)
musics_infos
"""

'\nmusics_infos = []\n\nfor music in all_music:\n    #print(music)\n    music_as_wav = audio_to_wav(music)\n    #print(music_as_wav)\n    info_from_music = extract_music_features(music_as_wav)\n    print(info_from_music)\n    musics_infos.append(info_from_music)\nmusics_infos\n'

In [11]:
pipe_rdd_csv

PythonRDD[1] at RDD at PythonRDD.scala:52

In [12]:
schema = StructType([StructField('file_name', StringType(), True),
                     StructField('chroma', StringType(), True),
                     StructField('spec_cent', StringType(), True),
                     StructField('spec_bw', StringType(), True),
                     StructField('rolloff', StringType(), True),
                     StructField('zcr', StringType(), True)])

In [13]:
pipe_rdd_csv = pipe_rdd_csv.map(lambda x : x.split(";"))
rdd = spark.createDataFrame(pipe_rdd_csv,schema)
rdd.show()

+--------------------+------+---------+---------+---------+------+
|           file_name|chroma|spec_cent|  spec_bw|  rolloff|   zcr|
+--------------------+------+---------+---------+---------+------+
|F:\Sistema\Downlo...|0.3116|1244.5148|1479.5486|  2472.76|0.0505|
|F:\Sistema\Downlo...| 0.435|3329.2385|2701.4645|6364.7698|0.1513|
|F:\Sistema\Downlo...|0.2684|2497.0544|2253.6799| 4848.879|0.1262|
|F:\Sistema\Downlo...|0.3248|1674.9177|1946.9268|3331.0049|0.0701|
|F:\Sistema\Downlo...|0.4021|2166.0511|2454.4593|5006.5197|0.0671|
|F:\Sistema\Downlo...|0.4718|3270.8896|2589.2236|6275.8037|0.1793|
|F:\Sistema\Downlo...|0.3538|1545.9184|1932.9984|3007.3318|0.0635|
|F:\Sistema\Downlo...|0.3076|1087.3312|1491.5607|2050.4709|0.0426|
|F:\Sistema\Downlo...|0.3414| 681.9384| 958.0593|1068.1102|0.0414|
|F:\Sistema\Downlo...|0.3283|1608.4428|1925.6445|3293.9051|0.0667|
|F:\Sistema\Downlo...|0.3255|1926.8884|2065.4022|4144.5833|0.0705|
|F:\Sistema\Downlo...|0.4257|1935.1323|2060.6939|3853.1267|0.0

In [14]:
rdd.write.csv(LANDED,sep=';',mode='overwrite')