In [None]:
#################################################################################
# Laden aller relevanten Module
#################################################################################
import pyspark
from pyspark.sql.functions import *
import json
import csv
from datetime import datetime
from delta import *
import delta

# Use the full browser width for Jupyter cells.
# `display` and `HTML` are imported from the public IPython.display module
# (importing them from IPython.core.display is deprecated).
from IPython.display import display, HTML
# The style element must be closed with </style>; otherwise the markup is malformed.
display(HTML("<style>.container {width:100% !important; }</style>"))

In [None]:
import os

# For local usage, first run: pip install delta-spark

# Connector JARs added to the session (Kafka source, Kafka client, Avro).
# NOTE(review): these are absolute, machine-specific paths — adjust them to the local Spark installation.
jar_paths = [
    "/Users/alor/opt/spark/jars/spark-sql-kafka-0-10_2.12-3.3.1.jar",
    "/Users/alor/opt/spark/jars/kafka-clients-3.3.1.jar",
    "/Users/alor/opt/spark/jars/spark-avro_2.12-3.3.1.jar",
]

builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    # Enable the Delta Lake SQL extension and catalog (set once; the duplicate entry was removed).
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # spark.jars expects a comma-separated list of JAR paths.
    .config("spark.jars", ",".join(jar_paths))
    # Classpath settings are JVM classpaths: entries are separated by the
    # platform path separator (":" on Linux/macOS, ";" on Windows), not by commas.
    .config("spark.driver.extraClassPath", os.pathsep.join(jar_paths))
    .config("spark.executor.extraClassPath", os.pathsep.join(jar_paths))
)

# configure_spark_with_delta_pip wraps the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Shortcut to the SparkContext; used by the RDD examples further down.
sc = spark.sparkContext

# Only log errors to keep the notebook output readable.
sc.setLogLevel("ERROR")



In [None]:
# Evaluate the session object so the notebook renders it as this cell's output.
spark

# Beispiel 1: Tabelle Drop, Filter, Sort

## Dataframe - Drop, Filter, Sort

In [None]:
# Load the semicolon-delimited CSV data under ../2_lab/data/simple/;
# the first row of the input is used as column names.
df = (
    spark.read
    .options(header=True, delimiter=";")
    .format("csv")
    .load("../2_lab/data/simple/")
)

# Preview the first 10 rows.
df.show(10)

In [None]:
# Keep rows with anzahl > 40, order them by name (descending)
# and remove the "date" column.
df2 = (
    df
    .filter(col("anzahl") > 40)
    .orderBy(col("name").desc())
    .drop("date")
)

# Preview the first 5 rows of the result.
df2.show(5)

In [None]:
# Register the *unprocessed* DataFrame as a temp view so that the SQL below
# really performs the drop (column selection), filter and sort itself.
# Registering df2 would make WHERE / ORDER BY redundant, because df2 is
# already filtered, sorted and reduced.
df.createOrReplaceTempView("df_view")

# SQL equivalent of the DataFrame pipeline: select the kept columns,
# filter on anzahl, sort by name descending and show the first 5 rows.
spark.sql("""
    SELECT name, anzahl
    FROM df_view
    WHERE anzahl > 40
    ORDER BY name DESC
    LIMIT 5
""").show()



## RDD - Drop, Filter, Sort

In [None]:
# Read the input under ../2_lab/data/simple/ as an RDD of text lines.
rdd=sc.textFile("../2_lab/data/simple/")

# Each element is one line as a plain string; show the first 10.
rdd.take(10)

In [None]:
# The first line of the RDD is treated as the header row.
header = rdd.first()

rdd2 = (
    rdd
    # drop every line that equals the header line
    .filter(lambda row: row != header)
    # split each line into its fields
    .map(lambda row: row.split(";"))
    # keep the first field, convert the second one to int; remaining fields are dropped
    .map(lambda fields: [fields[0], int(fields[1])])
    # keep only records whose second element (anzahl) is greater than 40
    .filter(lambda rec: rec[1] > 40)
    # sort by the first element (name), descending
    .sortBy(lambda rec: rec[0], ascending=False)
)

# Show the first 5 records.
rdd2.take(5)

In [None]:
# Convert the RDD of [name, anzahl] lists into a DataFrame with named columns and show 5 rows.
rdd2.toDF(["name","anzahl"]).show(5)

# Beispiel 2: Word Count Fließtext

## RDD - reduce by key

In [None]:
# Read the text file as an RDD of lines.
rdd4=sc.textFile("../2_lab/data/word-count/es-ist-nacht.txt")

# Each element is one line as a plain string; show the first 10.
rdd4.take(10)

In [None]:
rdd5 = (
    rdd4
    # split every line into single words and flatten them into one long dataset
    .flatMap(lambda line: line.split(" "))
    # map every word to a (word, 1) pair
    .map(lambda word: (word, 1))
    # add up the ones per word
    .reduceByKey(lambda x, y: x + y)
    # One sort on the composite key (count, word), both descending.
    # The previous sortByKey + sortBy combination needed two shuffles, and the
    # second sort is not guaranteed to keep the tie order produced by the first.
    .sortBy(lambda kv: (kv[1], kv[0]), ascending=False)
)

# Show the 10 most frequent words.
rdd5.take(10)

## Dataframe - explode und groupBy

In [None]:
# Load the text file as a DataFrame via the text reader.
df4 = spark.read.text("../2_lab/data/word-count/es-ist-nacht.txt")

# Preview the first 10 rows without truncating long values.
df4.show(n=10, truncate=False)

In [None]:
# Word count with the DataFrame API.
df5 = (
    df4
    # split each line on single spaces and emit one row per word
    .select(explode(split(col("value"), " ")).alias("word"))
    # count the occurrences of every distinct word
    .groupBy("word")
    .count()
    # most frequent first; ties ordered by word, descending
    .orderBy(col("count").desc(), col("word").desc())
)

df5.show(10, truncate=False)

# Beispiel 3: Audiodatenverarbeitung mit RDD

In [None]:
import numpy as np
from scipy.io.wavfile import write
from IPython.display import Audio
import librosa

In [None]:
#%pip install scipy
#%pip install librosa

In [None]:
# Data generation: two seconds of a 440 Hz sine (A4), written as a float32 WAV file.

sr = 22050  # sample rate
t = np.linspace(0, 2, num=2 * sr, endpoint=False)  # sample times
tone = np.sin(2 * np.pi * 440 * t) * 0.5  # A4 tone at half amplitude

write("a440.wav", sr, tone.astype(np.float32))


In [None]:
# Embed an audio player for the generated WAV file as the cell output.
Audio("a440.wav")

In [None]:
import numpy as np
from IPython.display import Audio, display
import matplotlib.pyplot as plt

# Shared signal parameters; generate_pulse and the generation loop below read `sr` and `t`.
sr = 22050      # sample rate
duration = 4.0  # length of every generated signal in seconds
t = np.linspace(0, duration, num=int(sr * duration), endpoint=False)  # sample times

def generate_pulse(freq, beat_interval, pulse_duration=0.05):
    """Create a train of short sine bursts on the shared time axis.

    Reads the module-level sample-time array `t` and sample rate `sr`.

    Args:
        freq: Frequency of the sine inside each burst.
        beat_interval: Distance between two burst starts, in seconds.
        pulse_duration: Length of a single burst, in seconds.

    Returns:
        Array shaped like `t`: zero outside the bursts, the sine inside them,
        with the whole signal scaled by 0.5.
    """
    total = len(t)
    step = int(beat_interval * sr)
    burst_len = int(pulse_duration * sr)
    out = np.zeros_like(t)
    for start in range(0, total, step):
        # clip the last burst at the end of the time axis
        stop = min(start + burst_len, total)
        out[start:stop] += np.sin(2 * np.pi * freq * t[start:stop])
    return 0.5 * out

# Example patterns (9 active entries; the tempo-change entry is commented out).
# Keys read by the generation loop:
#   "label"          - printed and used to build the WAV file name
#   "freq"           - single tone frequency, or
#   "freqs"          - list of frequencies cycled through beat by beat
#   "interval"       - distance between beat starts in seconds
#   "pulse_duration" - length of one pulse in seconds
examples = [
    {"label": "Two Pulse Pitch", "freqs": [300, 220], "interval": 0.5,"pulse_duration": 0.05},
    {"label": "Slow 440Hz Pulse", "freq": 440, "interval": 0.6, "pulse_duration": 0.05},
    {"label": "Double Pulse", "freq": 440, "interval": 0.4, "pulse_duration": 0.05},
    {"label": "Triple Pulse", "freq": 440, "interval": 0.3, "pulse_duration": 0.05},
    {"label": "Low Beat 110Hz", "freq": 110, "interval": 0.5, "pulse_duration": 0.05},
    {"label": "High Beat 880Hz", "freq": 880, "interval": 0.5, "pulse_duration": 0.05},
    {"label": "Short Fast Beat", "freq": 440, "interval": 0.1, "pulse_duration": 0.01},
    {"label": "Long Slow Beat", "freq": 440, "interval": 0.8, "pulse_duration": 0.2},
    #{"label": "Increasing Tempo", "freq": 440, "intervals": [0.6, 0.4, 0.2, 0.1], "pulse_duration": 0.05},
    {"label": "Alternating Pitch", "freqs": [220, 440, 660], "interval": 0.3, "pulse_duration": 0.05}
]

import os

# Generate, save, analyse and play every example pattern.
# NOTE: `y` and `ex` deliberately stay top-level names — the cells below
# reuse the values left over from the last iteration.

# scipy's wavfile.write does not create missing directories.
os.makedirs("audio_wav", exist_ok=True)

for ex in examples:
    # Same fallback for every branch and for the printed summary.
    pulse_duration = ex.get("pulse_duration", 0.05)

    if "intervals" in ex:
        # Tempo changes: one tone per listed interval, placed back to back.
        y = np.zeros_like(t)
        idx = 0
        for interval in ex["intervals"]:
            beat_samples = int(interval * sr)
            pulse = np.sin(2 * np.pi * ex["freq"] * t[:beat_samples])
            # zero-pad so the tone sits at offset idx within the full-length signal
            pulse = np.pad(pulse, (idx, len(t) - idx - len(pulse)), 'constant')
            y += pulse * 0.3
            idx += beat_samples
    elif "freqs" in ex:
        # Alternate pitches: cycle through the frequency list beat by beat.
        y = np.zeros_like(t)
        beat_samples = int(ex["interval"] * sr)
        # Use the entry's pulse duration (previously hard-coded to 0.05 s).
        pulse_samples = int(pulse_duration * sr)
        # A dedicated loop variable avoids shadowing names of the outer loop.
        for start in range(0, len(t), beat_samples):
            freq = ex["freqs"][(start // beat_samples) % len(ex["freqs"])]
            end = min(start + pulse_samples, len(t))
            y[start:end] += np.sin(2 * np.pi * freq * t[start:end]) * 0.5
    else:
        y = generate_pulse(ex["freq"], ex["interval"], pulse_duration)

    # Normalize to a peak of 1; a silent signal is left untouched to avoid 0/0.
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak

    # Save audio to a 16-bit PCM WAV file named after the label.
    write(f"audio_wav/{ex['label'].replace(' ', '_').lower()}.wav", sr, (y * 32767).astype(np.int16))

    # Beat detection with librosa.
    tempo, beat_frames = librosa.beat.beat_track(y=y.astype(np.float32), sr=sr)
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    # Depending on the librosa version, tempo may be a scalar or a one-element
    # array (assumption — verify against the installed version); .item()
    # yields a plain Python number in both cases.
    tempo_bpm = float(np.asarray(tempo).item())

    # "intervals" entries have no "interval" key; fall back to the list itself.
    interval_info = ex.get("interval", ex.get("intervals"))

    # Display results
    print(f"🔊 {ex['label']}")
    print(f"🕺 Interval: {interval_info}, Pulse Duration: {pulse_duration}  --> Estimated Tempo: {tempo_bpm:.2f} BPM")
    display(Audio(y, rate=sr))

In [None]:
import librosa
# === Beat detection ===
# Runs on whatever `y` and `sr` currently hold in the kernel; `y` is assigned
# inside the generation loop, so this analyses the last generated example.
tempo, beats = librosa.beat.beat_track(y=y.astype(np.float32), sr=sr)

In [None]:
# Inspect the beat positions returned by beat_track (presumably frame indices — confirm against the librosa docs).
beats

In [None]:
# Beat detection with librosa on the signal currently held in `y`
# (`y` and `ex` are the values left over from the generation loop).
tempo, beat_frames = librosa.beat.beat_track(y=y.astype(np.float32), sr=sr)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

# Depending on the librosa version, tempo may be a scalar or a one-element
# array (assumption — verify against the installed version). Calling float()
# on an array with ndim > 0 is deprecated in NumPy, so extract the single
# value explicitly.
tempo_bpm = float(np.asarray(tempo).item())

# Display results
print(f"🔊 {ex['label']}")
print(f"🕺 Estimated Tempo: {tempo_bpm:.2f} BPM")
print("🟡 Beat times (s):", ", ".join(f"{bt:.2f}" for bt in beat_times))
display(Audio(y, rate=sr))