<div style="border: 4px solid white; padding: 20px; background-color: #2596be; color: white;">

# <b>Explorative Datenanalyse mit Sparky</b>

#### <i>CAS Information Engineering - Modul: Big Data - FS 2024</i>

<b> Autoren: </b> Hassler Robin, Tschanz Daniel, Tsiantas Theofanis (Gruppe 10)

</div>

# Teil 2 - Map/Reduce Analyse

In [None]:
# Install the required libraries (seaborn is used for the line plot further below)
%pip install seaborn

In [None]:
# Required packages
import sparky
import pyspark
import pyspark.sql
from pyspark.sql.functions import trim, col, to_date, when
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import time

In [None]:
# ZHAW account name; it is interpolated into the session name passed to sparky.connect
zhawaccount = "tsianthe"

In [None]:
# Open the sparky connection under a per-account session name.
# NOTE(review): the meaning of the second argument (4) is not visible in this
# notebook - presumably a resource count; confirm against the sparky documentation.
session_name = f"sparknotebook-{zhawaccount}"
sc = sparky.connect(session_name, 4)

# Obtain the SparkSession through the builder (created if none exists yet)
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

## Dateien einlesen

In [None]:
def _read_csv(path):
    """Read one comma-separated CSV file with a header row into a Spark DataFrame.

    A fresh reader is built on every call so that no reader options are shared
    between the individual loads.
    """
    return spark.read.format("csv").option("header", "true").option("delimiter", ",").load(path)


# Read the cleaned CSV files and time how long the load calls take.
# NOTE(review): Spark evaluates DataFrames lazily, so this presumably measures
# mostly header/metadata handling rather than a full scan of the files - confirm
# before drawing conclusions from the comparison.
load_time_start = time.time()
df_customer = _read_csv("./cleanedData/Customers.csv")
df_items = _read_csv("./cleanedData/Items.csv")
df_orders = _read_csv("./cleanedData/Orders.csv")
df_currency = _read_csv("./cleanedData/Exchangerates.csv")
load_time_finish = time.time()
# The message previously said "save" although this cell only loads data
print(f"Time taken to load CSV files: {load_time_finish - load_time_start:.2f} seconds")

In [None]:
# Read the Parquet files (to compare the load time with the CSV variant).
# The former header/inferSchema arguments are CSV reader options; Parquet files
# carry their own schema, so the arguments were dropped.
load_time_start = time.time()
df_customer_p = spark.read.parquet("./cleanedData/Customers.parquet")
df_items_p = spark.read.parquet("./cleanedData/Items.parquet")
df_orders_p = spark.read.parquet("./cleanedData/Orders.parquet")
df_currency_p = spark.read.parquet("./cleanedData/Exchangerates.parquet")
load_time_finish = time.time()
# The message previously said "save" although this cell only loads data
print(f"Time taken to load Parquet files: {load_time_finish - load_time_start:.2f} seconds")

## Map/Reduce

### Monatlich neu akquirierte Kunden

In [None]:
# Inspect the schema of the customer DataFrame that was loaded from Parquet
df_customer_p.printSchema()

In [None]:
# Count customers per distinct value of the column at position 3, sorted by that value.
# NOTE(review): position 3 is presumably the acquisition date (the next step reads
# .year/.month from it) - confirm against the printed schema.
acquisition_values = df_customer_p.rdd.map(lambda row: row[3])
value_one_pairs = acquisition_values.map(lambda value: [value, 1])
customers_per_value = value_one_pairs.reduceByKey(lambda left, right: left + right)
rdd_customer = customers_per_value.sortByKey()
rdd_customer.collect()

In [None]:
def _month_key(date_count):
    """Re-key a (date, count) pair to ((year, month), count)."""
    day, count = date_count
    return ((day.year, day.month), count)


# Re-key the per-date counts by (year, month); the counts are not summed in this step
rdd_customer_month = rdd_customer.map(_month_key)
rdd_customer_month.collect()

In [None]:
# Sum the counts that share the same (year, month) key, then sort by that key
monthly_totals = rdd_customer_month.reduceByKey(lambda total, count: total + count)
rdd_aggregated = monthly_totals.sortByKey()
rdd_aggregated.collect()

In [None]:
# Define the plot inputs. The aggregated RDD is collected exactly once:
# every collect() call is a Spark action, so collecting separately for each
# list would trigger the computation twice.
aggregated_rows = rdd_aggregated.collect()
dates = [row[0] for row in aggregated_rows]
events = [row[1] for row in aggregated_rows]

In [None]:
def _month_label(year_month):
    """Format a (year, month) pair as a 'YYYY-MM' style string with a zero-padded month."""
    year, month = year_month
    return f"{year}-{month:02d}"


# Turn the (year, month) keys into strings for the x axis
x_labels = [_month_label(year_month) for year_month in dates]

In [None]:
# Build and configure the line plot of new customers per month
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x=x_labels, y=events, sort=False, ax=ax)

ax.set_title('Neue Kunden pro Monat')
ax.set_xlabel('Jahr-Monat')
ax.set_ylabel('Anzahl neuer Kunden')
ax.grid(True)

# Label only every 20th month. The tick positions are derived from the number
# of labels instead of a hard-coded upper bound, so the axis is neither
# stretched past the data nor left without labels for later months.
tick_step = 20
ax.set_xticks(list(range(0, len(x_labels), tick_step)))
ax.tick_params(axis='x', labelrotation=45)

# Run the layout pass last so the rotated tick labels are taken into account
fig.tight_layout()
plt.show()

## Spark-Verbindung beenden

In [None]:
# Stop the object returned by sparky.connect (presumably the SparkContext - confirm)
# so that its resources are released once the analysis is finished
sc.stop()