<div style="border: 4px solid white; padding: 20px; background-color: #2596be; color: white;">

# <b>Explorative Datenanalyse mit Sparky</b>

#### <i>CAS Information Engineering - Modul: Big Data - FS 2024</i>

<b> Autoren: </b> Hassler Robin, Tschanz Daniel, Tsiantas Theofanis (Gruppe 10)

</div>

# Teil 1 - Preprocessing

In [None]:
# Notwendige Pakete
import sparky
import pyspark
import pyspark.sql
from pyspark.sql.functions import trim, col, to_date, when
import time

In [None]:
# Account name that is embedded in the Sparky application name when connecting.
zhawaccount: str = "tsianthe"

In [None]:
# Connect to Sparky under an account-specific application name, then obtain
# the Spark session used by all following cells.
spark_app_name = f"sparknotebook-{zhawaccount}"
# NOTE(review): the second argument (2) presumably sizes the requested
# resources -- confirm against the sparky documentation.
sc = sparky.connect(spark_app_name, 2)
spark = pyspark.sql.SparkSession.builder.getOrCreate()

## Dateien einlesen

In [None]:
def _load_csv(path):
    """Read a comma-delimited CSV file with a header row into a Spark DataFrame."""
    reader = spark.read.format("csv").options(header="true", delimiter=",")
    return reader.load(path)


# Load the four raw input tables; no schema is supplied here, the columns are
# typed explicitly in the cleansing cells.
df_customer = _load_csv("Customermaster.csv")
df_items = _load_csv("Itemmaster.csv")
df_orders = _load_csv("Orderlines.csv")
df_currency = _load_csv("ExchangeRates.csv")

In [None]:
# Preview two rows of every raw table.
# limit(2) is applied in Spark before the conversion to pandas, so only two
# rows per table are collected to the driver instead of the complete table.
for raw_table in (df_customer, df_items, df_orders, df_currency):
    display(raw_table.limit(2).toPandas())

## Data cleansing

Nur die relevanten Informationen der Dateien beibehalten.

In [None]:
# Customers: type the columns that are kept and discard the irrelevant ones.
# "Potential" is not cast because it is part of the dropped columns below.
df_customer = (
    df_customer
    .withColumn("Status", col("Status").cast("integer"))
    .withColumn("Entry Date", to_date(col("Entry Date"), "yyyyMMdd"))
    # NOTE(review): the list names "EntryDate" (no space) while the column
    # parsed above is "Entry Date". If only the spaced column exists, the
    # "EntryDate" entry presumably has no effect -- confirm which is intended.
    .drop(
        "CGC", "Industry", "Potential", "DiscountModel", "PriceList",
        "TargetFlag", "IndirectFlag", "Oldcustomernr", "Newcustomernr",
        "Incoterms", "CRMflag", "CRMID", "SPC", "EntryDate",
    )
)
# Only two rows are collected to the driver for the preview.
display(df_customer.limit(2).toPandas())

In [None]:
# Items: strip surrounding whitespace from the text columns.
for text_column in ("Item number", "Itemgroup", "Productgroup", "Itemtype"):
    df_items = df_items.withColumn(text_column, trim(col(text_column)))

# The item group is cast to integer after trimming; the columns that are not
# needed are removed.
df_items = (
    df_items
    .withColumn("Itemgroup", col("Itemgroup").cast("integer"))
    .drop("EntryDate", "Responsible")
)
# Only two rows are collected to the driver for the preview.
display(df_items.limit(2).toPandas())

In [None]:
# Orders: trim the customer and item number columns.
for key_column in ("Customernumber", "Item number"):
    df_orders = df_orders.withColumn(key_column, trim(col(key_column)))

# Quantity columns, all of which are read as integers.
quantity_columns = [
    "Ordered quantity",
    "Confirmed quantity",
    "Remaining quantity",
    "Allocated quantity",
    "Picking list quantity",
    "Delivered quantity",
    "Invoiced quantity",
]

# Date columns that are parsed with the yyyyMMdd pattern.
date_columns = [
    "Requested delivery date",
    "Confirmed delivery date",
    "Departure date",
    "Planning date",
    "Registration date",
]

# Target type per column: status and quantities become integers, the net
# price becomes a float.
cast_types = {"Status": "integer", "Net price": "float"}
cast_types.update(dict.fromkeys(quantity_columns, "integer"))
for column_name, spark_type in cast_types.items():
    df_orders = df_orders.withColumn(column_name, col(column_name).cast(spark_type))

for date_column in date_columns:
    df_orders = df_orders.withColumn(date_column, to_date(col(date_column), "yyyyMMdd"))

# The data set contains some invalid records from before May 2019; only rows
# registered after 2019-04-30 are kept.
df_orders = df_orders.filter(col("Registration date") > "2019-04-30")

# NOTE(review): this converts the complete orders table to pandas just to show
# its last two rows.
display(df_orders.toPandas().tail(2))

In [None]:
# Exchange rates: trim the currency codes, read the rate as a float and parse
# the activation date with the yyyyMMdd pattern.
df_currency = (
    df_currency
    .withColumn("LocalCurrency", trim(col("LocalCurrency")))
    .withColumn("ForeignCurrency", trim(col("ForeignCurrency")))
    .withColumn("ExchangeRate", col("ExchangeRate").cast("float"))
    .withColumn("ActiveDate", to_date(col("ActiveDate"), "yyyyMMdd"))
)
# Only two rows are collected to the driver for the preview.
display(df_currency.limit(2).toPandas())

## Abspeichern in CSV und Parquet

In [None]:
# Write the cleaned tables as CSV and report how long the export takes.
# NOTE(review): no header option is set on the writer, so the CSV output
# presumably contains no column names -- confirm that downstream readers
# expect this.
csv_targets = [
    (df_customer, "./cleanedData/Customers.csv"),
    (df_items, "./cleanedData/Items.csv"),
    (df_orders, "./cleanedData/Orders.csv"),
    (df_currency, "./cleanedData/Exchangerates.csv"),
]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is not affected by system clock adjustments.
save_time_start = time.perf_counter()
for cleaned_table, target_path in csv_targets:
    cleaned_table.write.mode("overwrite").csv(target_path)
save_time_finish = time.perf_counter()
print(f"Time taken to save: {save_time_finish - save_time_start:.2f} seconds")

In [None]:
# Write the cleaned tables as Parquet and report how long the export takes.
# NOTE(review): several columns contain spaces (e.g. "Item number"); older
# Spark releases reject such names when writing Parquet -- confirm that the
# cluster's Spark version accepts them.
parquet_targets = [
    (df_customer, "./cleanedData/Customers.parquet"),
    (df_items, "./cleanedData/Items.parquet"),
    (df_orders, "./cleanedData/Orders.parquet"),
    (df_currency, "./cleanedData/Exchangerates.parquet"),
]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is not affected by system clock adjustments.
save_time_start = time.perf_counter()
for cleaned_table, target_path in parquet_targets:
    cleaned_table.write.mode("overwrite").parquet(target_path)
save_time_finish = time.perf_counter()
print(f"Time taken to save: {save_time_finish - save_time_start:.2f} seconds")

In [None]:
# Stop the context returned by sparky.connect now that all exports are written.
sc.stop()