In [None]:
# Import
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Start a Spark session named "Local" (or reuse one that already exists in
# this process), running against the local[*] master.
spark = (
    SparkSession.builder
    .appName("Local")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Load the holdings CSV: comma-delimited, first row is the header, and column
# types are inferred from the data instead of being read as plain strings.
df = spark.read.options(
    inferSchema=True,
    delimiter=",",
    header=True,
).csv("DATA/GOLD/Holding/Holding_data.csv")

# Print one "<column> <type>" line per field; the "Type()" suffix is stripped
# from the data type's string form to keep the listing short.
for field in df.schema.fields:
    print(field.name, str(field.dataType).replace("Type()", ""))

In [None]:
# Base window shared by the ranking cells below: one partition per
# (username, symbol) pair. Each cell adds its own ordering on top of it.
scrip_window = Window.partitionBy("username", "symbol")

In [None]:
# For every (username, symbol) pair, keep the single row whose gap between
# high_price and avg_price (rounded to 2 decimals) is the largest.
#
# row_number() assigns ranks arbitrarily among rows that share the same
# price_diff, so ordering by price_diff alone makes the surviving row depend
# on task scheduling. "date" is used as a secondary sort key so that, on a
# tie, the earliest date is selected and the result is reproducible.
df_max_price_diff = (
    df.withColumn("price_diff", F.round(F.col("high_price") - F.col("avg_price"), 2))
    .withColumn(
        "rank_price_diff",
        F.row_number().over(
            scrip_window.orderBy(F.col("price_diff").desc(), F.col("date").asc())
        ),
    )
    # Rank 1 is the row with the largest price_diff in its partition.
    .filter(F.col("rank_price_diff") == 1)
    # The helper rank column is intentionally left out of the result.
    .select(
        "username",
        "symbol",
        "date",
        "avg_price",
        "high_price",
        "price_diff",
    )
    .orderBy(F.col("price_diff").desc())
)

# Write the result as CSV with a header row, split into one sub-directory per
# username; any existing output at this path is replaced.
df_max_price_diff.write.partitionBy("username").option("header", True).mode(
    "overwrite"
).csv("output/price_diff")

In [None]:
# For every (username, symbol) pair, keep the single row whose profit
# (high_amount minus holding_amount, rounded to 2 decimals) is the largest.
#
# row_number() assigns ranks arbitrarily among rows that share the same
# profit, so ordering by profit alone makes the surviving row depend on task
# scheduling. "date" is used as a secondary sort key so that, on a tie, the
# earliest date is selected and the result is reproducible.
df_max_profit = (
    df.withColumn("profit", F.round(F.col("high_amount") - F.col("holding_amount"), 2))
    .withColumn(
        "rank_profit",
        F.row_number().over(
            scrip_window.orderBy(F.col("profit").desc(), F.col("date").asc())
        ),
    )
    # Rank 1 is the row with the largest profit in its partition.
    .filter(F.col("rank_profit") == 1)
    # The helper rank column is intentionally left out of the result.
    .select(
        "username",
        "symbol",
        "date",
        "holding_amount",
        "high_amount",
        "profit",
    )
    .orderBy(F.col("profit").desc())
)

# Write the result as CSV with a header row, split into one sub-directory per
# username; any existing output at this path is replaced.
df_max_profit.write.partitionBy("username").option("header", True).mode(
    "overwrite"
).csv("output/profit")