### Import Libs :

In [67]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, lead, avg
from dotenv import load_dotenv
import os
from pyspark.sql.window import Window

load_dotenv()

False

### Create Spark Session :

In [49]:
spark = (
    SparkSession.builder
    .appName("App")
    .config("spark.sql.parquet.int96RebaseModeInRead", "LEGACY")
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "LEGACY")
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .getOrCreate()
)


### Load Data Brut :

In [50]:
def load_data():
    output_path = "../../Data/Normal/btc_minute_data.parquet"
    df = spark.read.parquet(output_path)
    return df

df = load_data()
df.count()

1000

### Save Bronze Data :

In [51]:
def Save_Bronze_Local(Data_i):
    Data_i.write.mode('overwrite').format("parquet").save("../../Data/Bronze/")
    
Save_Bronze_Local(df)

### Check Null Values :

In [52]:
def CheckNull(Data_B):
    num_rows = Data_B.count()       
    Columns_list = Data_B.columns

    for c in Columns_list:
        num_null = Data_B.filter(col(c).isNull()).count()
        if num_null > 0:
            null_percent = (num_null / num_rows) * 100
            print(f"Column {c} has {num_null} null values ({null_percent:.2f}%)")
            
            if null_percent < 5:
                Data_B = Data_B.na.drop(subset=[c])
            else:
                try:
                    mean_value = Data_B.select(mean(c)).collect()[0][0]
                    Data_B = Data_B.fillna({c: mean_value})
                except:
                    mode_value = Data_B.groupBy(c).count().orderBy(col("count").desc()).first()[0]
                    Data_B = Data_B.fillna({c: mode_value})
        else:
            print(f"{c} : you dont have any null values")
    return Data_B

CheckNull(df)

open_time : you dont have any null values
open : you dont have any null values
high : you dont have any null values
low : you dont have any null values
close : you dont have any null values
volume : you dont have any null values
close_time : you dont have any null values
quote_asset_volume : you dont have any null values
number_of_trades : you dont have any null values
taker_buy_base_volume : you dont have any null values
taker_buy_quote_volume : you dont have any null values
ignore : you dont have any null values


DataFrame[open_time: timestamp_ntz, open: double, high: double, low: double, close: double, volume: double, close_time: timestamp_ntz, quote_asset_volume: double, number_of_trades: bigint, taker_buy_base_volume: double, taker_buy_quote_volume: double, ignore: string]

### Check duplicated Values :

In [53]:
def CheckDuplicated(Data_B):
    num_rows = Data_B.count()
    num_rows_no_duplicate = Data_B.distinct().count()
    num_duplicate_values = num_rows - num_rows_no_duplicate
    if num_duplicate_values == 0:
        print("you don't have any duplicated values !!")
    else:
        Data_B = Data_B.distinct()
        return Data_B
    return Data_B

CheckDuplicated(df)

you don't have any duplicated values !!


DataFrame[open_time: timestamp_ntz, open: double, high: double, low: double, close: double, volume: double, close_time: timestamp_ntz, quote_asset_volume: double, number_of_trades: bigint, taker_buy_base_volume: double, taker_buy_quote_volume: double, ignore: string]

### Rmove ignore column 

In [54]:
def Ignore_Remover(Data_B):
    Data_B = Data_B.drop(col("ignore"))
    return Data_B

df = Ignore_Remover(df)
df.printSchema()

root
 |-- open_time: timestamp_ntz (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- close_time: timestamp_ntz (nullable = true)
 |-- quote_asset_volume: double (nullable = true)
 |-- number_of_trades: long (nullable = true)
 |-- taker_buy_base_volume: double (nullable = true)
 |-- taker_buy_quote_volume: double (nullable = true)



### Create Column close_t_plus_10 

In [55]:
def Create_Column_close_t_plus_10(Data_B):
    window = Window.orderBy("open_time")
    Data_B = Data_B.withColumn("close_t_plus_10", lead("close", 10).over(window))
    return Data_B

df = Create_Column_close_t_plus_10(df)

### Check Null After Add  close_t_plus_10 Column:

In [56]:
df = CheckNull(df)

open_time : you dont have any null values
open : you dont have any null values
high : you dont have any null values
low : you dont have any null values
close : you dont have any null values
volume : you dont have any null values
close_time : you dont have any null values
quote_asset_volume : you dont have any null values
number_of_trades : you dont have any null values
taker_buy_base_volume : you dont have any null values
taker_buy_quote_volume : you dont have any null values


26/01/19 16:17:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:17:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:17:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:17:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Column close_t_plus_10 has 10 null values (1.00%)


### Create Entry Variable :

In [60]:
def Entry_Creator(Data_B):
    window = Window.orderBy("open_time")
    Data_B = Data_B.withColumn("return", lead("close", 10).over(window))
    Data_B = df.withColumn("return", lead("close").over(window) - col("close"))
    return Data_B

df = Entry_Creator(df)
df.show()

+-------------------+--------+--------+--------+--------+-------+--------------------+------------------+----------------+---------------------+----------------------+---------------+--------------------+
|          open_time|    open|    high|     low|   close| volume|          close_time|quote_asset_volume|number_of_trades|taker_buy_base_volume|taker_buy_quote_volume|close_t_plus_10|              return|
+-------------------+--------+--------+--------+--------+-------+--------------------+------------------+----------------+---------------------+----------------------+---------------+--------------------+
|2026-01-18 21:39:00|95387.81|95387.81|95380.76|95380.76| 0.5782|2026-01-18 21:39:...|     55152.8057001|             711|              0.11958|         11406.4454078|       95424.79|0.010000000009313226|
|2026-01-18 21:40:00|95380.77|95380.77|95380.76|95380.77|0.44398|2026-01-18 21:40:...|     42347.1505478|             102|               0.0723|           6896.029671|       95412.

26/01/19 16:38:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:38:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:38:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:38:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:38:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:38:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

### Check Null Values After Added return Column :

In [61]:
df = CheckNull(df)

26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

open_time : you dont have any null values
open : you dont have any null values


26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

high : you dont have any null values
low : you dont have any null values
close : you dont have any null values


26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

volume : you dont have any null values
close_time : you dont have any null values


26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

quote_asset_volume : you dont have any null values
number_of_trades : you dont have any null values


26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


taker_buy_base_volume : you dont have any null values
taker_buy_quote_volume : you dont have any null values


26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


close_t_plus_10 : you dont have any null values


26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 16:41:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/19 1

Column return has 1 null values (0.10%)


### Create Entry M_5 :

In [68]:
def M5_Creator(Data_B):
    window = Window.orderBy("open_time")
    Data_B = Data_B.withColumn("M_5", avg(lead("close", 5).over(window)))
    return Data_B

df = M5_Creator(df)


AnalysisException: It is not allowed to use a window function inside an aggregate function. Please use the inner window function in a sub-query.