In [0]:
from delta.tables import *
from pyspark.sql.types import*
from pyspark.sql.functions import* 
from pyspark.sql.window import Window

In [0]:
# Notebook overview: lists the dataset columns and the four analysis tasks.
# This is a bare string expression (not a docstring attached to a definition).
"""
Advanced Stock Market Data Analysis
Dataset:
Historical stock prices with:
stock_id, date, open_price, high_price, low_price, close_price, volume.

Tasks:
Compute daily price change percentage for each stock.
Identify stocks with the highest volatility (price fluctuations) in the last 30 days.
Find the top 5 stocks with the highest trading volume for each month.
Compute a 7-day moving average closing price for each stock."""

In [0]:
# Column layout of the sample stock table; every field is declared non-nullable.
schema = (
    StructType()
    .add("stock_id", StringType(), False)
    .add("date", StringType(), False)  # converted to a date column further down
    .add("open_price", DoubleType(), False)
    .add("high_price", DoubleType(), False)
    .add("low_price", DoubleType(), False)
    .add("close_price", DoubleType(), False)
    .add("volume", IntegerType(), False)
)

# Three consecutive days of prices for each of two tickers.
# Tuple order follows the schema: id, date, open, high, low, close, volume.
data = [
    ("AAPL", "2024-03-01", 150.0, 155.0, 149.0, 152.0, 5000000),
    ("AAPL", "2024-03-02", 152.0, 158.0, 151.0, 157.0, 6000000),
    ("AAPL", "2024-03-03", 157.0, 160.0, 156.0, 159.0, 5500000),
    ("GOOGL", "2024-03-01", 2800.0, 2850.0, 2780.0, 2825.0, 3000000),
    ("GOOGL", "2024-03-02", 2825.0, 2900.0, 2810.0, 2880.0, 3200000),
    ("GOOGL", "2024-03-03", 2880.0, 2950.0, 2870.0, 2920.0, 3100000),
]

# Build the DataFrame and turn the date strings into a date column in one chain.
df = spark.createDataFrame(data, schema=schema).withColumn("date", to_date("date"))

# Preview the sample rows.
df.show()


In [0]:
class stockDataAnalysis:
    """Window-based analyses over a DataFrame of daily stock prices.

    The methods use names that must already be in scope when they are called:
    the ``spark`` session (used by ``read_data``) and the PySpark helpers
    brought in by the notebook's import cell (``Window``, ``col``, ``lag``,
    ``stddev``, ``sum``, ``avg``, ...).
    """

    def read_data(self, path):
        """Load a CSV file (with a header row) into a DataFrame.

        No schema and no ``inferSchema`` option are supplied, so with Spark's
        default CSV settings every column is loaded as a string.

        Args:
            path: Location of the CSV data to read.

        Returns:
            The loaded DataFrame, or ``None`` if reading raised an exception
            (the error is printed rather than propagated).
        """
        try:
            return spark.read.format('csv').option('header', True).load(path)
        except Exception as e:
            # Best-effort by design: report the problem and let the caller check for None.
            print(f"Error occured during file reading: {e}")
            return None

    @staticmethod
    def _pct_change_expr(price_col):
        """Column expression: % change of ``price_col`` versus ``previous_day_price``."""
        previous = col("previous_day_price")
        return ((col(price_col) - previous) / previous) * 100

    # Compute daily price change percentage for each stock.
    def dailyPriceChange(self, df, lag_key, partition_col, order_col):
        """Add the row-over-row percentage change of a price column.

        Args:
            df: Input DataFrame.
            lag_key: Name of the price column to compare with its previous row.
            partition_col: Column(s) identifying one price series (e.g. the stock id).
            order_col: Column that orders rows inside each series (e.g. the date).

        Returns:
            ``df`` plus ``previous_day_price`` and ``percentage`` columns. The
            first row of every series is dropped because it has no previous row.
        """
        window_spec = Window.partitionBy(partition_col).orderBy(order_col)
        with_previous = df.withColumn(
            "previous_day_price", lag(lag_key).over(window_spec)
        )
        # The first row of each partition has nothing to compare against.
        with_previous = with_previous.filter(col("previous_day_price").isNotNull())
        # Use the column the caller asked for instead of a hard-coded name.
        return with_previous.withColumn("percentage", self._pct_change_expr(lag_key))

    # Identify stocks with the highest volatility (price fluctuations) in the last 30 days.
    def stocksVolatility(self, df, lag_key, partition_col, order_col, days=30):
        """Rank series by the standard deviation of their daily % price change.

        Args:
            df: Input DataFrame; must contain a ``date`` column.
            lag_key: Name of the price column used for the % change.
            partition_col: Column(s) identifying one price series.
            order_col: Column that orders rows inside each series.
            days: Only rows whose ``date`` is later than ``current_date() - days``
                are considered. Defaults to 30.

        Returns:
            One row per ``partition_col`` value with ``volatility`` (stddev of
            the % change) and ``rank`` (dense rank, 1 = most volatile).
        """
        window_spec = Window.partitionBy(partition_col).orderBy(order_col)
        recent_df = df.filter(col("date") > date_sub(current_date(), days))
        with_previous = recent_df.withColumn(
            "previous_day_price", lag(lag_key).over(window_spec)
        )
        # Deliberately no fillna(0): it would overwrite numeric nulls in *every*
        # column and make the first row of each series divide by a zero price.
        # Leaving the first row's percentage null is enough, because stddev
        # ignores null inputs.
        returns_df = with_previous.withColumn(
            "percentage", self._pct_change_expr(lag_key)
        )
        # Aggregate over the same key that defines a series for the lag above.
        volatility_df = returns_df.groupBy(partition_col).agg(
            stddev("percentage").alias("volatility")
        )
        # Global (un-partitioned) ranking window; it runs on the aggregated
        # frame, which holds one row per series.
        window_rank = Window.orderBy(desc("volatility"))
        return volatility_df.withColumn("rank", dense_rank().over(window_rank))

    # Find the top 5 stocks with the highest trading volume for each month.
    def highestTradingVolumes(self, df, groupBy_key, sum_key, top_n=5):
        """Return the highest-volume groups for every calendar month.

        Args:
            df: Input DataFrame; must contain a ``date`` column.
            groupBy_key: Column(s) to total the volume for (e.g. the stock id).
            sum_key: Name of the volume column to add up.
            top_n: Highest rank to keep per month. Defaults to 5.

        Returns:
            Rows of (``groupBy_key``, ``year_month``, ``total_volumes``, ``rank``)
            with ``rank <= top_n``. ``rank()`` gives tied totals the same rank,
            so a month can contain more than ``top_n`` rows.
        """
        monthly_df = df.withColumn("year_month", date_format(col("date"), "yyyy-MM"))
        # `sum` here is the Spark aggregate made available by the notebook's
        # star import of pyspark.sql.functions, not Python's built-in sum.
        totals_df = monthly_df.groupBy(groupBy_key, "year_month").agg(
            sum(sum_key).alias("total_volumes")
        )
        window = Window.partitionBy("year_month").orderBy(desc("total_volumes"))
        ranked_df = totals_df.withColumn("rank", rank().over(window))
        # Keep only the top `top_n` ranks within each month.
        return ranked_df.filter(col("rank") <= top_n)

    # Compute a 7-day moving average closing price for each stock.
    def stockMovingAvg(self, df, partition_col, order_col, avg_key, window_size=7):
        """Add a trailing moving average of a column.

        The window is row-based: it covers the current row and the
        ``window_size - 1`` rows before it, so it spans exactly ``window_size``
        days only when each series has one row per day without gaps.

        Args:
            df: Input DataFrame.
            partition_col: Column(s) identifying one price series.
            order_col: Column that orders rows inside each series.
            avg_key: Name of the column to average.
            window_size: Number of rows in the window. Defaults to 7.

        Returns:
            ``df`` plus a ``moving_average`` column.

        Raises:
            ValueError: If ``window_size`` is smaller than 1.
        """
        if window_size < 1:
            raise ValueError("window_size must be at least 1")
        window_avg = (
            Window.partitionBy(partition_col)
            .orderBy(order_col)
            .rowsBetween(-(window_size - 1), Window.currentRow)
        )
        return df.withColumn("moving_average", avg(avg_key).over(window_avg))
    
# Instantiating the class
stock_inst = stockDataAnalysis()

# Reading the file from path
stocks_df = stock_inst.read_data("/Volumes/rbc/rbcschema/raw/stocks")

# read_data() reports a failed load by printing the error and returning None.
# Stop here with a clear message instead of letting the calls below fail with
# an AttributeError on None.
if stocks_df is None:
    raise RuntimeError("Stock data could not be loaded; see the error printed above.")

# Calling the analysis methods
percentage_df = stock_inst.dailyPriceChange(stocks_df, "closing_price", "stock_id", "date")

ranking_df = stock_inst.stocksVolatility(stocks_df, "closing_price", "stock_id", "date")

top5_stocks_df = stock_inst.highestTradingVolumes(stocks_df, "stock_id", "volume")

moving_avg_df = stock_inst.stockMovingAvg(stocks_df, "stock_id", "date", "closing_price")