# Docker 

In [2]:
# docker run -it --name pyspark-jupy --mount type=bind,source=C:/Users/soumy/OneDrive/Coding,target=/app/data --rm -p 8888:8888 quay.io/jupyter/pyspark-notebook

# Spark

## Spark Context

In [3]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
print(sc.version)

3.5.3


## Spark Session

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("demo").getOrCreate()

# Library

In [5]:
from pyspark.sql.functions import col, median, max, min

# Import Data

In [6]:
import string
from pyspark import SparkFiles
url_df = "https://raw.githubusercontent.com/plotly/datasets/refs/heads/master/all_stocks_5yr.csv"
sc.addFile(url_df)
filePath  = 'file://' + SparkFiles.get('all_stocks_5yr.csv')
all_stocks_5yr = spark.read.csv(filePath, header=True, inferSchema= True).repartition(10)
all_stocks_5yr = all_stocks_5yr.orderBy("Name", "date")
all_stocks_5yr.show(3)

+----------+-----+-----+-----+-----+-------+----+
|      date| open| high|  low|close| volume|Name|
+----------+-----+-----+-----+-----+-------+----+
|2013-02-08|45.07|45.35| 45.0|45.08|1824755|   A|
|2013-02-11|45.17|45.18|44.45| 44.6|2915405|   A|
|2013-02-12|44.81|44.95| 44.5|44.62|2373731|   A|
+----------+-----+-----+-----+-----+-------+----+
only showing top 3 rows



In [7]:
all_stocks_5yr.printSchema()

root
 |-- date: date (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: integer (nullable = true)
 |-- Name: string (nullable = true)



# Data Summarise

In [8]:
df_summaise = all_stocks_5yr.drop('Name')
df_summaise = df_summaise.\
    groupBy('date').\
    agg(
        median("open").alias("open"), 
        median("close").alias("close"), 
        max("high").alias("high"), 
        min("low").alias("low"),
        median("volume").alias("volume")
    ).orderBy("date").\
    repartition(10)
df_summaise.show(5)

+----------+------------------+------------------+-------+----+---------+
|      date|              open|             close|   high| low|   volume|
+----------+------------------+------------------+-------+----+---------+
|2014-03-21|             59.16|             58.23|1316.99|4.01|3666543.0|
|2016-02-01|             60.07|60.495000000000005|1094.79|2.11|3034645.5|
|2017-11-17| 75.00999999999999| 74.75999999999999| 1749.6|3.89|1978398.5|
|2014-04-02|            58.955|            58.965|1276.74|4.06|2060845.5|
|2013-10-18|54.040000000000006|             53.94|1056.02|3.51|2277441.0|
+----------+------------------+------------------+-------+----+---------+
only showing top 5 rows

