In [81]:
from IPython.display import display, HTML
import pyspark
from pyspark.sql import SparkSession, SQLContext, Row, DataFrame
display(HTML('<style>pre { white-space: pre !important; }</style>'))
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek, from_unixtime, to_timestamp, to_date, unix_timestamp, col, udf, lit, stddev, format_number, mean
from pyspark.sql import functions as F

In [38]:
spark : SparkSession = SparkSession.builder.appName("my_app").getOrCreate()

In [39]:
STOCKS = "../data/stocks.csv"

In [40]:
df : DataFrame = SparkSession.builder.getOrCreate().read.csv(STOCKS, header=True, inferSchema=True)

In [41]:
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [42]:
selected_df = df.filter(df["Close"] < 500).select("Open", "Close", "Volume")
selected_df.show()

+------------------+------------------+---------+
|              Open|             Close|   Volume|
+------------------+------------------+---------+
|        213.429998|        214.009998|123432400|
|        214.599998|        214.379993|150476200|
|        214.379993|        210.969995|138040000|
|            211.75|            210.58|119282800|
|        210.299994|211.98000499999998|111902700|
|212.79999700000002|210.11000299999998|115557400|
|209.18999499999998|        207.720001|148614900|
|        207.870005|        210.650002|151473000|
|210.11000299999998|            209.43|108223500|
|210.92999500000002|            205.93|148516900|
|        208.330002|        215.039995|182501900|
|        214.910006|            211.73|153038200|
|        212.079994|        208.069996|152038600|
|206.78000600000001|            197.75|220441900|
|202.51000200000001|        203.070002|266424900|
|205.95000100000001|        205.940001|466777500|
|        206.849995|        207.880005|430642100|


In [43]:
df.createOrReplaceTempView("stocks")

selected_df = spark.sql("SELECT Open, Close, Volume FROM stocks WHERE Close < 500")

selected_df.show()

+------------------+------------------+---------+
|              Open|             Close|   Volume|
+------------------+------------------+---------+
|        213.429998|        214.009998|123432400|
|        214.599998|        214.379993|150476200|
|        214.379993|        210.969995|138040000|
|            211.75|            210.58|119282800|
|        210.299994|211.98000499999998|111902700|
|212.79999700000002|210.11000299999998|115557400|
|209.18999499999998|        207.720001|148614900|
|        207.870005|        210.650002|151473000|
|210.11000299999998|            209.43|108223500|
|210.92999500000002|            205.93|148516900|
|        208.330002|        215.039995|182501900|
|        214.910006|            211.73|153038200|
|        212.079994|        208.069996|152038600|
|206.78000600000001|            197.75|220441900|
|202.51000200000001|        203.070002|266424900|
|205.95000100000001|        205.940001|466777500|
|        206.849995|        207.880005|430642100|


In [44]:
selected_df = df[(df["Open"] > 200) & (df["Close"] < 200)]
selected_df.show()

+----------+------------------+----------+----------+----------+---------+------------------+
|      Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+----------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
|2010-01-29|        201.079996|202.199995|190.250002|192.060003|311488100|         24.883208|
+----------+------------------+----------+----------+----------+---------+------------------+



In [45]:
selected_df = spark.sql("SELECT * FROM stocks WHERE Open > 200 AND Close < 200")

selected_df.show()

+----------+------------------+----------+----------+----------+---------+------------------+
|      Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+----------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
|2010-01-29|        201.079996|202.199995|190.250002|192.060003|311488100|         24.883208|
+----------+------------------+----------+----------+----------+---------+------------------+



In [46]:
df_year = df.withColumn("Year", year(df["Date"]))

df_year.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|
+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|2010|
|2010-01-11|212.

In [55]:
df_year = spark.sql("SELECT *, YEAR(Date) as Year FROM stocks")

df_year.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|
+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|2010|
|2010-01-11|212.

In [65]:
df_min_volume = df_year.groupBy("Year").agg(F.min("Volume").alias("minVolume"))

df_min_volume.show()

+----+---------+
|Year|minVolume|
+----+---------+
|2015| 13046400|
|2013| 41888700|
|2014| 14479600|
|2012| 43938300|
|2016| 11475900|
|2010| 39373600|
|2011| 44915500|
+----+---------+



In [66]:
df_year.createOrReplaceTempView("stocks")

df_min_volume = spark.sql("SELECT Year, MIN(Volume) as minVolume FROM stocks GROUP BY Year")

df_min_volume.show()

+----+---------+
|Year|minVolume|
+----+---------+
|2015| 13046400|
|2013| 41888700|
|2014| 14479600|
|2012| 43938300|
|2016| 11475900|
|2010| 39373600|
|2011| 44915500|
+----+---------+



In [67]:
df_month = df.withColumn("Month", month(df["Date"]))

df_month.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+-----+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Month|
+----------+------------------+------------------+------------------+------------------+---------+------------------+-----+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|    1|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|    1|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|    1|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|    1|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|    1|
|2010-01

In [68]:
df_month = spark.sql("SELECT *, MONTH(Date) as Month FROM stocks")

df_month.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+----+-----+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|Month|
+----------+------------------+------------------+------------------+------------------+---------+------------------+----+-----+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|    1|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|    1|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|    1|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|    1|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700

In [69]:
df_max_low = df_month.groupBy("Month").agg(F.max("Low").alias("maxLow"))

df_max_low.show()

+-----+-----------------+
|Month|           maxLow|
+-----+-----------------+
|   12|       585.500023|
|    1|       552.020004|
|    6|       644.470024|
|    3|610.3099900000001|
|    5|       628.900002|
|    9|       699.569977|
|    4|626.0000150000001|
|    8|673.5400089999999|
|    7|605.9999849999999|
|   10|       665.550026|
|   11|       594.170021|
|    2|545.6099780000001|
+-----+-----------------+



In [71]:
df_month.createOrReplaceTempView("stocks")

df_max_low = spark.sql("SELECT Month, MAX(Low) as maxLow FROM stocks GROUP BY Month")

df_max_low.show()

+-----+-----------------+
|Month|           maxLow|
+-----+-----------------+
|   12|       585.500023|
|    1|       552.020004|
|    6|       644.470024|
|    3|610.3099900000001|
|    5|       628.900002|
|    9|       699.569977|
|    4|626.0000150000001|
|    8|673.5400089999999|
|    7|605.9999849999999|
|   10|       665.550026|
|   11|       594.170021|
|    2|545.6099780000001|
+-----+-----------------+



In [72]:
df_month_year = df.withColumn("Month", month(df["Date"])).withColumn("Year", year(df["Date"]))

In [73]:
df_max_low = df_month_year.groupBy("Year", "Month").agg(F.max("Low").alias("maxLow"))

df_max_low.show()

+----+-----+------------------+
|Year|Month|            maxLow|
+----+-----+------------------+
|2012|   10|        665.550026|
|2010|    7|        260.300003|
|2010|   12|        325.099991|
|2015|    2|        131.169998|
|2014|    4|        589.799988|
|2015|   12|        117.809998|
|2016|    7|            103.68|
|2016|   11|        111.400002|
|2012|    8| 673.5400089999999|
|2013|    2|473.24997699999994|
|2012|    4| 626.0000150000001|
|2012|   12|        585.500023|
|2014|   10|        107.209999|
|2016|    5|             99.25|
|2014|   12|        115.290001|
|2013|    9|        503.479988|
|2013|   10|        525.110016|
|2014|    5|        628.900002|
|2016|    2|         96.650002|
|2013|   12| 566.4100269999999|
+----+-----+------------------+
only showing top 20 rows



In [74]:
df_month_year.createOrReplaceTempView("stocks")

df_max_low = spark.sql("SELECT Year, Month, MAX(Low) as maxLow FROM stocks GROUP BY Year, Month")

df_max_low.show()

+----+-----+------------------+
|Year|Month|            maxLow|
+----+-----+------------------+
|2012|   10|        665.550026|
|2010|    7|        260.300003|
|2010|   12|        325.099991|
|2015|    2|        131.169998|
|2014|    4|        589.799988|
|2015|   12|        117.809998|
|2016|    7|            103.68|
|2016|   11|        111.400002|
|2012|    8| 673.5400089999999|
|2013|    2|473.24997699999994|
|2012|    4| 626.0000150000001|
|2012|   12|        585.500023|
|2014|   10|        107.209999|
|2016|    5|             99.25|
|2014|   12|        115.290001|
|2013|    9|        503.479988|
|2013|   10|        525.110016|
|2014|    5|        628.900002|
|2016|    2|         96.650002|
|2013|   12| 566.4100269999999|
+----+-----+------------------+
only showing top 20 rows



In [82]:
df_stats = df.select(format_number(mean("High"), 2).alias("Mean_High"), 
                     format_number(stddev("High"), 2).alias("StdDev_High"))

df_stats.show()

+---------+-----------+
|Mean_High|StdDev_High|
+---------+-----------+
|   315.91|     186.90|
+---------+-----------+



In [80]:
result = spark.sql("SELECT FORMAT_NUMBER(AVG(High), 2) as Mean_High, FORMAT_NUMBER(STDDEV(High), 2) as StdDev_High FROM stocks")
mean_high = result.first()["Mean_High"]
stddev_high = result.first()["StdDev_High"]

print(f"Mean of High Price: {mean_high}")
print(f"Standard Deviation of High Price: {stddev_high}")

Mean of High Price: 315.91
Standard Deviation of High Price: 186.90
