<a href="https://colab.research.google.com/github/RajaSuhashKesari/MyDataEngineeringPractices/blob/main/Pyspark%20Programs/SparkDataFramesandSQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
from pyspark.sql import SparkSession

In [40]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType

In [41]:
# Create (or reuse) the Spark session for this notebook, registered under the app name "NYSE".
# `SparkSession.builder` is the documented entry point; it avoids instantiating the
# nested Builder class by hand.
spark = SparkSession.builder.appName("NYSE").getOrCreate()

In [42]:
# Peek at the first five raw text lines of the file to check the delimiter and
# whether there is a header row before defining a schema.
raw_lines = spark.sparkContext.textFile("/content/NYSE.csv")
raw_lines.take(5)

['NYSE,AEA,2010-02-08,4.42,4.42,4.21,4.24,205500,4.24',
 'NYSE,AEA,2010-02-05,4.42,4.54,4.22,4.41,194300,4.41',
 'NYSE,AEA,2010-02-04,4.55,4.69,4.39,4.42,233800,4.42',
 'NYSE,AEA,2010-02-03,4.65,4.69,4.50,4.55,182100,4.55',
 'NYSE,AEA,2010-02-02,4.74,5.00,4.62,4.66,222700,4.66']

In [43]:
# Explicit schema for the headerless NYSE CSV: nine nullable columns, in file order.
# stock_date is read as a plain string here (no date parsing is applied).
schema_nyse = StructType([
    StructField("exc_name", StringType(), True),
    StructField("stock_id", StringType(), True),
    StructField("stock_date", StringType(), True),
    StructField("open_price", DoubleType(), True),
    StructField("high_price", DoubleType(), True),
    StructField("low_price", DoubleType(), True),
    StructField("close_price", DoubleType(), True),
    StructField("volume", LongType(), True),
    StructField("adj_close", DoubleType(), True),
])

In [44]:
# Load the CSV with the explicit schema instead of letting Spark infer types.
# The file has no header row, so the column names come from schema_nyse.
df_with_schema = (
    spark.read
    .schema(schema_nyse)
    .format("csv")
    .option("delimiter", ",")
    .option("header", "False")
    .load("/content/NYSE.csv")
)

In [45]:
# Print the column names, types and nullability of the loaded DataFrame
# to confirm the explicit schema was applied.
df_with_schema.printSchema()

root
 |-- exc_name: string (nullable = true)
 |-- stock_id: string (nullable = true)
 |-- stock_date: string (nullable = true)
 |-- open_price: double (nullable = true)
 |-- high_price: double (nullable = true)
 |-- low_price: double (nullable = true)
 |-- close_price: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- adj_close: double (nullable = true)



In [46]:
# Display the first 10 rows as a sanity check on the parsed values.
df_with_schema.show(10)

+--------+--------+----------+----------+----------+---------+-----------+------+---------+
|exc_name|stock_id|stock_date|open_price|high_price|low_price|close_price|volume|adj_close|
+--------+--------+----------+----------+----------+---------+-----------+------+---------+
|    NYSE|     AEA|2010-02-08|      4.42|      4.42|     4.21|       4.24|205500|     4.24|
|    NYSE|     AEA|2010-02-05|      4.42|      4.54|     4.22|       4.41|194300|     4.41|
|    NYSE|     AEA|2010-02-04|      4.55|      4.69|     4.39|       4.42|233800|     4.42|
|    NYSE|     AEA|2010-02-03|      4.65|      4.69|      4.5|       4.55|182100|     4.55|
|    NYSE|     AEA|2010-02-02|      4.74|       5.0|     4.62|       4.66|222700|     4.66|
|    NYSE|     AEA|2010-02-01|      4.84|      4.92|     4.68|       4.75|194800|     4.75|
|    NYSE|     AEA|2010-01-29|      4.97|      5.05|     4.76|       4.83|222900|     4.83|
|    NYSE|     AEA|2010-01-28|      5.12|      5.22|     4.81|       4.98|283100

In [47]:
# Register the DataFrame as a temporary view named "nyse" so it can be queried via spark.sql.
# createOrReplaceTempView replaces any existing view of the same name, so this cell is safe to re-run.
df_with_schema.createOrReplaceTempView("nyse")

In [48]:
# Total traded volume per stock, largest first.
# Adjacent string literals are concatenated by Python into one SQL statement.
StockVolume = spark.sql(
    "select stock_id, sum(volume) as total_volume "
    "from nyse "
    "group by stock_id "
    "order by total_volume desc"
)

In [49]:
# Display the first 10 rows of the aggregated result (the query orders by total_volume descending).
StockVolume.show(10)

+--------+------------+
|stock_id|total_volume|
+--------+------------+
|     AMD| 47252808500|
|      AA| 42061448400|
|     AXP| 40263020300|
|     AVP| 32074528500|
|     AET| 30218027200|
|     ABT| 25664130200|
|     AMR| 22505621700|
|     ABX| 16691172100|
|     APC| 15555731900|
|     ADM| 15354593500|
+--------+------------+
only showing top 10 rows



In [51]:
# Persist the per-stock volume totals as CSV under /content/StockVolumes.
# The default save mode raises an error when the target path already exists, so
# re-running this cell (or Restart & Run All) would fail; "overwrite" keeps it re-runnable.
StockVolume.write.mode("overwrite").csv("/content/StockVolumes")