In [1]:
# 1. Start a simple Spark session
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init() #instanciar spark session. Es conveniente tener una unica sesion iniciada y actualizada (si existe la utiliza y si no la crea).
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate() #usamos todos los procesadores locales: local[*]

In [2]:
# 1. Start a simple Spark session
spark

In [5]:
# 2. Load the Walmart Stock CSV file, let Spark infer the data types
df = spark.read.csv("./walmart_stock.csv").show() 

+----------+------------------+------------------+------------------+------------------+--------+------------------+
|       _c0|               _c1|               _c2|               _c3|               _c4|     _c5|               _c6|
+----------+------------------+------------------+------------------+------------------+--------+------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|         59.549999|         58.91

In [6]:
# 3. Show the column names
df = spark.read.options(header=True).csv("./walmart_stock.csv") 
df.show()

+----------+------------------+------------------+------------------+------------------+--------+------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+--------+------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|         59.549999|         58.919998|             59.18| 6679300|51.616215000000004|
|2012-01-10|             59.43|59.709998999999996|             5

In [7]:
# 4. What does the Schema look like?
spark.read.options(header=True).csv("./walmart_stock.csv").printSchema() 
spark.read.options(header=True, inferSchema=True).csv("./walmart_stock.csv").printSchema() #cuando imprimimos el esquema sale que es todo string, con la opcion inferSchema=True salen los tipos de verdad

root
 |-- Date: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Adj Close: string (nullable = true)

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [8]:
# 5. Print out the first 5 rows
df.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



In [9]:
# 6. Use describe() to learn about the DataFrame
df.describe()

DataFrame[summary: string, Date: string, Open: string, High: string, Low: string, Close: string, Volume: string, Adj Close: string]

In [10]:
# 7. Format the numbers to show only 2 decimal places
from pyspark.sql.functions import round, col
data2 = df.withColumn("Open", round("Open", 2)).withColumn("High", round("High", 2)).withColumn("Low", round("Low", 2)).withColumn("Close", round("Close", 2)).withColumn("Adj Close", round("Adj Close", 2))
data2.show()

+----------+-----+-----+-----+-----+--------+---------+
|      Date| Open| High|  Low|Close|  Volume|Adj Close|
+----------+-----+-----+-----+-----+--------+---------+
|2012-01-03|59.97|61.06|59.87|60.33|12668800|    52.62|
|2012-01-04|60.21|60.35|59.47|59.71| 9593300|    52.08|
|2012-01-05|59.35|59.62|58.37|59.42|12768200|    51.83|
|2012-01-06|59.42|59.45|58.87| 59.0| 8069400|    51.46|
|2012-01-09|59.03|59.55|58.92|59.18| 6679300|    51.62|
|2012-01-10|59.43|59.71|58.98|59.04| 6907300|    51.49|
|2012-01-11|59.06|59.53|59.04| 59.4| 6365600|    51.81|
|2012-01-12|59.79| 60.0| 59.4| 59.5| 7236400|     51.9|
|2012-01-13|59.18|59.61|59.01|59.54| 7729300|    51.93|
|2012-01-17|59.87|60.11|59.52|59.85| 8500000|     52.2|
|2012-01-18|59.79|60.03|59.65|60.01| 5911400|    52.34|
|2012-01-19|59.93|60.73|59.75|60.61| 9234600|    52.86|
|2012-01-20|60.75|61.25|60.67|61.01|10378800|    53.21|
|2012-01-23|60.81|60.98|60.51|60.91| 7134100|    53.13|
|2012-01-24|60.75| 62.0|60.75|61.39| 7362800|   

In [11]:
# 8. Create a new DataFrame with a column called  'HV Ratio' that is the ratio of the High Price vs Volume of Stock traded for a day
data3 = data2.withColumn("HV Ratio", data2["High"]/data2["Volume"]).withColumn("HV Ratio", round("HV Ratio", 8))
data3.show()

+----------+-----+-----+-----+-----+--------+---------+--------+
|      Date| Open| High|  Low|Close|  Volume|Adj Close|HV Ratio|
+----------+-----+-----+-----+-----+--------+---------+--------+
|2012-01-03|59.97|61.06|59.87|60.33|12668800|    52.62| 4.82E-6|
|2012-01-04|60.21|60.35|59.47|59.71| 9593300|    52.08| 6.29E-6|
|2012-01-05|59.35|59.62|58.37|59.42|12768200|    51.83| 4.67E-6|
|2012-01-06|59.42|59.45|58.87| 59.0| 8069400|    51.46| 7.37E-6|
|2012-01-09|59.03|59.55|58.92|59.18| 6679300|    51.62| 8.92E-6|
|2012-01-10|59.43|59.71|58.98|59.04| 6907300|    51.49| 8.64E-6|
|2012-01-11|59.06|59.53|59.04| 59.4| 6365600|    51.81| 9.35E-6|
|2012-01-12|59.79| 60.0| 59.4| 59.5| 7236400|     51.9| 8.29E-6|
|2012-01-13|59.18|59.61|59.01|59.54| 7729300|    51.93| 7.71E-6|
|2012-01-17|59.87|60.11|59.52|59.85| 8500000|     52.2| 7.07E-6|
|2012-01-18|59.79|60.03|59.65|60.01| 5911400|    52.34|1.015E-5|
|2012-01-19|59.93|60.73|59.75|60.61| 9234600|    52.86| 6.58E-6|
|2012-01-20|60.75|61.25|6

In [12]:
# 9. What day had the Peak High in Price?
data2.groupBy("Date").agg({"High":"max"}).withColumnRenamed("max(High)", "MaxHigh").orderBy("MaxHigh", ascending=False).show(1)

+----------+-------+
|      Date|MaxHigh|
+----------+-------+
|2015-01-13|  90.97|
+----------+-------+
only showing top 1 row



In [13]:
# 10. What is tithe mean of the Close column
from pyspark.sql.functions import mean
data2.select(mean(data2["Close"])).withColumnRenamed("avg(Close)","Close Mean").withColumn("Close Mean", round("Close Mean", 2)).show()

+----------+
|Close Mean|
+----------+
|     72.39|
+----------+



In [14]:
# 11. What is the max and min of the Volume column?
from pyspark.sql.functions import max, min
data2.select(max(data2["Volume"])).withColumnRenamed("max(Volume)","Max Volume").show()
data2.select(min(data2["Volume"])).withColumnRenamed("min(Volume)","Min Volume").show()

#With SQL
DataTable = data2.registerTempTable("DataTable") 
spark.sql("select Max(Volume) from DataTable").withColumnRenamed("max(Volume)","Max Volume").show()
spark.sql("select Min(Volume) from DataTable").withColumnRenamed("min(Volume)","Min Volume").show()

+----------+
|Max Volume|
+----------+
|   9994400|
+----------+

+----------+
|Min Volume|
+----------+
|  10010500|
+----------+

+----------+
|Max Volume|
+----------+
|   9994400|
+----------+

+----------+
|Min Volume|
+----------+
|  10010500|
+----------+



In [15]:
# 12. How many days was the Close lower than 60 dollars?
spark.sql("select Close from DataTable where Close < 60").count()

81

In [16]:
# 13. What percentage of time was the High greater than 80 dollars?
(spark.sql("select High from DataTable where High > 80").count()/data2.count())*100

9.141494435612083

In [30]:
# 14. What is the Pearson correlation between High and Volume?
import pyspark.sql.functions as func

data3=data2.withColumn("Volume", func.round(data2["Volume"]).cast('integer'))
data4=data3.withColumn("High", func.round(data2["High"]).cast('integer'))

df2.stat.corr("Volume", "High")

-0.3392198653153861

In [31]:
# 15. What is the max High per year?
spark.sql("select Max(High), YEAR(Date) from DataTable group by YEAR(Date)").withColumnRenamed("year(CAST(Date AS DATE))","Year").withColumnRenamed("max(High)","Max High").orderBy("Year").show()

+--------+----+
|Max High|Year|
+--------+----+
|    77.6|2012|
|   81.37|2013|
|   88.09|2014|
|   90.97|2015|
|   75.19|2016|
+--------+----+



In [32]:
# 16. What is the average Close for each calendar month?
from pyspark.sql.functions import month 
data2.groupBy(month(data2.Date)).agg({"Close":"mean"}).withColumnRenamed("avg(Close)", "Close Mean").withColumnRenamed("month(Date)", "Month").orderBy("Month").withColumn("Close Mean", round("Close Mean", 2)).show()

#With SQL
spark.sql("select  MONTH(Date), avg(Close) from DataTable group by MONTH(Date)").withColumnRenamed("month(CAST(Date AS DATE))","Month").withColumnRenamed("avg(Close)","Close Mean").orderBy("Month").withColumn("Close Mean", round("Close Mean", 2)).show()


+-----+----------+
|Month|Close Mean|
+-----+----------+
|    1|     71.45|
|    2|     71.31|
|    3|     71.78|
|    4|     72.97|
|    5|     72.31|
|    6|      72.5|
|    7|     74.44|
|    8|     73.03|
|    9|     72.18|
|   10|     71.58|
|   11|     72.11|
|   12|     72.85|
+-----+----------+

+-----+----------+
|Month|Close Mean|
+-----+----------+
|    1|     71.45|
|    2|     71.31|
|    3|     71.78|
|    4|     72.97|
|    5|     72.31|
|    6|      72.5|
|    7|     74.44|
|    8|     73.03|
|    9|     72.18|
|   10|     71.58|
|   11|     72.11|
|   12|     72.85|
+-----+----------+

