# Spark with Scala
Apache Toree - Jupyter Notebook testing

In [1]:
sc.parallelize(1 to 100).map(x => x*x).take(10)

Array(1, 4, 9, 16, 25, 36, 49, 64, 81, 100)

In [2]:
sc.parallelize(1 to 100).
  filter(x => x % 2 == 0).
  map(x => x * x).
  take(10)

Array(4, 16, 36, 64, 100, 144, 196, 256, 324, 400)

## Word Counter

In [3]:
val lines = Array("hello world", "hello one", "hello two")
sc.parallelize(lines).
  flatMap(line => line.split("\\W+")).
  map(word => (word,1)).
  reduceByKey(_+_).
  sortBy({case (word,count) => count},false).
  collect.
  mkString("\n")

(hello,3)
(two,1)
(one,1)
(world,1)

## Test CSV

In [4]:
// Grab URL contents
def getUrl(url:String):String = 
  scala.io.Source.fromURL(url).mkString

// Write file
def fileWrite(path:String,contents:String) = {
  import java.io.{PrintWriter,File}
  val writer = new PrintWriter(new File(path))
  writer.write(contents)
  writer.close
}

## Download Prices
Get the historical stock price of AAPL and save it in AAPL.csv

In [5]:
val symbol = "AAPL"
val baseUrl = "http://real-chart.finance.yahoo.com"
val url = s"${baseUrl}/table.csv?s=${symbol}&g=d&ignore=.csv"
val csv = getUrl(url)
val csvFile = s"${symbol}.csv"
fileWrite(csvFile, csv)
println(csvFile)

AAPL.csv


## Highest Prices
Find the days with the highest adjusted close prices.

In [6]:
val stockRdd = sc.textFile(csvFile).
  filter(line => line matches ".*\\d.*").
  map(line => line.split(",")).
  map(fields => (fields(6).toDouble,fields(0))).
  sortBy({case (close,date) => close},false)

stockRdd.take(5).foreach(println)

(141.460007,2017-03-20)
(141.419998,2017-03-22)
(140.919998,2017-03-23)
(140.690002,2017-03-16)
(140.639999,2017-03-24)


## Load CSV
Now lets use SQL to analyze the stock instead of directly manipulating records.

In [7]:
val df = sqlContext.read.
    format("com.databricks.spark.csv").
    option("header", "true").
    option("inferSchema", "true").
    load("AAPL.csv")

Name: Compile Error
Message: <console>:17: error: not found: value sqlContext
       val df = sqlContext.read.
                ^
StackTrace: 

## View DataFrame

In [8]:
df.select("Date","Adj Close").show

Name: Compile Error
Message: <console>:18: error: not found: value df
       df.select("Date","Adj Close").show
       ^
StackTrace: 

## SQL Queries
Register it as a SQL table.

In [9]:
df.registerTempTable("aapl")
sqlContext.sql("SELECT COUNT(1) AS row_count FROM aapl").show

Name: Compile Error
Message: <console>:18: error: not found: value df
       df.registerTempTable("aapl")
       ^
StackTrace: 

## Highest prices

In [10]:
sqlContext.sql("SELECT MAX(`Adj Close`) AS max_close FROM aapl").show

Name: Compile Error
Message: <console>:18: error: not found: value sqlContext
       sqlContext.sql("SELECT MAX(`Adj Close`) AS max_close FROM aapl").show
       ^
StackTrace: 

In [11]:
sqlContext.sql("""SELECT Date,`Adj Close` FROM aapl 
    ORDER BY `Adj Close` DESC LIMIT 5""").show

Name: Compile Error
Message: <console>:18: error: not found: value sqlContext
sqlContext.sql("""SELECT Date,`Adj Close` FROM aapl
^
StackTrace: 