# Getting Started with DataFrames!

Most Important Link: https://spark.apache.org/docs/latest/api/scala/org/apache/spark/sql/Dataset.html

### Start a simple Spark Session

In [3]:
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.SparkSession


In [4]:
// Entry point to the DataFrame API: getOrCreate() reuses an already-running
// SparkSession if there is one, otherwise it builds a new one.
val spark = SparkSession.builder().getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@3472f5ad


### Create a DataFrame by reading a CSV with the Spark Session (a DataFrame is technically a `Dataset[Row]`)

In [10]:
// Read the CSV into a DataFrame.
//   header      = true -> the first line supplies the column names
//   inferSchema = true -> Spark infers each column's type instead of reading everything as string
val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("CitiGroup2006_2008")

df: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 4 more fields]


### Get first 5 rows

In [12]:
// head(n) collects the first n rows to the driver and returns them as an Array[Row].
df.head(5)

res1: Array[org.apache.spark.sql.Row] = Array([2006-01-03 00:00:00.0,490.0,493.8,481.1,492.9,1537660], [2006-01-04 00:00:00.0,488.6,491.0,483.5,483.8,1871020], [2006-01-05 00:00:00.0,484.4,487.8,484.0,486.2,1143160], [2006-01-06 00:00:00.0,488.8,489.0,482.0,486.2,1370250], [2006-01-09 00:00:00.0,486.0,487.4,483.0,483.9,1680740])


In [14]:
// Print the first 10 rows, one Row per line.
df.head(10).foreach(println)

[2006-01-03 00:00:00.0,490.0,493.8,481.1,492.9,1537660]
[2006-01-04 00:00:00.0,488.6,491.0,483.5,483.8,1871020]
[2006-01-05 00:00:00.0,484.4,487.8,484.0,486.2,1143160]
[2006-01-06 00:00:00.0,488.8,489.0,482.0,486.2,1370250]
[2006-01-09 00:00:00.0,486.0,487.4,483.0,483.9,1680740]
[2006-01-10 00:00:00.0,483.0,485.5,480.8,485.4,1365960]
[2006-01-11 00:00:00.0,495.8,495.8,485.8,489.8,1684440]
[2006-01-12 00:00:00.0,491.0,491.0,488.8,490.3,1230060]
[2006-01-13 00:00:00.0,491.0,491.9,487.3,489.2,940930]
[2006-01-17 00:00:00.0,485.1,487.0,482.7,484.3,1237830]


### Get column names

In [15]:
// Column names of the DataFrame as an Array[String].
df.columns

res4: Array[String] = Array(Date, Open, High, Low, Close, Volume)


### Find out DataTypes

In [16]:
// Print each column's name, data type and nullability as a tree.
// printSchema() only has a side effect (printing), so it is called with explicit
// parentheses; omitting them relies on auto-application, which is deprecated in
// Scala 2.13 and rejected by Scala 3.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



### Describe DataFrame Numerical Columns

In [19]:
// describe() builds a summary DataFrame (count, mean, stddev, min, max);
// show() prints it. The timestamp column Date does not appear in the summary.
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+-----------------+
|summary|              Open|             High|               Low|             Close|           Volume|
+-------+------------------+-----------------+------------------+------------------+-----------------+
|  count|               755|              755|               755|               755|              755|
|   mean| 386.0923178807949|390.6590596026489|380.80170860927143| 385.3421456953643|6308596.382781457|
| stddev|149.32301134820133|148.5151130063523|150.53136890891344|149.83310074439177| 8099892.56297633|
|    min|              54.4|             55.3|              30.5|              37.7|           632860|
|    max|             566.0|            570.0|             555.5|             564.1|        102869289|
+-------+------------------+-----------------+------------------+------------------+-----------------+



### Select columns: the `.transformation().action()` pattern

In [21]:
// select("Volume") is the transformation (a new DataFrame with just that column);
// show(5) is the action that runs it and prints the first 5 rows.
df.select("Volume").show(5)

+-------+
| Volume|
+-------+
|1537660|
|1871020|
|1143160|
|1370250|
|1680740|
+-------+
only showing top 5 rows



### Multiple Columns

In [22]:
// select accepts several column names; the result keeps them in the order given.
df.select("Date","Close").show(5)

+-------------------+-----+
|               Date|Close|
+-------------------+-----+
|2006-01-03 00:00:00|492.9|
|2006-01-04 00:00:00|483.8|
|2006-01-05 00:00:00|486.2|
|2006-01-06 00:00:00|486.2|
|2006-01-09 00:00:00|483.9|
+-------------------+-----+
only showing top 5 rows



### Creating New Columns

In [23]:
// withColumn returns a new DataFrame with an extra column "HighPlusLow",
// computed row by row as High + Low; df("High") looks a column up by name.
val df2 = df.withColumn("HighPlusLow", df("High")+df("Low"))

df2: org.apache.spark.sql.DataFrame = [Date: timestamp, Open: double ... 5 more fields]


### Show result

In [24]:
// Confirm the new column was appended after the original ones.
df2.columns

res12: Array[String] = Array(Date, Open, High, Low, Close, Volume, HighPlusLow)


In [25]:
// Print the schema of df2 to check the type of the derived HighPlusLow column.
// printSchema() only has a side effect (printing), so it is called with explicit
// parentheses; omitting them relies on auto-application, which is deprecated in
// Scala 2.13 and rejected by Scala 3.
df2.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- HighPlusLow: double (nullable = true)



In [26]:
// Print the first 5 rows of df2 as a table, including the new HighPlusLow column.
df2.show(5)

+-------------------+-----+-----+-----+-----+-------+-----------------+
|               Date| Open| High|  Low|Close| Volume|      HighPlusLow|
+-------------------+-----+-----+-----+-----+-------+-----------------+
|2006-01-03 00:00:00|490.0|493.8|481.1|492.9|1537660|974.9000000000001|
|2006-01-04 00:00:00|488.6|491.0|483.5|483.8|1871020|            974.5|
|2006-01-05 00:00:00|484.4|487.8|484.0|486.2|1143160|            971.8|
|2006-01-06 00:00:00|488.8|489.0|482.0|486.2|1370250|            971.0|
|2006-01-09 00:00:00|486.0|487.4|483.0|483.9|1680740|            970.4|
+-------------------+-----+-----+-----+-----+-------+-----------------+
only showing top 5 rows



### Recheck Head

In [27]:
// Same first 5 rows as an Array[Row]; each Row now carries a seventh value (HighPlusLow).
df2.head(5)

res15: Array[org.apache.spark.sql.Row] = Array([2006-01-03 00:00:00.0,490.0,493.8,481.1,492.9,1537660,974.9000000000001], [2006-01-04 00:00:00.0,488.6,491.0,483.5,483.8,1871020,974.5], [2006-01-05 00:00:00.0,484.4,487.8,484.0,486.2,1143160,971.8], [2006-01-06 00:00:00.0,488.8,489.0,482.0,486.2,1370250,971.0], [2006-01-09 00:00:00.0,486.0,487.4,483.0,483.9,1680740,970.4])


### Aliasing Columns (and selecting some more)

In [31]:
// Select HighPlusLow under the shorter name "HPL" together with Close,
// then print the first 5 rows.
df2
  .select(
    df2("HighPlusLow").alias("HPL"),
    df2("Close")
  )
  .show(5)

+-----------------+-----+
|              HPL|Close|
+-----------------+-----+
|974.9000000000001|492.9|
|            974.5|483.8|
|            971.8|486.2|
|            971.0|486.2|
|            970.4|483.9|
+-----------------+-----+
only showing top 5 rows



## Thank You!