# Session : 1

### Main entry point for DataFrame and SQL functionality

In [1]:
from pyspark.sql import SparkSession
# SparkSession --> entry point to programming Spark with the Dataset and DataFrame API

### Start SPARK session

In [2]:
# Build (or reuse) the SparkSession used throughout this notebook.
#   builder     ---> factory used to create SparkSession instances
#   appName     ---> sets the application name shown in the Spark web UI
#   getOrCreate ---> returns the already-running session if there is one, otherwise creates it
session_builder = SparkSession.builder.appName('Basics')
spark = session_builder.getOrCreate()


### Import JSON File

In [3]:
# Load the mock JSON records into a DataFrame; no schema is given, so Spark infers one.
mock_json_path = '/home/bluepi/Documents/MOCK_DATA.json'
df = spark.read.json(mock_json_path)

### Show the Data

In [4]:
# Print the DataFrame's rows as a formatted text table.
df.show()

+----+----------+
| age|      name|
+----+----------+
|null|Wilhelmina|
|null| Raffaello|
|  27|   Johnath|
|  29|    Luella|
|  41|     Kacie|
|null|     Geoff|
|  37|  Jocelyne|
|  29|     Dodie|
|null|   Desmund|
|null|    Kaiser|
+----+----------+



### PRINT Schema of DataFrame

In [5]:

# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# List of the DataFrame's column names.
df.columns

['age', 'name']

In [7]:
# describe() builds a summary DataFrame (count, mean, stddev, min, max per column); show() prints it.
df.describe().show()

+-------+----------------+----------+
|summary|             age|      name|
+-------+----------------+----------+
|  count|               5|        10|
|   mean|            32.6|      null|
| stddev|6.06630035524124|      null|
|    min|              27|   Desmund|
|    max|              41|Wilhelmina|
+-------+----------------+----------+



### Manually set DataFrame Schema

In [8]:
from pyspark.sql.types import StructField,IntegerType,StringType,StructType

In [9]:
# Each StructField takes three arguments ---> name, data type, nullable flag
age_field = StructField('age', IntegerType(), True)
name_field = StructField('name', StringType(), True)

# The schema is described as an ordered list of fields
data_schema = [age_field, name_field]

In [10]:
# Wrap the field list in a StructType: this is the schema we want the data to follow
final_struc = StructType(data_schema)

In [11]:
# Re-read the same JSON file, this time applying "final_struc" instead of an inferred schema
json_reader = spark.read.schema(final_struc)
df = json_reader.json('/home/bluepi/Documents/MOCK_DATA.json')

In [12]:
# Confirm the manual schema took effect: "age" is now reported as integer rather than long.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



# SPARK DataFrame Basics

### Select v/s Grab Data

In [13]:
# Indexing a DataFrame by column name yields a Column object, not the data itself
df['age']

Column<b'age'>

In [14]:
# Verify the type: bracket indexing gives back a pyspark.sql.column.Column
type( df['age'] )

pyspark.sql.column.Column

In [15]:
# To get a DataFrame instead of a Column, use select()
df.select('age')

DataFrame[age: int]

In [16]:
# Print the contents of the single-column DataFrame
df.select('age').show()

+----+
| age|
+----+
|null|
|null|
|  27|
|  29|
|  41|
|null|
|  37|
|  29|
|null|
|null|
+----+



In [17]:
# Verify the type: select() gives back a DataFrame
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [18]:
# Several columns can be selected at once by passing a list of their names
wanted_columns = ['age', 'name']
df.select(wanted_columns).show()

+----+----------+
| age|      name|
+----+----------+
|null|Wilhelmina|
|null| Raffaello|
|  27|   Johnath|
|  29|    Luella|
|  41|     Kacie|
|null|     Geoff|
|  37|  Jocelyne|
|  29|     Dodie|
|null|   Desmund|
|null|    Kaiser|
+----+----------+



### Create NEW Columns or replace EXISTING Columns

In [19]:
# withColumn(name, column) ---> returns a new DataFrame that includes the extra column
doubled_age = df['age'] * 2
df.withColumn('new_age', doubled_age).show()

+----+----------+-------+
| age|      name|new_age|
+----+----------+-------+
|null|Wilhelmina|   null|
|null| Raffaello|   null|
|  27|   Johnath|     54|
|  29|    Luella|     58|
|  41|     Kacie|     82|
|null|     Geoff|   null|
|  37|  Jocelyne|     74|
|  29|     Dodie|     58|
|null|   Desmund|   null|
|null|    Kaiser|   null|
+----+----------+-------+



In [20]:
# Rename a column: withColumnRenamed(existing_name, new_name)
df.withColumnRenamed('age','AGE').show()

+----+----------+
| AGE|      name|
+----+----------+
|null|Wilhelmina|
|null| Raffaello|
|  27|   Johnath|
|  29|    Luella|
|  41|     Kacie|
|null|     Geoff|
|  37|  Jocelyne|
|  29|     Dodie|
|null|   Desmund|
|null|    Kaiser|
+----+----------+



### How to add a constant column

In [1]:
# Add a constant column with lit(), which creates a Column holding a literal value.
# PySpark functions are imported from pyspark.sql.functions; the Scala-style
# "import org.apache.spark.sql.functions.lit" is a SyntaxError in Python.
from pyspark.sql.functions import lit

# Add the constant string "myval" to every row in a new column named "newcol":
newdf = df.withColumn("newcol", lit("myval"))

### Registering DataFrame as a SQL temp VIEW

In [21]:
# Expose the DataFrame to Spark SQL as a temporary view named "MOCK_DATA"
df.createOrReplaceTempView("MOCK_DATA")

### To run SQL Queries Directly

In [22]:
# Query the temp view with plain SQL; the result comes back as a DataFrame
name_query = "SELECT * FROM MOCK_DATA WHERE name IN ('Geoff','Kaiser')"
result = spark.sql(name_query)

In [23]:
# Print the rows returned by the SQL query
result.show()

+----+------+
| age|  name|
+----+------+
|null| Geoff|
|null|Kaiser|
+----+------+



# Session  : 2

In [24]:
# Load the AAPL price history; the first line is a header and column types are inferred
aapl_csv_path = '/home/bluepi/Documents/BluePi/Tutorial/AAPL.csv'
df = spark.read.csv(aapl_csv_path, header=True, inferSchema=True)

In [25]:
# Inspect the column types Spark inferred from the CSV
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [26]:
# Preview the loaded price data
df.show()

+-------------------+----------+----------+----------+----------+----------+---------+
|               Date|      Open|      High|       Low|     Close| Adj Close|   Volume|
+-------------------+----------+----------+----------+----------+----------+---------+
|2014-09-29 00:00:00|100.589996|100.690002| 98.040001| 99.620003|  93.51429|142718700|
|2014-10-06 00:00:00| 99.949997|102.379997| 98.309998|100.730003| 94.556244|280258200|
|2014-10-13 00:00:00|101.330002|101.779999|     95.18| 97.669998| 91.683792|358539800|
|2014-10-20 00:00:00|     98.32|105.489998| 98.220001|105.220001| 98.771042|358532900|
|2014-10-27 00:00:00|104.849998|108.040001|104.699997|     108.0|101.380676|220230600|
|2014-11-03 00:00:00|108.220001|110.300003|107.720001|109.010002|102.328766|199952900|
|2014-11-10 00:00:00|109.019997|114.190002|108.400002|    114.18|107.646675|205166700|
|2014-11-17 00:00:00|114.269997|    117.57|113.300003|116.470001|109.805626|233414700|
|2014-11-24 00:00:00|116.849998|    119.75|

In [27]:
# head(1) returns a list holding the first Row; [0] extracts that Row
df.head(1)[0]

Row(Date=datetime.datetime(2014, 9, 29, 0, 0), Open=100.589996, High=100.690002, Low=98.040001, Close=99.620003, Adj Close=93.51429, Volume=142718700)

### **filter()** --> filter rows using given condition

In [28]:
# Method 1: pass the condition as a SQL expression string
df.filter("Close < 500").show()

+-------------------+----------+----------+----------+----------+----------+---------+
|               Date|      Open|      High|       Low|     Close| Adj Close|   Volume|
+-------------------+----------+----------+----------+----------+----------+---------+
|2014-09-29 00:00:00|100.589996|100.690002| 98.040001| 99.620003|  93.51429|142718700|
|2014-10-06 00:00:00| 99.949997|102.379997| 98.309998|100.730003| 94.556244|280258200|
|2014-10-13 00:00:00|101.330002|101.779999|     95.18| 97.669998| 91.683792|358539800|
|2014-10-20 00:00:00|     98.32|105.489998| 98.220001|105.220001| 98.771042|358532900|
|2014-10-27 00:00:00|104.849998|108.040001|104.699997|     108.0|101.380676|220230600|
|2014-11-03 00:00:00|108.220001|110.300003|107.720001|109.010002|102.328766|199952900|
|2014-11-10 00:00:00|109.019997|114.190002|108.400002|    114.18|107.646675|205166700|
|2014-11-17 00:00:00|114.269997|    117.57|113.300003|116.470001|109.805626|233414700|
|2014-11-24 00:00:00|116.849998|    119.75|

In [29]:
# Chain select() after filter() to keep only the "Open" column of the matching rows
df.filter("Close < 500").select('Open').show()

+----------+
|      Open|
+----------+
|100.589996|
| 99.949997|
|101.330002|
|     98.32|
|104.849998|
|108.220001|
|109.019997|
|114.269997|
|116.849998|
|118.809998|
|114.099998|
|110.699997|
|112.160004|
|113.790001|
|108.290001|
|112.599998|
|107.839996|
|113.739998|
|118.050003|
|118.550003|
+----------+
only showing top 20 rows



In [30]:
# Method 2: express the same condition as a Column comparison instead of a SQL string
df.filter(df['Close'] < 500 ).select('Open').show()

+----------+
|      Open|
+----------+
|100.589996|
| 99.949997|
|101.330002|
|     98.32|
|104.849998|
|108.220001|
|109.019997|
|114.269997|
|116.849998|
|118.809998|
|114.099998|
|110.699997|
|112.160004|
|113.790001|
|108.290001|
|112.599998|
|107.839996|
|113.739998|
|118.050003|
|118.550003|
+----------+
only showing top 20 rows



In [31]:
# Filtering based on multiple conditions
#
# Mistake 1: Python's `and` cannot combine Column expressions -- use `&` instead
# df.filter( df['Close'] <500 and df['Close'] > 450 ).show()
#
# Mistake 2: `&` binds more tightly than `<` and `>`, so each comparison
# must be wrapped in its own parentheses
# df.filter( df['Close'] <500 & df['Close'] > 450 ).show()

df.filter( (df['Close'] <105) & (df['Close'] > 100) ).show()

+-------------------+----------+----------+----------+----------+----------+---------+
|               Date|      Open|      High|       Low|     Close| Adj Close|   Volume|
+-------------------+----------+----------+----------+----------+----------+---------+
|2014-10-06 00:00:00| 99.949997|102.379997| 98.309998|100.730003| 94.556244|280258200|
|2016-01-18 00:00:00| 98.410004|101.459999| 93.419998|101.419998| 97.245926|243384100|
|2016-02-29 00:00:00| 96.860001|    103.75| 96.650002|103.010002|  99.30645|201803800|
|2016-03-07 00:00:00|102.389999|102.830002|100.150002|102.260002| 98.583405|155514300|
|2016-05-23 00:00:00| 95.870003|100.730003| 95.669998|100.349998| 97.331093|203888300|
|2016-07-25 00:00:00|     98.25|104.550003| 96.419998|104.209999|101.074974|256571000|
|2016-09-05 00:00:00|107.900002|108.760002|103.129997|103.129997|100.569328|168803700|
+-------------------+----------+----------+----------+----------+----------+---------+



In [32]:
# `~` negates a condition: keep rows where Close < 105 and NOT (Close > 100)
df.filter( (df['Close'] <105) & ~(df['Close'] > 100) ).show()

+-------------------+----------+----------+---------+---------+---------+---------+
|               Date|      Open|      High|      Low|    Close|Adj Close|   Volume|
+-------------------+----------+----------+---------+---------+---------+---------+
|2014-09-29 00:00:00|100.589996|100.690002|98.040001|99.620003| 93.51429|142718700|
|2014-10-13 00:00:00|101.330002|101.779999|    95.18|97.669998|91.683792|358539800|
|2016-01-04 00:00:00|102.610001|105.849998|    96.43|96.959999| 92.96949|343790200|
|2016-01-11 00:00:00| 98.970001|101.190002|95.360001|97.129997|93.132492|303513300|
|2016-01-25 00:00:00|101.519997|101.529999|92.389999|97.339996|93.333847|380336500|
|2016-02-01 00:00:00| 96.470001| 97.330002|93.690002|94.019997|90.150482|217154800|
|2016-02-08 00:00:00| 93.129997| 96.349998|92.589996|93.989998|90.610756|231122300|
|2016-02-15 00:00:00| 95.019997| 98.889999|94.610001|96.040001|92.587051|168316300|
|2016-02-22 00:00:00| 96.309998| 98.019997|    93.32|96.910004|93.425774|159

Using the __collect()__ method to return a **list of Row objects** instead of printing the result

In [33]:

# Filter on an exact "Low" value and print the matching rows
df.filter( df['Low'] == 95.18 ).show()

+-------------------+----------+----------+-----+---------+---------+---------+
|               Date|      Open|      High|  Low|    Close|Adj Close|   Volume|
+-------------------+----------+----------+-----+---------+---------+---------+
|2014-10-13 00:00:00|101.330002|101.779999|95.18|97.669998|91.683792|358539800|
+-------------------+----------+----------+-----+---------+---------+---------+



In [34]:
# collect() returns the matching rows as a Python list of Row objects instead of printing them
df.filter( df['Low'] == 95.18 ).collect()

[Row(Date=datetime.datetime(2014, 10, 13, 0, 0), Open=101.330002, High=101.779999, Low=95.18, Close=97.669998, Adj Close=91.683792, Volume=358539800)]

In [35]:
# Keep the collected rows in "result" (a plain Python list) so they can be reused
low_matches = df.filter(df['Low'] == 95.18)
result = low_matches.collect()
result

[Row(Date=datetime.datetime(2014, 10, 13, 0, 0), Open=101.330002, High=101.779999, Low=95.18, Close=97.669998, Adj Close=91.683792, Volume=358539800)]

In [36]:
# Take the first (and only) Row out of the collected list and display it
row = result[0]
row

Row(Date=datetime.datetime(2014, 10, 13, 0, 0), Open=101.330002, High=101.779999, Low=95.18, Close=97.669998, Adj Close=91.683792, Volume=358539800)

In [37]:
# Convert the Row into a plain dict keyed by column name
row.asDict()

{'Date': datetime.datetime(2014, 10, 13, 0, 0),
 'Open': 101.330002,
 'High': 101.779999,
 'Low': 95.18,
 'Close': 97.669998,
 'Adj Close': 91.683792,
 'Volume': 358539800}

Grabbing a value from the dictionary by its __key__

In [38]:
# Look up a single value in the dict by its column-name key
row.asDict()['Open']

101.330002

# Session  : 3 

### GroupBy and Aggregate functions

* **GroupBy** groups rows together based on the values of some column
* **Aggregate** combines multiple rows of data into a single output
    - it takes a dictionary as its argument, key:value --> column:operation


In [39]:
# Load the sales records (header row present, column types inferred) and preview them
sales_csv_path = '/home/bluepi/Documents/BluePi/Tutorial/sales_info.csv'
df = spark.read.csv(sales_csv_path, header=True, inferSchema=True)
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [40]:
# Inspect the column types Spark inferred for the sales data
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [41]:
# Count the number of rows for each company
df.groupBy("Company").count().show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [42]:
# agg() takes a {column: operation} dict; here: the average of "Sales" over all rows
df.agg( {'Sales':'avg'} ).show()

+-----------------+
|       avg(Sales)|
+-----------------+
|360.5833333333333|
+-----------------+



### Various Functions

In [43]:
from pyspark.sql.functions import countDistinct,avg,stddev

In [44]:
# Average of "Sales"; alias() gives the result column a readable name
df.select(avg('Sales').alias('Average Sales')).show()

+-----------------+
|    Average Sales|
+-----------------+
|360.5833333333333|
+-----------------+



In [45]:
# Standard deviation of "Sales", labelled "STD Sales" via alias()
df.select(stddev('Sales').alias('STD Sales')).show()

+------------------+
|         STD Sales|
+------------------+
|250.08742410799007|
+------------------+



In [46]:
# format_number(column_name, decimal_places) ---> formats a numeric column to a fixed number of decimals
from pyspark.sql.functions import format_number

# Step 1: compute the standard deviation; step 2: format it to 2 decimal places
sales_std = df.select(stddev('Sales').alias('STD Sales'))
sales_std.select(format_number('STD Sales', 2).alias('STD')).show()

+------+
|   STD|
+------+
|250.09|
+------+



In [47]:
# Sort by "Sales" in ascending order
df.orderBy('Sales').show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [48]:
# Sort in descending order:
# call desc() on the Column object itself rather than passing the column name
df.orderBy(df['Sales'].desc()).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+



# Session  : 4

### Dates and TimeStamp

In [54]:
# Reload the AAPL price history (header row present, column types inferred)
aapl_csv_path = '/home/bluepi/Documents/BluePi/Tutorial/AAPL.csv'
df = spark.read.csv(aapl_csv_path, header=True, inferSchema=True)

In [57]:
# Preview the reloaded price data
df.show()

+-------------------+----------+----------+----------+----------+----------+---------+
|               Date|      Open|      High|       Low|     Close| Adj Close|   Volume|
+-------------------+----------+----------+----------+----------+----------+---------+
|2014-09-29 00:00:00|100.589996|100.690002| 98.040001| 99.620003|  93.51429|142718700|
|2014-10-06 00:00:00| 99.949997|102.379997| 98.309998|100.730003| 94.556244|280258200|
|2014-10-13 00:00:00|101.330002|101.779999|     95.18| 97.669998| 91.683792|358539800|
|2014-10-20 00:00:00|     98.32|105.489998| 98.220001|105.220001| 98.771042|358532900|
|2014-10-27 00:00:00|104.849998|108.040001|104.699997|     108.0|101.380676|220230600|
|2014-11-03 00:00:00|108.220001|110.300003|107.720001|109.010002|102.328766|199952900|
|2014-11-10 00:00:00|109.019997|114.190002|108.400002|    114.18|107.646675|205166700|
|2014-11-17 00:00:00|114.269997|    117.57|113.300003|116.470001|109.805626|233414700|
|2014-11-24 00:00:00|116.849998|    119.75|

In [58]:
# First Row, first field: the "Date" value, returned as a Python datetime
df.head(1)[0][0]

datetime.datetime(2014, 9, 29, 0, 0)

In [52]:
from pyspark.sql.functions import dayofmonth,hour,dayofyear,month,year,weekofyear, \
                                    format_number,date_format

In [53]:
# year() extracts the year component of the "Date" timestamp column
df.select( year( df['Date'] ) ).show()

+----------+
|year(Date)|
+----------+
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2014|
|      2015|
|      2015|
|      2015|
|      2015|
|      2015|
|      2015|
+----------+
only showing top 20 rows



## Add new rows to Dataframe 

In [None]:
# union() appends the rows of another DataFrame, matching columns by position,
# so the new row must first be wrapped in a one-row DataFrame of its own.
# Reusing df.schema keeps the column order and types identical to df.
import datetime

# Illustrative values only -- replace with the real record to append.
# Field order follows the AAPL schema: Date, Open, High, Low, Close, Adj Close, Volume.
newRow = spark.createDataFrame(
    [(datetime.datetime(2018, 1, 1), 170.0, 175.0, 169.0, 174.0, 172.0, 100000000)],
    schema=df.schema,
)
appended = df.union(newRow)
appended.show()