In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F

In [2]:
spark = SparkSession \
    .builder \
    .appName("Spark Training") \
    .getOrCreate()

#### 1. DataFrame Rows

In [9]:
row = Row(name='Robert', Age=35)
row, type(row)

(Row(name='Robert', Age=35), pyspark.sql.types.Row)

how do you access the individual elements of a row?

In [10]:
row.name

'Robert'

In [11]:
row.Age

35

- How can you check if a certain field exist in a row?
- How can you check if a certain field contains a certain value for the row?

In [12]:
'name' in row

True

In [13]:
'Robert' in row.name

True

#### 1.2.Creating a dataframe using rows
- you always pass in the schema
- then specify the row definition

In [14]:
schema = ['Name', 'Age'] #optional for spark dataframes but not rdds
data = [Row(Name='Alicia', Age=11), Row(Name='Thomas', Age=25)]

In [16]:
df = spark.createDataFrame(data=data)
# schema is optional in this case because the data field contains the names of fields
df = spark.createDataFrame(data=data, schema=schema)

In [17]:
df.show()

+------+---+
|  Name|Age|
+------+---+
|Alicia| 11|
|Thomas| 25|
+------+---+



#### 1.3 Creating rows through classes

In [29]:
Person = Row('Name', 'Age')  #this is our class from which we can create objects
p1 = Person('Alicia', 31)  #p1 objects
p2 = Person('Robert', 35)  #p2 object

In [32]:
data = [Person('Alicia', 31), Person('Robert', 35)]  
df = spark.createDataFrame(data=data)
df.show()

+------+---+
|  Name|Age|
+------+---+
|Alicia| 31|
|Robert| 35|
+------+---+



### 1.4 Important Row methods
- .count(): Returns the number of occurences of values
- .index(): Returns the first index of a value
- .asDict(): Returns the dictionary view of the row

In [34]:
person = Row(name='Alice', age=11, username='Alice')

In [40]:
person.count('Alice')

2

In [37]:
person.index('Alice')

0

In [38]:
person.asDict()

{'name': 'Alice', 'age': 11, 'username': 'Alice'}

### 2.0 Dataframe Columns
Column class doesnt need to be imported as it comes as a default in pyspark

In [28]:
path = "data/sparkify_log_small.json"

df = spark.read.json(path)

In [42]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



### 2.1 Accessing a single column using `select` clause

In [44]:
df.select(df.artist).show(5)   #cleaner syntax

+--------------------+
|              artist|
+--------------------+
|       Showaddywaddy|
|          Lily Allen|
|Cobra Starship Fe...|
|          Alex Smoke|
|                null|
+--------------------+
only showing top 5 rows



In [45]:
# alternatively, pass column name in square bracketts
df.select(df['artist']).show(5)

+--------------------+
|              artist|
+--------------------+
|       Showaddywaddy|
|          Lily Allen|
|Cobra Starship Fe...|
|          Alex Smoke|
|                null|
+--------------------+
only showing top 5 rows



In [49]:
# using F.col
df.select(F.col('artist')).show(5)

+--------------------+
|              artist|
+--------------------+
|       Showaddywaddy|
|          Lily Allen|
|Cobra Starship Fe...|
|          Alex Smoke|
|                null|
+--------------------+
only showing top 5 rows



### 2.2 selecting all columns 
- use *
- use F.col

In [48]:
df.select(F.col('*')).show(2) 

+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+-------------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Showaddywaddy|Logged In|  Kenneth|     M|          112|Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|   Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|      

### 2.3 Giving alias name to columns
`Notes`
- dataframe functions are applied to the left of the column
- column functions are applied to the right of the column

In [50]:
df.select(df.artist.alias('artist_name')).show(5)  

+--------------------+
|         artist_name|
+--------------------+
|       Showaddywaddy|
|          Lily Allen|
|Cobra Starship Fe...|
|          Alex Smoke|
|                null|
+--------------------+
only showing top 5 rows



#### 2.4 Ordering a column 
- asc()
- asc_nulls_first()
- asc_nulls_last()
- desc()
- desc_nulls_first()
- desc_nulls_last()

In [110]:
# sort by descending artist
df.orderBy(df.artist.asc())\
    .select(df.artist)\
    .distinct()\
    .show(10)

+--------------------+
|              artist|
+--------------------+
|      The Black Keys|
|        STRATOVARIUS|
|      The Chameleons|
|Dashboard Confess...|
|      Jarabe De Palo|
|        Ziggy Marley|
|        Yann Tiersen|
|  The Watts Prophets|
|            Goldfish|
|           Kate Nash|
+--------------------+
only showing top 10 rows



#### 2.5. Casting columns
converts columns from one type to another

In [64]:
df.dtypes

[('artist', 'string'),
 ('auth', 'string'),
 ('firstName', 'string'),
 ('gender', 'string'),
 ('itemInSession', 'bigint'),
 ('lastName', 'string'),
 ('length', 'double'),
 ('level', 'string'),
 ('location', 'string'),
 ('method', 'string'),
 ('page', 'string'),
 ('registration', 'bigint'),
 ('sessionId', 'bigint'),
 ('song', 'string'),
 ('status', 'bigint'),
 ('ts', 'bigint'),
 ('userAgent', 'string'),
 ('userId', 'string')]

#### 2.5.1 .cast
cast ts from bigint to string

In [74]:
new_col = df.select(df.ts.cast('string'))
new_col.dtypes

[('ts', 'string')]

### 2.6. Logical Operators
- between()
- contains()
- startswith()
- endswith()
- like()
- rlike()
- isin()

Note:
- when df.select() is used with a logical operator, it returns a columns with True and False as values
- Alternatively, when `df.filter()` or `df.where()` is used, it returns original values and only selection of columns that meet the condition

#### 2.6.1 .between()

In [87]:
# boolean returned when df.select used
df.select(df.status.between(0, 300)).show(5)

+-----------------------------------+
|((status >= 0) AND (status <= 300))|
+-----------------------------------+
|                               true|
|                               true|
|                               true|
|                               true|
|                               true|
+-----------------------------------+
only showing top 5 rows



In [90]:
# filter
df.filter(df.status.between(0, 300)).show(5)

+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|              artist|     auth|firstName|gender|itemInSession| lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       Showaddywaddy|Logged In|  Kenneth|     M|          112| Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|          Lily Allen|Logged In|Elizabeth|     F|            7|    Chase|195.23873| free|Shreveport-Bossie...|   PUT

In [91]:
# where
df.where(df.status.between(0, 300)).show(5)

+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|              artist|     auth|firstName|gender|itemInSession| lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       Showaddywaddy|Logged In|  Kenneth|     M|          112| Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|          Lily Allen|Logged In|Elizabeth|     F|            7|    Chase|195.23873| free|Shreveport-Bossie...|   PUT

In [93]:
# alternatively, use the booleans to our advantage
df[(df.status.between(0, 300))].show(5)

+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|              artist|     auth|firstName|gender|itemInSession| lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+--------------------+---------+---------+------+-------------+---------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|       Showaddywaddy|Logged In|  Kenneth|     M|          112| Matthews|232.93342| paid|Charlotte-Concord...|   PUT|NextSong|1509380319284|     5132|Christmas Tears W...|   200|1513720872284|"Mozilla/5.0 (Win...|  1046|
|          Lily Allen|Logged In|Elizabeth|     F|            7|    Chase|195.23873| free|Shreveport-Bossie...|   PUT

#### 2.6.2 .contains()

In [103]:
df.where(df.firstName.contains('So'))\
    .select('firstName')\
    .distinct()\
    .show(5)

+---------+
|firstName|
+---------+
|   Sophee|
|   Sophia|
|    Soren|
+---------+



#### 2.6.3 .like()

In [104]:
df.filter(df.firstName.like('%rd%'))\
    .select('firstname')\
    .distinct()\
    .show()

+---------+
|firstname|
+---------+
|  Gerardo|
|   Jordan|
|  Edgardo|
|   Edward|
|   Jordyn|
+---------+



#### 2.6.4 .startswith()

In [106]:
df.where(df.firstName.startswith('L'))\
    .select('firstName')\
    .distinct()\
    .show(5)

+---------+
|firstName|
+---------+
|    Lucas|
|   Lennox|
|    Litzy|
|     Luna|
|    Lance|
+---------+
only showing top 5 rows



#### 2.6.5 .isin()

In [109]:
df[df.firstName.isin('Lucas','Lennox','Litzy')]\
    .select('firstname')\
    .distinct()\
    .show()

+---------+
|firstname|
+---------+
|    Lucas|
|   Lennox|
|    Litzy|
+---------+



### 2.7 Missing value Operations
- eqNullSafe()
- isNull()
- isNotNull()

#### 2.7.1 .eqNullSafe()
Assignments in pyspark do not check the Null or None Values. To check them use this function

In [112]:
df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: long (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [121]:
df.select(df.gender == 'M').distinct().show()

+------------+
|(gender = M)|
+------------+
|        null|
|        true|
|       false|
+------------+



Notice the problem:
- We are searching for distinct values where Gender = M
- We should only get two values: true--> when gender is M and False otherwise. But we get null too
- This does not check null too, ideally, if gender is null, then it should return False

In [126]:
# to correct above
df.select(df.gender.eqNullSafe('M')).distinct().show()

+--------------+
|(gender <=> M)|
+--------------+
|          true|
|         false|
+--------------+



In [130]:
# let's compare what both approaches
df.select(df.gender =='M', df.gender.eqNullSafe('M')).distinct().show()

+------------+--------------+
|(gender = M)|(gender <=> M)|
+------------+--------------+
|        true|          true|
|       false|         false|
|        null|         false|
+------------+--------------+



#### 2.7.2 .isNull()

In [131]:
df.select(df.gender, df.gender.isNull())\
    .distinct()\
    .show()

+------+----------------+
|gender|(gender IS NULL)|
+------+----------------+
|  null|            true|
|     F|           false|
|     M|           false|
+------+----------------+



#### 2.7.3 .isNotNull()

In [132]:
df.select(df.gender, df.gender.isNotNull())\
    .distinct()\
    .show()

+------+--------------------+
|gender|(gender IS NOT NULL)|
+------+--------------------+
|     M|                true|
|     F|                true|
|  null|               false|
+------+--------------------+



### 2.8 .substr()

In [137]:
df.filter(df.firstName.substr(1,4) == 'Eliz').show(4)

+----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|    artist|     auth|firstName|gender|itemInSession|lastName|   length|level|            location|method|    page| registration|sessionId|                song|status|           ts|           userAgent|userId|
+----------+---------+---------+------+-------------+--------+---------+-----+--------------------+------+--------+-------------+---------+--------------------+------+-------------+--------------------+------+
|Lily Allen|Logged In|Elizabeth|     F|            7|   Chase|195.23873| free|Shreveport-Bossie...|   PUT|NextSong|1512718541284|     5027|       Cheryl Tweedy|   200|1513720878284|"Mozilla/5.0 (Win...|  1000|
|Kyle Cease|Logged In|Elizabeth|     F|            0| Wheeler|271.20281| paid|Miami-Fort Lauder...|   PUT|NextSong|1508260038284|     5458| Generation Nintendo|

#### 2.9 getField()

Gets a field by name in the StructField

In [40]:
df1 = spark.createDataFrame([Row(classLetter=Row(a1=1, a2='b')), Row(classLetter=Row(a1=2, a2='bb'))])

df1.printSchema()

root
 |-- classLetter: struct (nullable = true)
 |    |-- a1: long (nullable = true)
 |    |-- a2: string (nullable = true)



In [12]:
df1.show()

+-----------+
|classLetter|
+-----------+
|     {1, b}|
|    {2, bb}|
+-----------+



There's two way's to extract the first (or second) element in of classLetter
- Directly using dot notation i.e .classLetter.a1 (a1 comes from the schema)
- Directly using .getField()

In [16]:
#method 1
df1.select(df1.classLetter.a1).show(), df1.select(df1.classLetter.a2).show()

+--------------+
|classLetter.a1|
+--------------+
|             1|
|             2|
+--------------+

+--------------+
|classLetter.a2|
+--------------+
|             b|
|            bb|
+--------------+



(None, None)

In [19]:
# method 2 - might be useful if you want to create a function that takes in 'a1' as argument
df1.select(df1.classLetter.getField('a1')).show(), df1.select(df1.classLetter.getField('a2')).show()

+--------------+
|classLetter.a1|
+--------------+
|             1|
|             2|
+--------------+

+--------------+
|classLetter.a2|
+--------------+
|             b|
|            bb|
+--------------+



(None, None)

#### 2.91 getItem()
- Similar to getField but gets an item at position n out of a list, or gets the item by key out of a dict

Alternatively:
- Use slicing to get it

In [36]:
df2 = spark.createDataFrame([([1,2], {'Key': 'value'})], ['lst', 'dict'])

In [38]:
df2.show(5)

+------+--------------+
|   lst|          dict|
+------+--------------+
|[1, 2]|{Key -> value}|
+------+--------------+



In [48]:
# Using getItem()
df2.select(df2.lst.getItem(1), df2.dict.getItem('Key')).show()  #zero based indexing applies

+------+---------+
|lst[1]|dict[Key]|
+------+---------+
|     2|    value|
+------+---------+



In [50]:
# using slicing
df2.select(df2.lst[1], df2.dict['Key']).show()

+------+---------+
|lst[1]|dict[Key]|
+------+---------+
|     2|    value|
+------+---------+



### 3. Conditional Statements 
- `when(), otherwise()`: Similar to if-else statements in SQL. Using this, we can check multiple conditions in sequence and return a value when the first condition is met
- when must be imported from functions

In [56]:
df.select('page').distinct().show()

+----------------+
|            page|
+----------------+
|Submit Downgrade|
|            Home|
|       Downgrade|
|          Logout|
|   Save Settings|
|           About|
|        Settings|
|           Login|
|        NextSong|
|            Help|
|         Upgrade|
|           Error|
|  Submit Upgrade|
+----------------+



Task:
- Derive a new field from page, where the new field has the following values in place of the originals

- --> 'Downgrading' for 'Submit Downgrade' and 'Downgrade'
- --> 'Upgrading' for ['Submit Upgrade', 'Upgrade']
- --> 'logging out' for Logout
- --> otherwise leave original values

In [70]:
#only first instance of when is aliased, other instances arent
df.select(df.page, \
         F.when(df.page.isin('Submit Downgrade', 'Downgrade'), 'Downgrading')\
         .when(df.page.isin('Submit Upgrade', 'Upgrade'), 'Upgrading')\
         .when(df.page == 'Logout', 'logging out')\
         .otherwise(df.page).alias('method_ps'))\
         .distinct().show(10)

+----------------+-------------+
|            page|    method_ps|
+----------------+-------------+
|  Submit Upgrade|    Upgrading|
|   Save Settings|Save Settings|
|           About|        About|
|          Logout|  logging out|
|            Home|         Home|
|       Downgrade|  Downgrading|
|            Help|         Help|
|        Settings|     Settings|
|Submit Downgrade|  Downgrading|
|           Login|        Login|
+----------------+-------------+
only showing top 10 rows

