In [68]:
import pyspark

# Get the active SparkSession, or create one if none exists yet.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global random state so the random group labels are reproducible.
np.random.seed(456)

# 20 rows: n counts 0..19, group is a random draw from the letters a, b, c.
pandas_dataframe = pd.DataFrame(
    {
        "n": np.arange(20),
        "group": np.random.choice(list("abc"), 20),
    }
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [3]:
# Convert the pandas frame into a Spark DataFrame.
df = spark.createDataFrame(pandas_dataframe)
# A bare Spark DataFrame displays only its schema, not its rows.
df

DataFrame[n: bigint, group: string]

In [4]:
# Print the DataFrame's rows as a text table.
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [5]:
# describe() returns another DataFrame of summary statistics; its rows are not printed until .show() is called.
df.describe()

DataFrame[summary: string, n: string, group: string]

In [6]:
# Print count, mean, stddev, min and max for each column.
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [7]:
from pydataset import data

# Load pydataset's "mpg" dataset and convert it into a Spark DataFrame.
mpg = spark.createDataFrame(data("mpg"))
# Preview the first 5 rows.
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [8]:
# Referencing a column yields a Column object, not the data itself.
mpg["hwy"]

Column<'hwy'>

In [9]:
# select() returns a new DataFrame restricted to the named columns.
mpg.select("hwy", "cty", "model")

DataFrame[hwy: bigint, cty: bigint, model: string]

In [10]:
# Show the first 10 rows of the three selected columns.
mpg.select("hwy", "cty", "model").show(n=10)

+---+---+----------+
|hwy|cty|     model|
+---+---+----------+
| 29| 18|        a4|
| 29| 21|        a4|
| 31| 20|        a4|
| 30| 21|        a4|
| 26| 16|        a4|
| 26| 18|        a4|
| 27| 18|        a4|
| 26| 18|a4 quattro|
| 25| 16|a4 quattro|
| 28| 20|a4 quattro|
+---+---+----------+
only showing top 10 rows



In [11]:
# Arithmetic on a Column builds a new Column expression; nothing is computed yet.
mpg["hwy"] + 1

Column<'(hwy + 1)'>

In [12]:
# Show hwy next to the derived (hwy + 1) expression for the first 5 rows.
mpg.select("hwy", mpg["hwy"] + 1).show(5)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
| 31|       32|
| 30|       31|
| 26|       27|
+---+---------+
only showing top 5 rows



In [13]:
# alias() renames the column in the result.
highway_column = mpg["hwy"].alias("highway_mileage")
mpg.select(highway_column).show(5)

+---------------+
|highway_mileage|
+---------------+
|             29|
|             29|
|             31|
|             30|
|             26|
+---------------+
only showing top 5 rows



In [14]:
# Replace the sample frame with 20 rows of random data:
# a normally distributed float, a letter from x/y/z, and a boolean.
pandas_dataframe = pd.DataFrame(
    dict(
        n=np.random.randn(20),
        group=np.random.choice(list("xyz"), 20),
        abool=np.random.choice([True, False], 20),
    )
)

In [15]:
# Rebuild the Spark DataFrame from the new pandas frame (this rebinds `df`).
df = spark.createDataFrame(pandas_dataframe)
# Display the schema.
df

DataFrame[n: double, group: string, abool: boolean]

In [16]:
# Preview the first 3 rows.
df.show(3)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 1.8356462888346827|    z| true|
|-1.8868228152944575|    x| true|
|0.42476258526975835|    y| true|
+-------------------+-----+-----+
only showing top 3 rows



In [17]:
# Preview the first 7 rows.
df.show(7)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 1.8356462888346827|    z| true|
|-1.8868228152944575|    x| true|
|0.42476258526975835|    y| true|
|-0.4331049843349663|    y| true|
|-0.1668376625912182|    x|false|
|  0.754634059814846|    y| true|
| 1.9669745181466287|    y|false|
+-------------------+-----+-----+
only showing top 7 rows



In [18]:
# Summary statistics as a DataFrame; the recorded schema lists n and group but not the boolean column abool.
df.describe()

DataFrame[summary: string, n: string, group: string]

In [19]:
# Show only the n and abool columns for the first 5 rows.
df.select("n", "abool").show(5)

+-------------------+-----+
|                  n|abool|
+-------------------+-----+
| 1.8356462888346827| true|
|-1.8868228152944575| true|
|0.42476258526975835| true|
|-0.4331049843349663| true|
|-0.1668376625912182|false|
+-------------------+-----+
only showing top 5 rows



In [20]:
# Keep the two-column projection under its own name, then preview it.
roar = df.select("n", "abool")
roar.show(5)

+-------------------+-----+
|                  n|abool|
+-------------------+-----+
| 1.8356462888346827| true|
|-1.8868228152944575| true|
|0.42476258526975835| true|
|-0.4331049843349663| true|
|-0.1668376625912182|false|
+-------------------+-----+
only showing top 5 rows



In [21]:
# Select group plus abool renamed to a_boolean_value; show 3 rows.
df.select("group", df["abool"].alias("a_boolean_value")).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|           true|
|    x|           true|
|    y|           true|
+-----+---------------+
only showing top 3 rows



In [22]:
# Select group plus n renamed to a_numeric_value; show 7 rows.
df.select("group", df["n"].alias("a_numeric_value")).show(7)

+-----+-------------------+
|group|    a_numeric_value|
+-----+-------------------+
|    z| 1.8356462888346827|
|    x|-1.8868228152944575|
|    y|0.42476258526975835|
|    y|-0.4331049843349663|
|    x|-0.1668376625912182|
|    y|  0.754634059814846|
|    y| 1.9669745181466287|
+-----+-------------------+
only showing top 7 rows



In [23]:
# Add 4 to every n value.
df.select(df["n"] + 4).show()

+------------------+
|           (n + 4)|
+------------------+
| 5.835646288834683|
|2.1131771847055427|
| 4.424762585269758|
| 3.566895015665034|
| 3.833162337408782|
| 4.754634059814846|
| 5.966974518146628|
| 4.200671078139285|
| 4.742751638585151|
| 2.735729301957435|
|  3.88721255597881|
| 4.667358048705604|
| 4.357899658498672|
| 3.786425092774556|
|2.7613805021428215|
| 3.574821590151561|
| 4.483427919286223|
| 4.348016122917654|
| 4.758573771323611|
| 4.212900837234731|
+------------------+



In [24]:
 # Subtract 5 from every n value.
 # NOTE(review): this cell begins with a stray leading space. IPython tolerates it,
 # but the same line in a plain .py script raises IndentationError — remove the space.
 df.select(df.n - 5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -3.164353711165317|
| -6.886822815294457|
| -4.575237414730242|
| -5.433104984334967|
|-5.1668376625912185|
| -4.245365940185154|
|-3.0330254818533713|
| -4.799328921860715|
| -4.257248361414849|
| -6.264270698042565|
|-5.1127874440211905|
| -4.332641951294396|
| -4.642100341501328|
| -5.213574907225444|
|-6.2386194978571785|
|-5.4251784098484395|
| -4.516572080713777|
| -4.651983877082346|
| -4.241426228676389|
| -4.787099162765269|
+-------------------+



In [25]:
# Show n alongside n doubled.
df.select("n", df["n"] * 2).show()

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  1.8356462888346827|  3.6712925776693655|
| -1.8868228152944575|  -3.773645630588915|
| 0.42476258526975835|  0.8495251705395167|
| -0.4331049843349663| -0.8662099686699326|
| -0.1668376625912182| -0.3336753251824364|
|   0.754634059814846|   1.509268119629692|
|  1.9669745181466287|  3.9339490362932574|
| 0.20067107813928464|  0.4013421562785693|
|  0.7427516385851509|  1.4855032771703018|
|  -1.264270698042565|   -2.52854139608513|
| -0.1127874440211902| -0.2255748880423804|
|  0.6673580487056044|   1.334716097411209|
| 0.35789965849867184|  0.7157993169973437|
|-0.21357490722544392|-0.42714981445088784|
| -1.2386194978571787| -2.4772389957143575|
| -0.4251784098484392| -0.8503568196968784|
|  0.4834279192862231|  0.9668558385724462|
| 0.34801612291765394|  0.6960322458353079|
|  0.7585737713236111|  1.5171475426472223|
| 0.21290083723473058| 0.4258016

In [26]:
# Add column n2 holding the negation of n; withColumn returns a new DataFrame, so rebind df.
negated_n = df["n"] * -1
df = df.withColumn("n2", negated_n)
df.show()

+--------------------+-----+-----+--------------------+
|                   n|group|abool|                  n2|
+--------------------+-----+-----+--------------------+
|  1.8356462888346827|    z| true| -1.8356462888346827|
| -1.8868228152944575|    x| true|  1.8868228152944575|
| 0.42476258526975835|    y| true|-0.42476258526975835|
| -0.4331049843349663|    y| true|  0.4331049843349663|
| -0.1668376625912182|    x|false|  0.1668376625912182|
|   0.754634059814846|    y| true|  -0.754634059814846|
|  1.9669745181466287|    y|false| -1.9669745181466287|
| 0.20067107813928464|    z|false|-0.20067107813928464|
|  0.7427516385851509|    x|false| -0.7427516385851509|
|  -1.264270698042565|    z|false|   1.264270698042565|
| -0.1127874440211902|    z| true|  0.1127874440211902|
|  0.6673580487056044|    x| true| -0.6673580487056044|
| 0.35789965849867184|    x|false|-0.35789965849867184|
|-0.21357490722544392|    x|false| 0.21357490722544392|
| -1.2386194978571787|    x| true|  1.2386194978

In [27]:
# Add column n3 holding n squared.
squared_n = df["n"] ** 2
df = df.withColumn("n3", squared_n)
df.show()

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  1.8356462888346827|    z| true| -1.8356462888346827|  3.3695972977125432|
| -1.8868228152944575|    x| true|  1.8868228152944575|  3.5601003363157027|
| 0.42476258526975835|    y| true|-0.42476258526975835| 0.18042325384504873|
| -0.4331049843349663|    y| true|  0.4331049843349663|  0.1875799274557914|
| -0.1668376625912182|    x|false|  0.1668376625912182|0.027834805658901174|
|   0.754634059814846|    y| true|  -0.754634059814846|  0.5694725642326366|
|  1.9669745181466287|    y|false| -1.9669745181466287|   3.868988755038162|
| 0.20067107813928464|    z|false|-0.20067107813928464| 0.04026888160158288|
|  0.7427516385851509|    x|false| -0.7427516385851509|  0.5516799966209267|
|  -1.264270698042565|    z|false|   1.264270698042565|  1.5983803979290347|

In [28]:
# Adding a string column to a boolean column still returns a Column expression here;
# the recorded output shows only the expression, since nothing is evaluated in this cell.
df.group + df.abool

Column<'(group + abool)'>

In [29]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)
 |-- n2: double (nullable = true)
 |-- n3: double (nullable = true)



In [30]:
# The same type information as a list of (column name, type name) tuples.
df.dtypes

[('n', 'double'),
 ('group', 'string'),
 ('abool', 'boolean'),
 ('n2', 'double'),
 ('n3', 'double')]

In [31]:
# cast() builds a Column expression that converts abool to an integer.
df["abool"].cast("int")

Column<'CAST(abool AS INT)'>

In [32]:
# Show abool as integers: true becomes 1, false becomes 0.
abool_as_int = df["abool"].cast("int")
df.select(abool_as_int).show()

+-----+
|abool|
+-----+
|    1|
|    1|
|    1|
|    1|
|    0|
|    1|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    1|
|    1|
|    1|
|    0|
+-----+



In [33]:
# Same projection as the previous cell: abool shown as 1/0 integers.
abool_as_int = df["abool"].cast("int")
df.select(abool_as_int).show()

+-----+
|abool|
+-----+
|    1|
|    1|
|    1|
|    1|
|    0|
|    1|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    1|
|    1|
|    1|
|    0|
+-----+



In [34]:
# Cast demonstrations. Each withColumn call overwrites the column of the same name,
# and each statement reads the df produced by the statement before it.
# boolean -> int
df = df.withColumn('abool', df.abool.cast('int'))
# string -> int: the next cell's output shows every group value as null after this cast
df = df.withColumn('group', df.group.cast('int'))
# double -> int: the next cell's output shows 1.83... as 1 and -1.88... as -1
df = df.withColumn('n', df.n.cast('int'))
# int -> string: abool is cast a second time, starting from the int column created above
df = df.withColumn('abool', df.abool.cast('string'))

In [35]:
# Inspect the result of the casts above.
df.show()

+---+-----+-----+--------------------+--------------------+
|  n|group|abool|                  n2|                  n3|
+---+-----+-----+--------------------+--------------------+
|  1| null|    1| -1.8356462888346827|  3.3695972977125432|
| -1| null|    1|  1.8868228152944575|  3.5601003363157027|
|  0| null|    1|-0.42476258526975835| 0.18042325384504873|
|  0| null|    1|  0.4331049843349663|  0.1875799274557914|
|  0| null|    0|  0.1668376625912182|0.027834805658901174|
|  0| null|    1|  -0.754634059814846|  0.5694725642326366|
|  1| null|    0| -1.9669745181466287|   3.868988755038162|
|  0| null|    0|-0.20067107813928464| 0.04026888160158288|
|  0| null|    0| -0.7427516385851509|  0.5516799966209267|
| -1| null|    0|   1.264270698042565|  1.5983803979290347|
|  0| null|    1|  0.1127874440211902|0.012721007528833114|
|  0| null|    1| -0.6673580487056044|  0.4453667651721519|
|  0| null|    0|-0.35789965849867184| 0.12809216555346592|
|  0| null|    0| 0.21357490722544392|0.

In [40]:
from pyspark.sql.functions import asc, desc, col, avg, concat, lit, mean
# Rebuild df from the pandas frame; this discards the n2/n3 columns and the casts applied to df above.
df = spark.createDataFrame(pandas_dataframe)
# Display the schema: back to n, group, abool.
df

DataFrame[n: double, group: string, abool: boolean]

In [41]:
# Row with the largest n: sort descending and keep the first row.
df.orderBy(df["n"].desc()).show(1)

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.9669745181466287|    y|false|
+------------------+-----+-----+
only showing top 1 row



In [42]:
# Row with the smallest n: sort ascending and keep the first row.
df.orderBy(df["n"].asc()).show(1)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|-1.8868228152944575|    x| true|
+-------------------+-----+-----+
only showing top 1 row



In [43]:
# Average of n over the whole DataFrame.
df.agg(mean(df["n"])).show()

+-------------------+
|             avg(n)|
+-------------------+
|0.15062100537706935|
+-------------------+



In [44]:
# Prefix every group value with the literal text "Group : ".
labelled_group = concat(lit('Group : '), df["group"])
df.select(labelled_group).show()

+-----------------------+
|concat(Group : , group)|
+-----------------------+
|              Group : z|
|              Group : x|
|              Group : y|
|              Group : y|
|              Group : x|
|              Group : y|
|              Group : y|
|              Group : z|
|              Group : x|
|              Group : z|
|              Group : z|
|              Group : x|
|              Group : x|
|              Group : x|
|              Group : x|
|              Group : z|
|              Group : z|
|              Group : x|
|              Group : y|
|              Group : x|
+-----------------------+



In [45]:
# Join group and n into one string column separated by ": ".
group_and_n = concat(df["group"], lit(': '), df["n"])
df.select(group_and_n).show()

+--------------------+
|concat(group, : , n)|
+--------------------+
|z: 1.835646288834...|
|x: -1.88682281529...|
|y: 0.424762585269...|
|y: -0.43310498433...|
|x: -0.16683766259...|
|y: 0.754634059814846|
|y: 1.966974518146...|
|z: 0.200671078139...|
|x: 0.742751638585...|
|z: -1.26427069804...|
|z: -0.11278744402...|
|x: 0.667358048705...|
|x: 0.357899658498...|
|x: -0.21357490722...|
|x: -1.23861949785...|
|z: -0.42517840984...|
|z: 0.483427919286...|
|x: 0.348016122917...|
|y: 0.758573771323...|
|x: 0.212900837234...|
+--------------------+



In [117]:
from pyspark.sql.functions import asc, desc, col, avg, concat, lit, when, mean, regexp_extract, regexp_replace, expr
# Rebuild df from the pandas frame. In the cells visible above, df has not been reassigned
# since the previous createDataFrame call, so this only matters if cells were run out of order.
df = spark.createDataFrame(pandas_dataframe)
# Display the schema.
df

DataFrame[n: double, group: string, abool: boolean]

In [47]:
# Use when and .otherwise to create a column that contains the text "It is true" when abool is true
# and "It is false" when abool is false.
# abool is already a boolean column, so it is used directly as the condition instead of
# being compared with the string 'true' (which only works through an implicit cast).

df.select(
    'abool',
    (when(df.abool, "It is true")
     .otherwise('It is false')
     .alias("What's the bool"))
).show()

+-----+---------------+
|abool|What's the bool|
+-----+---------------+
| true|     It is true|
| true|     It is true|
| true|     It is true|
| true|     It is true|
|false|    It is false|
| true|     It is true|
|false|    It is false|
|false|    It is false|
|false|    It is false|
|false|    It is false|
| true|     It is true|
| true|     It is true|
|false|    It is false|
|false|    It is false|
| true|     It is true|
| true|     It is true|
| true|     It is true|
| true|     It is true|
| true|     It is true|
|false|    It is false|
+-----+---------------+



In [48]:
# Clamp negative values: the new column is 0 where n < 0 and n itself everywhere else.

clamped_n = when(df["n"] < 0, 0).otherwise(df["n"])
df.select("n", clamped_n.alias("Numbers")).show()

+--------------------+-------------------+
|                   n|            Numbers|
+--------------------+-------------------+
|  1.8356462888346827| 1.8356462888346827|
| -1.8868228152944575|                0.0|
| 0.42476258526975835|0.42476258526975835|
| -0.4331049843349663|                0.0|
| -0.1668376625912182|                0.0|
|   0.754634059814846|  0.754634059814846|
|  1.9669745181466287| 1.9669745181466287|
| 0.20067107813928464|0.20067107813928464|
|  0.7427516385851509| 0.7427516385851509|
|  -1.264270698042565|                0.0|
| -0.1127874440211902|                0.0|
|  0.6673580487056044| 0.6673580487056044|
| 0.35789965849867184|0.35789965849867184|
|-0.21357490722544392|                0.0|
| -1.2386194978571787|                0.0|
| -0.4251784098484392|                0.0|
|  0.4834279192862231| 0.4834279192862231|
| 0.34801612291765394|0.34801612291765394|
|  0.7585737713236111| 0.7585737713236111|
| 0.21290083723473058|0.21290083723473058|
+----------

In [49]:
# Keep only the rows whose group is y (.filter and .where are interchangeable).

df.filter(df["group"] == "y").show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|0.42476258526975835|    y| true|
|-0.4331049843349663|    y| true|
|  0.754634059814846|    y| true|
| 1.9669745181466287|    y|false|
| 0.7585737713236111|    y| true|
+-------------------+-----+-----+



In [50]:
# Select just the rows where the abool column is false and view the results.
# abool is a boolean column, so negate it directly rather than comparing it
# with the string 'false' (which only works through an implicit cast).

df.where(~df.abool).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.1668376625912182|    x|false|
|  1.9669745181466287|    y|false|
| 0.20067107813928464|    z|false|
|  0.7427516385851509|    x|false|
|  -1.264270698042565|    z|false|
| 0.35789965849867184|    x|false|
|-0.21357490722544392|    x|false|
| 0.21290083723473058|    x|false|
+--------------------+-----+-----+



In [51]:
# Keep the rows whose group is anything other than y.

df.filter(df["group"] != "y").show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  1.8356462888346827|    z| true|
| -1.8868228152944575|    x| true|
| -0.1668376625912182|    x|false|
| 0.20067107813928464|    z|false|
|  0.7427516385851509|    x|false|
|  -1.264270698042565|    z|false|
| -0.1127874440211902|    z| true|
|  0.6673580487056044|    x| true|
| 0.35789965849867184|    x|false|
|-0.21357490722544392|    x|false|
| -1.2386194978571787|    x| true|
| -0.4251784098484392|    z| true|
|  0.4834279192862231|    z| true|
| 0.34801612291765394|    x| true|
| 0.21290083723473058|    x|false|
+--------------------+-----+-----+



In [52]:
# Keep the rows where n is strictly greater than zero.

df.filter(df["n"] > 0).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 1.8356462888346827|    z| true|
|0.42476258526975835|    y| true|
|  0.754634059814846|    y| true|
| 1.9669745181466287|    y|false|
|0.20067107813928464|    z|false|
| 0.7427516385851509|    x|false|
| 0.6673580487056044|    x| true|
|0.35789965849867184|    x|false|
| 0.4834279192862231|    z| true|
|0.34801612291765394|    x| true|
| 0.7585737713236111|    y| true|
|0.21290083723473058|    x|false|
+-------------------+-----+-----+



In [53]:
# Find the rows where abool is true and the group column is z.
# abool is a boolean column, so it is used directly as a condition instead of being
# compared with the string 'true'; both conditions are combined with & in one filter.

df.where(df.abool & (df.group == 'z')).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 1.8356462888346827|    z| true|
|-0.1127874440211902|    z| true|
|-0.4251784098484392|    z| true|
| 0.4834279192862231|    z| true|
+-------------------+-----+-----+



In [54]:
# Find the rows where abool is true or the group column is z.
# abool is a boolean column, so it is used directly as a condition instead of
# being compared with the string 'true'.

df.filter((df.group == "z") | df.abool).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
| 1.8356462888346827|    z| true|
|-1.8868228152944575|    x| true|
|0.42476258526975835|    y| true|
|-0.4331049843349663|    y| true|
|  0.754634059814846|    y| true|
|0.20067107813928464|    z|false|
| -1.264270698042565|    z|false|
|-0.1127874440211902|    z| true|
| 0.6673580487056044|    x| true|
|-1.2386194978571787|    x| true|
|-0.4251784098484392|    z| true|
| 0.4834279192862231|    z| true|
|0.34801612291765394|    x| true|
| 0.7585737713236111|    y| true|
+-------------------+-----+-----+



In [55]:
# Find the rows where abool is false and n is less than 1.
# abool is a boolean column, so negate it directly instead of comparing it with the
# string 'false'; both conditions are combined with & in one filter.

df.where(~df.abool & (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.1668376625912182|    x|false|
| 0.20067107813928464|    z|false|
|  0.7427516385851509|    x|false|
|  -1.264270698042565|    z|false|
| 0.35789965849867184|    x|false|
|-0.21357490722544392|    x|false|
| 0.21290083723473058|    x|false|
+--------------------+-----+-----+



In [56]:
# Find the rows where abool is false or n is less than 1.
# abool is a boolean column, so negate it directly instead of comparing it
# with the string 'false'.

df.filter(~df.abool | (df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.8868228152944575|    x| true|
| 0.42476258526975835|    y| true|
| -0.4331049843349663|    y| true|
| -0.1668376625912182|    x|false|
|   0.754634059814846|    y| true|
|  1.9669745181466287|    y|false|
| 0.20067107813928464|    z|false|
|  0.7427516385851509|    x|false|
|  -1.264270698042565|    z|false|
| -0.1127874440211902|    z| true|
|  0.6673580487056044|    x| true|
| 0.35789965849867184|    x|false|
|-0.21357490722544392|    x|false|
| -1.2386194978571787|    x| true|
| -0.4251784098484392|    z| true|
|  0.4834279192862231|    z| true|
| 0.34801612291765394|    x| true|
|  0.7585737713236111|    y| true|
| 0.21290083723473058|    x|false|
+--------------------+-----+-----+



In [57]:
# Order the rows by n, smallest first.

df.orderBy("n").show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.8868228152944575|    x| true|
|  -1.264270698042565|    z|false|
| -1.2386194978571787|    x| true|
| -0.4331049843349663|    y| true|
| -0.4251784098484392|    z| true|
|-0.21357490722544392|    x|false|
| -0.1668376625912182|    x|false|
| -0.1127874440211902|    z| true|
| 0.20067107813928464|    z|false|
| 0.21290083723473058|    x|false|
| 0.34801612291765394|    x| true|
| 0.35789965849867184|    x|false|
| 0.42476258526975835|    y| true|
|  0.4834279192862231|    z| true|
|  0.6673580487056044|    x| true|
|  0.7427516385851509|    x|false|
|   0.754634059814846|    y| true|
|  0.7585737713236111|    y| true|
|  1.8356462888346827|    z| true|
|  1.9669745181466287|    y|false|
+--------------------+-----+-----+



In [58]:
# Order by group, ascending here; the next cell shows the descending order.

df.orderBy(df["group"].asc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.2386194978571787|    x| true|
| 0.34801612291765394|    x| true|
| 0.21290083723473058|    x|false|
|  0.6673580487056044|    x| true|
|-0.21357490722544392|    x|false|
|  0.7427516385851509|    x|false|
| -1.8868228152944575|    x| true|
| -0.1668376625912182|    x|false|
| 0.35789965849867184|    x|false|
|  0.7585737713236111|    y| true|
| 0.42476258526975835|    y| true|
| -0.4331049843349663|    y| true|
|   0.754634059814846|    y| true|
|  1.9669745181466287|    y|false|
| 0.20067107813928464|    z|false|
|  -1.264270698042565|    z|false|
| -0.1127874440211902|    z| true|
|  1.8356462888346827|    z| true|
| -0.4251784098484392|    z| true|
|  0.4834279192862231|    z| true|
+--------------------+-----+-----+



In [59]:
# Order by group, descending.
df.orderBy(df["group"].desc()).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.264270698042565|    z|false|
| -0.1127874440211902|    z| true|
| 0.20067107813928464|    z|false|
|  1.8356462888346827|    z| true|
| -0.4251784098484392|    z| true|
|  0.4834279192862231|    z| true|
|  1.9669745181466287|    y|false|
| 0.42476258526975835|    y| true|
| -0.4331049843349663|    y| true|
|   0.754634059814846|    y| true|
|  0.7585737713236111|    y| true|
| -1.8868228152944575|    x| true|
| -0.1668376625912182|    x|false|
|  0.7427516385851509|    x|false|
| 0.34801612291765394|    x| true|
|  0.6673580487056044|    x| true|
|-0.21357490722544392|    x|false|
| -1.2386194978571787|    x| true|
| 0.21290083723473058|    x|false|
| 0.35789965849867184|    x|false|
+--------------------+-----+-----+



In [60]:
# Order by group, and by n within each group.

df.orderBy("group", "n").show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -1.8868228152944575|    x| true|
| -1.2386194978571787|    x| true|
|-0.21357490722544392|    x|false|
| -0.1668376625912182|    x|false|
| 0.21290083723473058|    x|false|
| 0.34801612291765394|    x| true|
| 0.35789965849867184|    x|false|
|  0.6673580487056044|    x| true|
|  0.7427516385851509|    x|false|
| -0.4331049843349663|    y| true|
| 0.42476258526975835|    y| true|
|   0.754634059814846|    y| true|
|  0.7585737713236111|    y| true|
|  1.9669745181466287|    y|false|
|  -1.264270698042565|    z|false|
| -0.4251784098484392|    z| true|
| -0.1127874440211902|    z| true|
| 0.20067107813928464|    z|false|
|  0.4834279192862231|    z| true|
|  1.8356462888346827|    z| true|
+--------------------+-----+-----+



In [61]:
# Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?
# Yes: the first column listed is the primary sort key; later columns only break ties.

df.orderBy("abool", "group", "n").show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|-0.21357490722544392|    x|false|
| -0.1668376625912182|    x|false|
| 0.21290083723473058|    x|false|
| 0.35789965849867184|    x|false|
|  0.7427516385851509|    x|false|
|  1.9669745181466287|    y|false|
|  -1.264270698042565|    z|false|
| 0.20067107813928464|    z|false|
| -1.8868228152944575|    x| true|
| -1.2386194978571787|    x| true|
| 0.34801612291765394|    x| true|
|  0.6673580487056044|    x| true|
| -0.4331049843349663|    y| true|
| 0.42476258526975835|    y| true|
|   0.754634059814846|    y| true|
|  0.7585737713236111|    y| true|
| -0.4251784098484392|    z| true|
| -0.1127874440211902|    z| true|
|  0.4834279192862231|    z| true|
|  1.8356462888346827|    z| true|
+--------------------+-----+-----+



In [62]:
# Create a spark data frame that contains your favorite programming languages.

languages = ['spark', 'SQL', 'Python', "java", 'HTML', 'Ruby']
# Keep the intermediate pandas frame under its own name so that `df` is only ever
# bound to a Spark DataFrame, never briefly to a pandas one.
languages_pandas = pd.DataFrame(languages, columns = ['fav_languages'])
df = spark.createDataFrame(languages_pandas)

In [67]:
# Print the dataframe's shape as (row count, column count).

n_rows, n_cols = df.count(), len(df.columns)
print((n_rows, n_cols))

(6, 1)


In [63]:
# Show the first 5 records in the dataframe
# NOTE(review): the recorded output below lists six rows, so it appears to be stale
# relative to show(5) — re-run the cell to confirm.

df.show(5)

+-------------+
|fav_languages|
+-------------+
|        spark|
|          SQL|
|       Python|
|         java|
|         HTML|
|         Ruby|
+-------------+



In [130]:
# Load the mpg dataset as a spark dataframe.
# `data` here is the loader imported from pydataset earlier in the notebook.

mpg = spark.createDataFrame(data('mpg'))

In [131]:
# Preview the mpg rows.
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [246]:
# Build one sentence per car, e.g. "The 1999 audi a4 has a 4 cylinder engine".

mpg.select(
    concat(
        lit('The '),
        'year',
        lit(' '),
        'manufacturer',
        lit(' '),
        'model',
        lit(' has a '),
        'cyl',
        lit(' cylinder engine'),
    )
).show(truncate=False)


+-----------------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  has a , cyl,  cylinder engine)|
+-----------------------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine                                     |
|The 1999 audi a4 has a 4 cylinder engine                                     |
|The 2008 audi a4 has a 4 cylinder engine                                     |
|The 2008 audi a4 has a 4 cylinder engine                                     |
|The 1999 audi a4 has a 6 cylinder engine                                     |
|The 1999 audi a4 has a 6 cylinder engine                                     |
|The 2008 audi a4 has a 6 cylinder engine                                     |
|The 1999 audi a4 quattro has a 4 cylinder engine                             |
|The 1999 audi a4 quattro has a 4 cylinder engine                             |
|The 2008 audi a4 quattro has a 4 cylind

In [133]:
# Transform the trans column so that it only contains either manual or auto.
# withColumn overwrites trans in place. The previous select('*', ... .alias("trans"))
# appended a second column with the same name, which made later references to
# `trans` ambiguous and the cell fail when re-run.

mpg = mpg.withColumn(
    "trans",
    when(mpg.trans.like("manual%"), "manual").otherwise("auto"),
)

In [152]:
# Load the tips dataset as a spark dataframe.
# `data` here is the loader imported from pydataset earlier in the notebook.

tips = spark.createDataFrame(data('tips'))

In [153]:
# Preview the tips rows.
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [154]:
# What fraction of observations are smokers?

countsBySmoker = tips.groupBy("smoker").count()
countsBySmoker.show()

# Compute the share from the data instead of retyping the counts printed above,
# so the answer stays correct if the dataset changes.
tips.filter(tips.smoker == "Yes").count() / tips.count()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



0.38114754098360654

In [155]:
# Create a column that contains the tip percentage (tip as a fraction of the total bill).
# withColumn replaces any existing tip_percentage column, so re-running this cell does not
# add a duplicate. The expression is no longer stored in a variable named `col`, which
# shadowed the `col` function imported from pyspark.sql.functions.

tips = tips.withColumn("tip_percentage", tips.tip / tips.total_bill)
tips.show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|0

In [156]:
# Average tip percentage for every sex/smoker combination:
# one row per sex, one column per smoker value.

(
    tips.groupBy("sex")
    .pivot("smoker")
    .agg(mean("tip_percentage"))
    .show()
)

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941035|
|  Male|0.1606687151291298| 0.1527711752024851|
+------+------------------+-------------------+



In [173]:
from vega_datasets import data

# NOTE(review): this import rebinds the notebook-level name `data`, which an
# earlier cell imported from pydataset; from here on `data` is the
# vega_datasets loader.
seattle_weather_raw = data.seattle_weather()
seattle = spark.createDataFrame(seattle_weather_raw)

In [174]:
# Print the leading rows of the Seattle weather Spark DataFrame as a sanity check.
seattle.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|     4.4|     2.2| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|     7.2|     2.8| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    10.0|     2.8| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|     9.4|     5.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|     6.1|     0.6| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|     6.1|    -1.1| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|     6.1|    -1.7| 1.9|    

In [190]:
# Convert the daily high temperature (temp_max) to Fahrenheit: F = C * 9 / 5 + 32.
# Only temp_max is converted; temp_min is left as-is.
# NOTE(review): assumes temp_max is recorded in Celsius — confirm against the dataset docs.

max_temp_fahrenheit = seattle.temp_max * 9 / 5 + 32
seattle = seattle.withColumn('fahrenheit', max_temp_fahrenheit)
seattle.show()

+-------------------+-------------+--------+--------+----+-------+------------------+-------------------+
|               date|precipitation|temp_max|temp_min|wind|weather|        fahrenheit|                 dt|
+-------------------+-------------+--------+--------+----+-------+------------------+-------------------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|             55.04|2012-01-01 00:00:00|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|             51.08|2012-01-02 00:00:00|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|             53.06|2012-01-03 00:00:00|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|             53.96|2012-01-04 00:00:00|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|             48.02|2012-01-05 00:00:00|
|2012-01-06 00:00:00|          2.5|     4.4|     2.2| 2.2|   rain|             39.92|2012-01-06 00:00:00|
|2012-01-07 00:00:00|          0.0|     7.2|  

In [191]:
from pyspark.sql.functions import to_timestamp

In [192]:
# Print the leading rows of `seattle` again.
# NOTE(review): the recorded output includes a `dt` column that none of the
# surrounding cells create — presumably it was added by a cell that has since
# been removed or run out of order (possibly using `to_timestamp`). Confirm and
# restore that step so the notebook reproduces under Restart & Run All.
seattle.show()

+-------------------+-------------+--------+--------+----+-------+------------------+-------------------+
|               date|precipitation|temp_max|temp_min|wind|weather|        fahrenheit|                 dt|
+-------------------+-------------+--------+--------+----+-------+------------------+-------------------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|             55.04|2012-01-01 00:00:00|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|             51.08|2012-01-02 00:00:00|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|             53.06|2012-01-03 00:00:00|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|             53.96|2012-01-04 00:00:00|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|             48.02|2012-01-05 00:00:00|
|2012-01-06 00:00:00|          2.5|     4.4|     2.2| 2.2|   rain|             39.92|2012-01-06 00:00:00|
|2012-01-07 00:00:00|          0.0|     7.2|  

In [193]:
# Which month has the most rain, on average?
# Answer: November (month 11 has the highest mean precipitation).

from pyspark.sql.functions import hour, mean, month, year

# Mean precipitation per calendar month, pooled over every year in the data.
monthly = (
    seattle
    .groupBy(month("date").alias("month"))
    .agg(mean("precipitation").alias("mean_prec"))
)

# Wettest month first. `desc` is not imported in this cell, so it must
# already be in scope from an earlier one.
monthly.sort(desc('mean_prec')).show()

+-----+-------------------+
|month|          mean_prec|
+-----+-------------------+
|   11|  5.354166666666667|
|   12|  5.021774193548388|
|    3|  4.888709677419355|
|   10|  4.059677419354839|
|    1| 3.7580645161290316|
|    2|  3.734513274336283|
|    4|  3.128333333333333|
|    9| 1.9624999999999997|
|    5| 1.6733870967741935|
|    8| 1.3201612903225806|
|    6| 1.1075000000000002|
|    7|0.38870967741935486|
+-----+-------------------+



In [194]:
# Which year was the windiest?
# Answer: 2012 (highest mean wind value of the four years).

# Mean of the `wind` column per calendar year.
yearly = (
    seattle
    .groupBy(year("date").alias("year"))
    .agg(mean("wind").alias("mean_wind_speed"))
)

# Windiest year first.
yearly.sort(desc('mean_wind_speed')).show()

+----+------------------+
|year|   mean_wind_speed|
+----+------------------+
|2012| 3.400819672131147|
|2014|3.3876712328767136|
|2015|  3.15972602739726|
|2013|3.0158904109589044|
+----+------------------+



In [202]:
# What is the most frequent type of weather in January?

# month() yields an integer, so compare against the integer 1 rather than the
# string '1'; this avoids relying on Spark's implicit string-to-number cast.
(
    seattle
    .where(month("date") == 1)
    .groupBy('weather')
    .count()
    .sort(desc('count'))  # most frequent weather type first
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [228]:
# What is the average high and low temperature on sunny days in July in 2013 and 2014?
# This cell reports the average low (temp_min), pooled over both years.

# month()/year() yield integers, so compare against integer literals.
july = seattle.where(month('date') == 7)
july_13_14 = july.filter(year('date').isin(2013, 2014))

# Build the condition from the DataFrame actually being filtered
# (`july_13_14`), not from its parent `july`, so the column reference does not
# depend on Spark resolving it through the parent's lineage.
july_13_14.where(july_13_14.weather == 'sun').agg(mean('temp_min')).show()

+-----------------+
|    avg(temp_min)|
+-----------------+
|14.18269230769231|
+-----------------+



In [229]:
# Average high (temp_max) on sunny days in July of 2013 and 2014.
# The condition is built from `july_13_14` itself rather than its parent
# `july`, so the column reference belongs to the DataFrame being filtered.
july_13_14.where(july_13_14.weather == 'sun').agg(mean('temp_max')).show()

+------------------+
|     avg(temp_max)|
+------------------+
|26.828846153846158|
+------------------+



In [249]:
# What percentage of days were rainy in q3 of 2015?
# Reported as a fraction of all Q3 (July-September) 2015 days; a day counts as
# rainy when its `weather` value is 'rain' or 'drizzle'.

q3 = seattle.filter(month('date').isin(7, 8, 9))
q3_2015 = q3.where(year('date') == 2015)
rainy_q3_2015 = q3_2015.filter(q3_2015.weather.isin('rain', 'drizzle'))

# Keep the computed ratio as the cell's last expression so it is the value
# displayed. Previously it was evaluated and discarded, and the cell showed a
# ratio built from hand-typed counts instead.
rainy_q3_2015.count() / q3_2015.count()

0.07608695652173914

In [232]:
# For each year, find what percentage of days it rained (had non-zero precipitation).
# First split the data into wet days (precipitation > 0) and dry days
# (precipitation == 0).

seattle_prec = seattle.filter(seattle['precipitation'] > 0)
seattle_no_prec = seattle.filter(seattle['precipitation'] == 0)

In [253]:
# Raw day counts per year: dry days first, then days with precipitation.
seattle_no_prec.groupBy(year('date')).count().show()
seattle_prec.groupBy(year('date')).count().show()

# The counts above do not by themselves give the per-year share of rainy days,
# so compute it directly: casting the boolean `precipitation > 0` to an
# integer (1 = rained, 0 = dry) and averaging it yields the fraction of days
# with rain in each year.
(
    seattle
    .groupBy(year('date').alias('year'))
    .agg(mean((seattle.precipitation > 0).cast('int')).alias('rainy_day_fraction'))
    .sort('year')
    .show()
)

+----------+-----+
|year(date)|count|
+----------+-----+
|      2015|  221|
|      2013|  213|
|      2014|  215|
|      2012|  189|
+----------+-----+

+----------+-----+
|year(date)|count|
+----------+-----+
|      2015|  144|
|      2013|  152|
|      2014|  150|
|      2012|  177|
+----------+-----+

