In [1]:
import pandas as pd
import numpy as np
from pydataset import data
from pyspark.sql.functions import col, expr

np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)

import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
spark

In [3]:
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [4]:
df = spark.createDataFrame(pandas_dataframe)

In [5]:
df

DataFrame[n: bigint, group: string]

In [6]:
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [7]:
df.describe()

DataFrame[summary: string, n: string, group: string]

In [8]:
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [9]:
df.group

Column<b'group'>

In [10]:
type(df.group)

pyspark.sql.column.Column

In [11]:
df.select(df.group)

DataFrame[group: string]

In [12]:
df.select(df.group).show()

+-----+
|group|
+-----+
|    b|
|    b|
|    c|
|    a|
|    c|
|    c|
|    a|
|    b|
|    a|
|    b|
|    b|
|    a|
|    b|
|    a|
|    b|
|    b|
|    c|
|    c|
|    a|
|    c|
+-----+



In [13]:
df.select(df.n).show()

+---+
|  n|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [14]:
df.select(df.n + 1).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [15]:
df.n + 1

Column<b'(n + 1)'>

In [16]:
n_incremented = df.n + 1

In [17]:
n_incremented

Column<b'(n + 1)'>

In [18]:
df.select(n_incremented).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [19]:
df.select(n_incremented).describe().show()

+-------+-----------------+
|summary|          (n + 1)|
+-------+-----------------+
|  count|               20|
|   mean|             10.5|
| stddev|5.916079783099616|
|    min|                1|
|    max|               20|
+-------+-----------------+



In [20]:
df.select(n_incremented).describe().show()

+-------+-----------------+
|summary|          (n + 1)|
+-------+-----------------+
|  count|               20|
|   mean|             10.5|
| stddev|5.916079783099616|
|    min|                1|
|    max|               20|
+-------+-----------------+



In [21]:
df.describe().select('n', 'summary').show()

+-----------------+-------+
|                n|summary|
+-----------------+-------+
|               20|  count|
|              9.5|   mean|
|5.916079783099616| stddev|
|                0|    min|
|               19|    max|
+-----------------+-------+



In [22]:
df.show(5)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
+---+-----+
only showing top 5 rows



In [23]:
df.select('n').describe().show()

+-------+-----------------+
|summary|                n|
+-------+-----------------+
|  count|               20|
|   mean|              9.5|
| stddev|5.916079783099616|
|    min|                0|
|    max|               19|
+-------+-----------------+



In [24]:
df.select('group').show()

+-----+
|group|
+-----+
|    b|
|    b|
|    c|
|    a|
|    c|
|    c|
|    a|
|    b|
|    a|
|    b|
|    b|
|    a|
|    b|
|    a|
|    b|
|    b|
|    c|
|    c|
|    a|
|    c|
+-----+



In [25]:
mpg = spark.createDataFrame(data('mpg'))

In [26]:
mpg.select(mpg.model, 'manufacturer', mpg.hwy).show()

+------------------+------------+---+
|             model|manufacturer|hwy|
+------------------+------------+---+
|                a4|        audi| 29|
|                a4|        audi| 29|
|                a4|        audi| 31|
|                a4|        audi| 30|
|                a4|        audi| 26|
|                a4|        audi| 26|
|                a4|        audi| 27|
|        a4 quattro|        audi| 26|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 28|
|        a4 quattro|        audi| 27|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 25|
|        a6 quattro|        audi| 24|
|        a6 quattro|        audi| 25|
|        a6 quattro|        audi| 23|
|c1500 suburban 2wd|   chevrolet| 20|
|c1500 suburban 2wd|   chevrolet| 15|
+------------------+------------+---+
only showing top 20 rows



In [27]:
mpg.hwy

Column<b'hwy'>

In [28]:
mpg.select(mpg.model, 'manufacturer', mpg.hwy.alias('highway_mileage')).show()

+------------------+------------+---------------+
|             model|manufacturer|highway_mileage|
+------------------+------------+---------------+
|                a4|        audi|             29|
|                a4|        audi|             29|
|                a4|        audi|             31|
|                a4|        audi|             30|
|                a4|        audi|             26|
|                a4|        audi|             26|
|                a4|        audi|             27|
|        a4 quattro|        audi|             26|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             28|
|        a4 quattro|        audi|             27|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             25|
|        a6 quattro|        audi|             24|
|        a6 quattro|        audi|             25|


In [29]:
# Average of city and highway
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [30]:
avg_mileage_column = mpg.select((mpg.cty + mpg.hwy) / 2).alias('avg_mileage')

In [31]:
avg_mileage_column.show()

+-----------------+
|((cty + hwy) / 2)|
+-----------------+
|             23.5|
|             25.0|
|             25.5|
|             25.5|
|             21.0|
|             22.0|
|             22.5|
|             22.0|
|             20.5|
|             24.0|
|             23.0|
|             20.0|
|             21.0|
|             21.0|
|             20.0|
|             19.5|
|             21.0|
|             19.5|
|             17.0|
|             13.0|
+-----------------+
only showing top 20 rows



In [32]:
mpg.select('*').show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [33]:
from pyspark.sql.functions import col, expr

col

<function pyspark.sql.functions._create_function.<locals>._(col)>

In [34]:
col('mpg')

Column<b'mpg'>

In [35]:
mpg.select(col('hwy')).show()

+---+
|hwy|
+---+
| 29|
| 29|
| 31|
| 30|
| 26|
| 26|
| 27|
| 26|
| 25|
| 28|
| 27|
| 25|
| 25|
| 25|
| 25|
| 24|
| 25|
| 23|
| 20|
| 15|
+---+
only showing top 20 rows



In [36]:
test=col('hwy') * 2

In [37]:
mpg.select(col('hwy').alias('highway_mileage')).show(4)

+---------------+
|highway_mileage|
+---------------+
|             29|
|             29|
|             31|
|             30|
+---------------+
only showing top 4 rows



In [38]:
from pyspark.sql.functions import expr

In [39]:
mpg.select(expr('hwy AS highway_mileage')).show(4)

+---------------+
|highway_mileage|
+---------------+
|             29|
|             29|
|             31|
|             30|
+---------------+
only showing top 4 rows



In [40]:
mpg.select(expr('(hwy + cty) / 2 AS average_mileage')).show(3)

+---------------+
|average_mileage|
+---------------+
|           23.5|
|           25.0|
|           25.5|
+---------------+
only showing top 3 rows



In [41]:
mpg.select(expr('AVG(hwy)')).show()

+-----------------+
|         avg(hwy)|
+-----------------+
|23.44017094017094|
+-----------------+



In [42]:
mpg.createOrReplaceTempView('mpg')

In [43]:
mpg2 = spark.sql('''
SELECT 
    cty AS city,
    hwy AS highway,
    (cty + hwy) / 2 AS avg_mileage
FROM mpg
WHERE class = 'compact'
''')

In [44]:
mpg2.show(1)

+----+-------+-----------+
|city|highway|avg_mileage|
+----+-------+-----------+
|  18|     29|       23.5|
+----+-------+-----------+
only showing top 1 row



In [45]:
mpg2.select('city', 'highway').show(3)

+----+-------+
|city|highway|
+----+-------+
|  18|     29|
|  21|     29|
|  20|     31|
+----+-------+
only showing top 3 rows



In [46]:
mpg.select('class').describe().show(3)

+-------+-----+
|summary|class|
+-------+-----+
|  count|  234|
|   mean| null|
| stddev| null|
+-------+-----+
only showing top 3 rows



In [47]:
mpg.dtypes

[('manufacturer', 'string'),
 ('model', 'string'),
 ('displ', 'double'),
 ('year', 'bigint'),
 ('cyl', 'bigint'),
 ('trans', 'string'),
 ('drv', 'string'),
 ('cty', 'bigint'),
 ('hwy', 'bigint'),
 ('fl', 'string'),
 ('class', 'string')]

In [48]:
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [49]:
mpg.cyl.cast('string')

Column<b'CAST(cyl AS STRING)'>

In [50]:
mpg.select('*', mpg.cyl.cast('string').alias('cyl_string')).show(4)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|cyl_string|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|         4|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|         4|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|         4|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|         4|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+
only showing top 4 rows



In [51]:
spark.sql('SELECT *, CAST(cyl AS STRING) AS cyl_string FROM mpg').show(3)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|cyl_string|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|         4|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|         4|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|         4|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+
only showing top 3 rows



In [52]:
mpg.select(mpg.manufacturer.cast('double')).show(3)

+------------+
|manufacturer|
+------------+
|        null|
|        null|
|        null|
+------------+
only showing top 3 rows



In [53]:
min([1,2,3]), max([1,2,3])

(1, 3)

In [54]:
from pyspark.sql.functions import min, max

In [55]:
#mpg.show(3)

In [56]:
mpg.select(min(mpg.cyl).alias('min_cyl'), max(mpg.hwy)).show()

+-------+--------+
|min_cyl|max(hwy)|
+-------+--------+
|      4|      44|
+-------+--------+



In [57]:
spark.sql('SELECT MIN(cyl) AS min_cyl, MAX(hwy) FROM mpg').show()

+-------+--------+
|min_cyl|max(hwy)|
+-------+--------+
|      4|      44|
+-------+--------+



In [58]:
# Lesson day 2

In [59]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [60]:
textdf = spark.createDataFrame(
    pd.DataFrame(
        {
            "address": [
                "600 Navarro St ste 600, San Antonio, TX 78205",
                "3130 Broadway St, San Antonio, TX 78209",
                "303 Pearl Pkwy, San Antonio, TX 78215",
                "1255 SW Loop 410, San Antonio, TX 78227-6678",
            ]
        }
    )
)

textdf.show(truncate=False)


+---------------------------------------------+
|address                                      |
+---------------------------------------------+
|600 Navarro St ste 600, San Antonio, TX 78205|
|3130 Broadway St, San Antonio, TX 78209      |
|303 Pearl Pkwy, San Antonio, TX 78215        |
|1255 SW Loop 410, San Antonio, TX 78227-6678 |
+---------------------------------------------+



In [61]:
textdf.select(
    "address",
    regexp_extract('address', r'^(\d+)', 1).alias('street_number'),
    regexp_extract('address', r'^\d+\s([\w\s]+?),', 1).alias('street'),
    regexp_extract('address', r',(\s.\w+\s\w+,*\s\w*)', 1).alias('city_state'),
    regexp_extract('address', r'(\d+(-\d+)?)$', 1).alias('zip')
).show(truncate = False)

+---------------------------------------------+-------------+------------------+----------------+----------+
|address                                      |street_number|street            |city_state      |zip       |
+---------------------------------------------+-------------+------------------+----------------+----------+
|600 Navarro St ste 600, San Antonio, TX 78205|600          |Navarro St ste 600| San Antonio, TX|78205     |
|3130 Broadway St, San Antonio, TX 78209      |3130         |Broadway St       | San Antonio, TX|78209     |
|303 Pearl Pkwy, San Antonio, TX 78215        |303          |Pearl Pkwy        | San Antonio, TX|78215     |
|1255 SW Loop 410, San Antonio, TX 78227-6678 |1255         |SW Loop 410       | San Antonio, TX|78227-6678|
+---------------------------------------------+-------------+------------------+----------------+----------+



In [62]:
regexp_extract?

In [63]:
# Filtering

In [64]:
mpg = data('mpg')

In [65]:
mpg = spark.createDataFrame(mpg)

In [66]:
four_cylinder_filter = mpg.filter(mpg.cyl == 4)

In [67]:
mpg.createOrReplaceTempView('mpg')

In [68]:
spark.sql('''
    SELECT DISTINCT class
    FROM mpg
''').show()

+----------+
|     class|
+----------+
|subcompact|
|   compact|
|   minivan|
|       suv|
|   midsize|
|    pickup|
|   2seater|
+----------+



In [69]:
from pyspark.sql.functions import col

In [70]:
mpg.filter(mpg.cyl == 4).where(mpg["class"] == 'subcompact').show(4)

+------------+-----+-----+----+---+----------+---+---+---+---+----------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|     class|
+------------+-----+-----+----+---+----------+---+---+---+---+----------+
|       honda|civic|  1.6|1999|  4|manual(m5)|  f| 28| 33|  r|subcompact|
|       honda|civic|  1.6|1999|  4|  auto(l4)|  f| 24| 32|  r|subcompact|
|       honda|civic|  1.6|1999|  4|manual(m5)|  f| 25| 32|  r|subcompact|
|       honda|civic|  1.6|1999|  4|manual(m5)|  f| 23| 29|  p|subcompact|
+------------+-----+-----+----+---+----------+---+---+---+---+----------+
only showing top 4 rows



In [71]:
mpg.filter(mpg['class'] == 'minivan').where(mpg.cyl == 6).show(4)

+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|manufacturer|      model|displ|year|cyl|   trans|drv|cty|hwy| fl|  class|
+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|       dodge|caravan 2wd|  3.0|1999|  6|auto(l4)|  f| 17| 24|  r|minivan|
|       dodge|caravan 2wd|  3.3|1999|  6|auto(l4)|  f| 16| 22|  r|minivan|
|       dodge|caravan 2wd|  3.3|1999|  6|auto(l4)|  f| 16| 22|  r|minivan|
|       dodge|caravan 2wd|  3.3|2008|  6|auto(l4)|  f| 17| 24|  r|minivan|
+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
only showing top 4 rows



In [72]:
from pyspark.sql.functions import when

In [73]:
mpg.select(mpg.hwy, when(mpg.hwy > 25, 'good_mileage').alias('mpg_desc')).show(9)

+---+------------+
|hwy|    mpg_desc|
+---+------------+
| 29|good_mileage|
| 29|good_mileage|
| 31|good_mileage|
| 30|good_mileage|
| 26|good_mileage|
| 26|good_mileage|
| 27|good_mileage|
| 26|good_mileage|
| 25|        null|
+---+------------+
only showing top 9 rows



In [74]:
mpg.select(
    mpg.hwy,
    when(mpg.hwy > 25, 'good_mileage')
    .otherwise('bad_mileage')
    .alias("mpg_desc"),
).show(9)

+---+------------+
|hwy|    mpg_desc|
+---+------------+
| 29|good_mileage|
| 29|good_mileage|
| 31|good_mileage|
| 30|good_mileage|
| 26|good_mileage|
| 26|good_mileage|
| 27|good_mileage|
| 26|good_mileage|
| 25| bad_mileage|
+---+------------+
only showing top 9 rows



In [75]:
mpg.show(2)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 2 rows



In [76]:
mpg.select(
    mpg.displ,
    when(mpg.displ < 2, 'small')
    .when(mpg.displ < 3, 'medium')
    .otherwise('large')
    .alias('engine_size')
).show(8)

+-----+-----------+
|displ|engine_size|
+-----+-----------+
|  1.8|      small|
|  1.8|      small|
|  2.0|     medium|
|  2.0|     medium|
|  2.8|     medium|
|  2.8|     medium|
|  3.1|      large|
|  1.8|      small|
+-----+-----------+
only showing top 8 rows



In [77]:
# Mini-exercises

In [78]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


In [79]:
df = spark.createDataFrame(pandas_dataframe)

In [80]:
df.show(3), df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



(None, None)

In [81]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885216| null|
| stddev|0.8905322898155364| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



In [82]:
df.select('n', 'abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [83]:
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [84]:
practice = df.select(df.group, df.n.alias('a_numeric_value'))

In [85]:
practice.show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



In [86]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)
pandas_dataframe.head(3)

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False


In [87]:
df_2 = spark.createDataFrame(pandas_dataframe)

In [88]:
df_2.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [89]:
df_2.select(df_2.n + 4).show(3)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
+------------------+
only showing top 3 rows



In [90]:
df_2.select(df_2.n - 5).show(3)

+------------------+
|           (n - 5)|
+------------------+
|-5.712390662050588|
|-4.246233621340297|
|-5.044503078338053|
+------------------+
only showing top 3 rows



In [91]:
df_2.select(col('n') - 5).show(3)

+------------------+
|           (n - 5)|
+------------------+
|-5.712390662050588|
|-4.246233621340297|
|-5.044503078338053|
+------------------+
only showing top 3 rows



In [92]:
df_2.select(df_2.n * 2, df_2.n).show(3)

+--------------------+--------------------+
|             (n * 2)|                   n|
+--------------------+--------------------+
|  -1.424781324101176|  -0.712390662050588|
|   1.507532757319406|   0.753766378659703|
|-0.08900615667610691|-0.04450307833805...|
+--------------------+--------------------+
only showing top 3 rows



In [93]:
df_2.select(col('n') * 2, col('n')).show(3)

+--------------------+--------------------+
|             (n * 2)|                   n|
+--------------------+--------------------+
|  -1.424781324101176|  -0.712390662050588|
|   1.507532757319406|   0.753766378659703|
|-0.08900615667610691|-0.04450307833805...|
+--------------------+--------------------+
only showing top 3 rows



In [94]:
df.select(col('n'), col('n') * -1).alias('n2').show(4)

+--------------------+--------------------+
|                   n|            (n * -1)|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
+--------------------+--------------------+
only showing top 4 rows



In [95]:
n2 = col('n') * -1
df.select('n', n2)

DataFrame[n: double, (n * -1): double]

In [96]:
df.select('n', (col('n') * -1).alias('n2')).show(5)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
|  1.3451017084510097| -1.3451017084510097|
+--------------------+--------------------+
only showing top 5 rows



In [97]:
n3 = (col('n') ** 2).alias('n3')
df.select('n', n2, n3).show(3)

+--------------------+--------------------+--------------------+
|                   n|            (n * -1)|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [98]:
df.select('n', (col('n') * -1).alias('n2'), (col('n') ** 2).alias('n3')).show(3)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [99]:
df.group + df.abool

Column<b'(group + abool)'>

In [100]:
#df.select(df.group + df.abool).show()

In [101]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


In [102]:
df_3 = spark.createDataFrame(pandas_dataframe)

In [103]:
df_3.createOrReplaceTempView("my_df")

In [104]:
spark.sql('SELECT * FROM my_df').show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [105]:
spark.sql('''
    SELECT n, abool
    FROM my_df
    ''').show(7)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
|  0.5323378882945463|false|
|  1.3501878997225267|false|
+--------------------+-----+
only showing top 7 rows



In [106]:
spark.sql('''
    SELECT n, group AS g
    FROM my_df
    ''').show(7)

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  -0.712390662050588|  z|
|   0.753766378659703|  x|
|-0.04450307833805...|  z|
| 0.45181233874578974|  y|
|  1.3451017084510097|  z|
|  0.5323378882945463|  y|
|  1.3501878997225267|  z|
+--------------------+---+
only showing top 7 rows



In [107]:
spark.sql('''
    SELECT n, n/2 AS n2, n-1 AS n3
    FROM my_df
    ''').show(7)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -0.356195331025294|  -1.712390662050588|
|   0.753766378659703|  0.3768831893298515|-0.24623362134029703|
|-0.04450307833805...|-0.02225153916902...| -1.0445030783380536|
| 0.45181233874578974| 0.22590616937289487| -0.5481876612542103|
|  1.3451017084510097|  0.6725508542255049| 0.34510170845100974|
|  0.5323378882945463| 0.26616894414727316| -0.4676621117054537|
|  1.3501878997225267|  0.6750939498612634| 0.35018789972252673|
+--------------------+--------------------+--------------------+
only showing top 7 rows



In [108]:
#spark.sql(''' SELET * FROM my_df''')

In [109]:
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



In [110]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [111]:
df.abool.cast('int')

Column<b'CAST(abool AS INT)'>

In [112]:
df.select(df.abool.cast('int')).show(9)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
+-----+
only showing top 9 rows



In [113]:
df.select(df.abool, df.abool.cast('int')).show(9)

+-----+-----+
|abool|abool|
+-----+-----+
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
|false|    0|
| true|    1|
+-----+-----+
only showing top 9 rows



In [114]:
df.select(df.group, df.group.cast('int')).show(9)

+-----+-----+
|group|group|
+-----+-----+
|    z| null|
|    x| null|
|    z| null|
|    y| null|
|    z| null|
|    y| null|
|    z| null|
|    x| null|
|    z| null|
+-----+-----+
only showing top 9 rows



In [115]:
df.select(df.n, df.n.cast('int')).show(9)

+--------------------+---+
|                   n|  n|
+--------------------+---+
|  -0.712390662050588|  0|
|   0.753766378659703|  0|
|-0.04450307833805...|  0|
| 0.45181233874578974|  0|
|  1.3451017084510097|  1|
|  0.5323378882945463|  0|
|  1.3501878997225267|  1|
|  0.8612113741693206|  0|
|  1.4786857374358966|  1|
+--------------------+---+
only showing top 9 rows



In [116]:
df.select(df.abool, df.abool.cast('string')).show(9)

+-----+-----+
|abool|abool|
+-----+-----+
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
| true| true|
+-----+-----+
only showing top 9 rows



In [117]:
df.select(df.abool, df.abool.cast('string')).dtypes

[('abool', 'boolean'), ('abool', 'string')]

In [118]:
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean


In [119]:
df.select(max(df.n)).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



In [120]:
df.select(min(df.n)).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



In [121]:
df.select(avg(df.n)).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885216|
+------------------+



In [122]:
from pyspark.sql.functions import lit, round

df.select(concat(lit("Group: "), df.group,)).show(7)

+----------------------+
|concat(Group: , group)|
+----------------------+
|              Group: z|
|              Group: x|
|              Group: z|
|              Group: y|
|              Group: z|
|              Group: y|
|              Group: z|
+----------------------+
only showing top 7 rows



In [123]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [124]:
df.select(concat(df.group, lit(' : '), df.n)).show(7)

+---------------------+
|concat(group,  : , n)|
+---------------------+
| z : -0.7123906620...|
| x : 0.75376637865...|
| z : -0.0445030783...|
| y : 0.45181233874...|
| z : 1.34510170845...|
| y : 0.53233788829...|
| z : 1.35018789972...|
+---------------------+
only showing top 7 rows



In [125]:
df.select(concat(lit('Group: '), 'group'), concat('group', lit(': '), round(df.n, 3))).show(7)

+----------------------+------------------------------+
|concat(Group: , group)|concat(group, : , round(n, 3))|
+----------------------+------------------------------+
|              Group: z|                     z: -0.712|
|              Group: x|                      x: 0.754|
|              Group: z|                     z: -0.045|
|              Group: y|                      y: 0.452|
|              Group: z|                      z: 1.345|
|              Group: y|                      y: 0.532|
|              Group: z|                       z: 1.35|
+----------------------+------------------------------+
only showing top 7 rows



In [126]:
# __builtins__.min or __builtins__.print

In [140]:
df = spark.createDataFrame(pandas_dataframe)
df.createOrReplaceTempView("my_df")
my_df = spark.sql('SELECT * FROM my_df')

In [141]:
my_df.filter(my_df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



In [142]:
my_df.filter(my_df.abool == 'false').show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 4 rows



In [164]:
my_df.filter(my_df.group != 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [167]:
my_df.filter(my_df.n > 0).show()

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
| 0.8612113741693206|    x|false|
| 1.4786857374358966|    z| true|
| 0.5628467852810314|    y| true|
| 0.9137407048596775|    y|false|
|0.31735092273633597|    x|false|
|0.12730328020698067|    z|false|
| 2.1503829673811126|    y| true|
| 0.6062886568962988|    x|false|
+-------------------+-----+-----+



In [170]:
my_df.filter(my_df.abool == 'true').where(my_df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



In [180]:
my_df.filter((my_df.abool == 'true') | (my_df.group == 'z')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [205]:
my_df.filter(my_df.abool == 'false').where(my_df.n < 1).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



In [207]:
my_df.filter((my_df.abool == 'false') | (my_df.n < 1)).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [212]:
my_df.select(
    my_df.abool,
    (
        when(my_df.abool == 'true', 'It is true')
        .otherwise('It is false')
        .alias('True or False')
    ),
).show()

+-----+-------------+
|abool|True or False|
+-----+-------------+
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
|false|  It is false|
| true|   It is true|
| true|   It is true|
|false|  It is false|
|false|  It is false|
| true|   It is true|
| true|   It is true|
|false|  It is false|
|false|  It is false|
|false|  It is false|
| true|   It is true|
|false|  It is false|
| true|   It is true|
+-----+-------------+



In [209]:
my_df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [215]:
my_df.select(
    my_df.n,
    (
        when(my_df.n < 0, "0")
        .otherwise(round(my_df.n, 3))
        .alias('0 or n')
    ),
).show()

+--------------------+------+
|                   n|0 or n|
+--------------------+------+
|  -0.712390662050588|     0|
|   0.753766378659703| 0.754|
|-0.04450307833805...|     0|
| 0.45181233874578974| 0.452|
|  1.3451017084510097| 1.345|
|  0.5323378882945463| 0.532|
|  1.3501878997225267|  1.35|
|  0.8612113741693206| 0.861|
|  1.4786857374358966| 1.479|
| -1.0453771305385342|     0|
| -0.7889890249515489|     0|
|  -1.261605945319069|     0|
|  0.5628467852810314| 0.563|
|-0.24332625188556253|     0|
|  0.9137407048596775| 0.914|
| 0.31735092273633597| 0.317|
| 0.12730328020698067| 0.127|
|  2.1503829673811126|  2.15|
|  0.6062886568962988| 0.606|
|-0.02677164998644...|     0|
+--------------------+------+

