In [1]:
# Create a spark session
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
# Looking at the object
spark

In [3]:
# Create a pandas dataframe

import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

### Convert the pandas dataframe to a spark dataframe. 

In [4]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

### Show the first 3 rows of the dataframe.

In [5]:
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



### Show the first 7 rows of the dataframe.

In [6]:
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



### Output the shape of the dataframe

In [7]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



### Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [8]:
df.select('n', 'abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



### Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [9]:
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



### Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [10]:
df.select(df.group, df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



### Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [11]:
df.select(df.group, df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



## 2. Column Manipulation

In [12]:
# Convert to spark df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

### Use .select to add 4 to the n column. Show the results.

In [13]:
df.select(df.n + 4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
+------------------+
only showing top 5 rows



### Subtract 5 from the n column and view the results.

In [14]:
df.select(df.n - 5).show(5)

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
+-------------------+
only showing top 5 rows



### Multiply the n column by 2. View the results along with the original numbers.

In [15]:
df.select(df.n * 2).show(5)

+--------------------+
|             (n * 2)|
+--------------------+
|  -1.424781324101176|
|   1.507532757319406|
|-0.08900615667610691|
|  0.9036246774915795|
|  2.6902034169020195|
+--------------------+
only showing top 5 rows



### Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

In [16]:
n2 = (df.n * -1).alias('n2')
df = df.select('*', n2)
df.show(10)

+--------------------+-----+-----+--------------------+
|                   n|group|abool|                  n2|
+--------------------+-----+-----+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|
|   0.753766378659703|    x|false|  -0.753766378659703|
|-0.04450307833805...|    z|false|0.044503078338053455|
| 0.45181233874578974|    y|false|-0.45181233874578974|
|  1.3451017084510097|    z|false| -1.3451017084510097|
|  0.5323378882945463|    y|false| -0.5323378882945463|
|  1.3501878997225267|    z|false| -1.3501878997225267|
|  0.8612113741693206|    x|false| -0.8612113741693206|
|  1.4786857374358966|    z| true| -1.4786857374358966|
| -1.0453771305385342|    y| true|  1.0453771305385342|
+--------------------+-----+-----+--------------------+
only showing top 10 rows



### Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

In [17]:
n3 = (df.n ** 2).alias('n3')
df = df.select('*', n3)
df.show(5)

+--------------------+-----+-----+--------------------+--------------------+
|                   n|group|abool|                  n2|                  n3|
+--------------------+-----+-----+--------------------+--------------------+
|  -0.712390662050588|    z|false|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|    x|false|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|    z|false|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|    y|false|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|    z|false| -1.3451017084510097|  1.8092986060778251|
+--------------------+-----+-----+--------------------+--------------------+
only showing top 5 rows



## Type casting

### Use the starter code above to re-create a spark dataframe.

In [18]:
# Convert to spark df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

### Use .printSchema to view the datatypes in your dataframe.

In [19]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[n: double, group: string, abool: boolean]>

### Use .dtypes to view the datatypes in your dataframe.

In [20]:
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

### What is the difference between the two code samples below? 
- df.abool.cast('int') 
- df.select(df.abool.cast('int')).show()

In [21]:
# here it is being 'lazy'
df.abool.cast('int')

Column<'CAST(abool AS INT)'>

In [22]:
# here it is being explicit
df.select(df.abool.cast('int')).show()

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    1|
|    1|
|    0|
|    0|
|    0|
|    1|
|    0|
|    1|
+-----+



### Use .select and .cast to convert the abool column to an integer type. View the results.

In [23]:
df.select(df.abool.cast('int')).show(10)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    1|
|    1|
+-----+
only showing top 10 rows



### Convert the group column to a integer data type and view the results. What happens?

In [24]:
df.select(df.group.cast('int')).show(10)

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 10 rows



### Convert the n column to a integer data type and view the results. What happens?

In [25]:
df.select(df.n.cast('int')).show(10)

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
|  1|
|  0|
|  1|
|  0|
|  1|
| -1|
+---+
only showing top 10 rows



### Convert the abool column to a string data type and view the results. What happens?

In [26]:
df.select(df.abool.cast('string')).show(10)

+-----+
|abool|
+-----+
|false|
|false|
|false|
|false|
|false|
|false|
|false|
|false|
| true|
| true|
+-----+
only showing top 10 rows



## 4. Built-in Functions

### Use the starter code above to re-create a spark dataframe.

In [27]:
# Convert to spark df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

### Import the necessary functions from pyspark.sql.functions

In [28]:
from pyspark.sql.functions import min, avg

### Find the lowest n value.

In [29]:
df.select(min(df.n)).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



### Find the average n value.

In [30]:
df.select(avg(df.n)).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+



## 5. When / Otherwise

### Use the starter code above to re-create a spark dataframe.

In [31]:
# Convert to spark df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

## 6. Filter / Where

In [32]:
# Convert to spark df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

### Use .filter or .where to select just the rows where the group is y and view the results.

In [33]:
df.where(df.group == 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
| -1.0453771305385342|    y| true|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
+--------------------+-----+-----+



### Select just the columns where the abool column is false and view the results.

In [34]:
df.where(df.abool == 'false').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



### Find the columns where the group column is not y.

In [35]:
df.where(df.group != 'y').show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



### Find the columns where n is positive.

In [36]:
df.where(df.n % 2 == 0).show()

+---+-----+-----+
|  n|group|abool|
+---+-----+-----+
+---+-----+-----+



### Find the columns where abool is true AND the group column is z.

In [37]:
df.where(df.abool == 'true').where(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



### Find the columns where abool is true OR the group column is z.

In [38]:
df.where(df.abool == 'true').where(df.group == 'z').show()

+------------------+-----+-----+
|                 n|group|abool|
+------------------+-----+-----+
|1.4786857374358966|    z| true|
+------------------+-----+-----+



### Find the columns where abool is false and n is less than 1

In [39]:
df.where(df.abool == 'false').where(df.n < 1).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.8612113741693206|    x|false|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  0.6062886568962988|    x|false|
+--------------------+-----+-----+



## 7. Sorting

In [40]:
# Convert to spark df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

### Sort by the n value.

In [41]:
df.sort(df.n).show(10)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -0.712390662050588|    z|false|
|-0.24332625188556253|    y| true|
|-0.04450307833805...|    z|false|
|-0.02677164998644...|    x| true|
| 0.12730328020698067|    z|false|
| 0.31735092273633597|    x|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 10 rows



### Sort by the group value, both ascending and descending.

In [42]:
# Ascending
df.sort(df.group).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
|  0.8612113741693206|    x|false|
|   0.753766378659703|    x|false|
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
| -1.0453771305385342|    y| true|
|  0.5628467852810314|    y| true|
|  0.5323378882945463|    y|false|
|  2.1503829673811126|    y| true|
|  -1.261605945319069|    y|false|
|  0.9137407048596775|    y|false|
|-0.24332625188556253|    y| true|
| 0.45181233874578974|    y|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|  -0.712390662050588|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+



In [45]:
from pyspark.sql.functions import asc, desc

In [46]:
# Descending
df.sort(desc('group')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
| 0.12730328020698067|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
|  1.3451017084510097|    z|false|
|-0.04450307833805...|    z|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
| 0.45181233874578974|    y|false|
| 0.31735092273633597|    x|false|
|-0.02677164998644...|    x| true|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
| -0.7889890249515489|    x|false|
|  0.8612113741693206|    x|false|
+--------------------+-----+-----+



### Sort by the group value first, then, within each group, sort by n value.

In [47]:
df.sort(asc('group'), asc('n')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
|-0.02677164998644...|    x| true|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.5628467852810314|    y| true|
|  0.9137407048596775|    y|false|
|  2.1503829673811126|    y| true|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+



### Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?

In [48]:
df.sort(asc('abool'), asc('group'), asc('n')).show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.7889890249515489|    x|false|
| 0.31735092273633597|    x|false|
|  0.6062886568962988|    x|false|
|   0.753766378659703|    x|false|
|  0.8612113741693206|    x|false|
|  -1.261605945319069|    y|false|
| 0.45181233874578974|    y|false|
|  0.5323378882945463|    y|false|
|  0.9137407048596775|    y|false|
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
| 0.12730328020698067|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
|-0.02677164998644...|    x| true|
| -1.0453771305385342|    y| true|
|-0.24332625188556253|    y| true|
|  0.5628467852810314|    y| true|
|  2.1503829673811126|    y| true|
|  1.4786857374358966|    z| true|
+--------------------+-----+-----+

