# Spark API Mini Exercises

In [31]:
import pandas as pd
import numpy as np
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()

np.random.seed(13)

Copy the code below to create a pandas dataframe with 20 rows and 3 columns:

In [32]:
pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

1. Spark Dataframe Basics

    1. Use the starter code above to create a pandas dataframe.
    1. Convert the pandas dataframe to a spark dataframe. From this point
       forward, do all of your work with the spark dataframe, not the pandas
       dataframe.
    1. Show the first 3 rows of the dataframe.
    1. Show the first 7 rows of the dataframe.
    1. View a summary of the data using `.describe`.
    1. Use `.select` to create a new dataframe with just the `n` and `abool`
       columns. View the first 5 rows of this dataframe.
    1. Use `.select` to create a new dataframe with just the `group` and `abool`
       columns. View the first 5 rows of this dataframe.
    1. Use `.select` to create a new dataframe with the `group` column and the
       `abool` column renamed to `a_boolean_value`. Show the first 3 rows of
       this dataframe.
    1. Use `.select` to create a new dataframe with the `group` column and the
       `n` column renamed to `a_numeric_value`. Show the first 6 rows of this
       dataframe.

In [33]:
# A. Use the starter code above to create a pandas dataframe.
pandas_dataframe

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False
5,0.532338,y,False
6,1.350188,z,False
7,0.861211,x,False
8,1.478686,z,True
9,-1.045377,y,True


In [34]:
# B. Convert the pandas dataframe to a spark dataframe. From this point forward, 
# do all of your work with the spark dataframe, not the pandasdataframe.

df = spark.createDataFrame(pandas_dataframe)
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [35]:
# C. Show the first 3 rows of the dataframe.

df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [36]:
# D. Show the first 7 rows of the dataframe.

df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [37]:
# E. View a summary of the data using `.describe`.

df.describe()

DataFrame[summary: string, n: string, group: string]

In [38]:
# F. Use `.select` to create a new dataframe with just the `n` and `abool` columns. 
# View the first 5 rows of this dataframe.

df.select('n', 'abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [39]:
# G. Use `.select` to create a new dataframe with just the `group` and `abool` columns. 
# View the first 5 rows of this dataframe.

df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [40]:
# H. Use `.select` to create a new dataframe with the `group` column and the `abool` 
# column renamed to `a_boolean_value`. Show the first 3 rows of this dataframe.
col = df.abool
new_df = df.select('group', col.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [41]:
# I. Use `.select` to create a new dataframe with the `group` column and the `n` column 
# renamed to `a_numeric_value`. Show the first 6 rows of this dataframe.

col = df.n

newer_df = df.select('group', col.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



1. Column Manipulation

    1. Use the starter code above to re-create a spark dataframe. Store the
       spark dataframe in a varaible named `df`
    1. Use `.select` to add 4 to the `n` column. Show the results.
    1. Subtract 5 from the `n` column and view the results.
    1. Multiply the `n` column by 2. View the results along with the original
       numbers.
    1. Add a new column named `n2` that is the `n` value multiplied by -1. Show
       the first 4 rows of your dataframe. You should see the original `n` value
       as well as `n2`.
    1. Add a new column named `n3` that is the n value squared. Show the first 5
       rows of your dataframe. You should see both `n`, `n2`, and `n3`.
    1. What happens when you run the code below?

        ```python
        df.group + df.abool
        ```
        
    1. What happens when you run the code below? What is the difference between
       this and the previous code sample?

        ```python
        df.select(df.group + df.abool)
        ```
        
    1. Try adding various other columns together. What are the results of
       combining the different data types?

In [42]:
# A. Use the starter code above to re-create a spark dataframe. 
# Store the spark dataframe in a varaible named df

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

df = spark.createDataFrame(pandas_dataframe)
df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
| -0.8850620992868307|    x|false|
| 0.07272674611277782|    x| true|
|   -0.82751910119974|    x|false|
|  -0.591550921883219|    y|false|
|  -2.186215625579764|    y| true|
| -1.4304503608169532|    y| true|
|0.001182616633274...|    y| true|
|-0.15241870550020373|    z| true|
|  1.4418010391482912|    z|false|
|-0.18492174250837645|    x| true|
| -0.2298977614525365|    x|false|
| -0.2633773007599446|    y| true|
|-0.08286224131163808|    x| true|
|-0.38505688002448574|    x|false|
|-0.10105117176246801|    z| true|
| 0.33115422310589987|    y|false|
| 0.04329942581647413|    z|false|
|-0.44997653600849435|    z| true|
|  0.3471981374467132|    y|false|
|  1.4591000886192282|    z| true|
+--------------------+-----+-----+



In [51]:
# B. Use .select to add 4 to the n column. Show the results.

n_var = (df.n.cast('float')+4)

newest_df = df.select('*', n_var.alias('n_plus_4')).show(20)

+--------------------+-----+-----+---------+
|                   n|group|abool| n_plus_4|
+--------------------+-----+-----+---------+
| -0.8850620992868307|    x|false|3.1149378|
| 0.07272674611277782|    x| true|4.0727267|
|   -0.82751910119974|    x|false|3.1724808|
|  -0.591550921883219|    y|false|3.4084492|
|  -2.186215625579764|    y| true|1.8137844|
| -1.4304503608169532|    y| true|2.5695496|
|0.001182616633274...|    y| true|4.0011826|
|-0.15241870550020373|    z| true|3.8475814|
|  1.4418010391482912|    z|false| 5.441801|
|-0.18492174250837645|    x| true|3.8150783|
| -0.2298977614525365|    x|false|3.7701023|
| -0.2633773007599446|    y| true|3.7366228|
|-0.08286224131163808|    x| true|3.9171379|
|-0.38505688002448574|    x|false| 3.614943|
|-0.10105117176246801|    z| true| 3.898949|
| 0.33115422310589987|    y|false|4.3311543|
| 0.04329942581647413|    z|false| 4.043299|
|-0.44997653600849435|    z| true|3.5500236|
|  0.3471981374467132|    y|false| 4.347198|
|  1.45910

In [52]:
# C. Subtract 5 from the n column and view the results.

n_var = (df.n.cast('float')-5)

newest_df = df.select('*', n_var.alias('n_minus_5')).show(20)

+--------------------+-----+-----+----------+
|                   n|group|abool| n_minus_5|
+--------------------+-----+-----+----------+
| -0.8850620992868307|    x|false| -5.885062|
| 0.07272674611277782|    x| true|-4.9272733|
|   -0.82751910119974|    x|false| -5.827519|
|  -0.591550921883219|    y|false| -5.591551|
|  -2.186215625579764|    y| true|-7.1862154|
| -1.4304503608169532|    y| true|-6.4304504|
|0.001182616633274...|    y| true|-4.9988174|
|-0.15241870550020373|    z| true|-5.1524186|
|  1.4418010391482912|    z|false| -3.558199|
|-0.18492174250837645|    x| true|-5.1849217|
| -0.2298977614525365|    x|false| -5.229898|
| -0.2633773007599446|    y| true| -5.263377|
|-0.08286224131163808|    x| true|-5.0828624|
|-0.38505688002448574|    x|false| -5.385057|
|-0.10105117176246801|    z| true|-5.1010513|
| 0.33115422310589987|    y|false|-4.6688457|
| 0.04329942581647413|    z|false| -4.956701|
|-0.44997653600849435|    z| true|-5.4499764|
|  0.3471981374467132|    y|false|

In [53]:
# D. Multiply the n column by 2. View the results along with the original numbers.

n_var = (df.n.cast('float')*2)

newest_df = df.select('*', n_var.alias('n_times_2')).show(20)

+--------------------+-----+-----+------------+
|                   n|group|abool|   n_times_2|
+--------------------+-----+-----+------------+
| -0.8850620992868307|    x|false|  -1.7701242|
| 0.07272674611277782|    x| true|   0.1454535|
|   -0.82751910119974|    x|false|  -1.6550382|
|  -0.591550921883219|    y|false|  -1.1831019|
|  -2.186215625579764|    y| true|  -4.3724313|
| -1.4304503608169532|    y| true|  -2.8609006|
|0.001182616633274...|    y| true|0.0023652334|
|-0.15241870550020373|    z| true|  -0.3048374|
|  1.4418010391482912|    z|false|   2.8836021|
|-0.18492174250837645|    x| true| -0.36984348|
| -0.2298977614525365|    x|false| -0.45979553|
| -0.2633773007599446|    y| true|  -0.5267546|
|-0.08286224131163808|    x| true| -0.16572449|
|-0.38505688002448574|    x|false| -0.77011377|
|-0.10105117176246801|    z| true| -0.20210235|
| 0.33115422310589987|    y|false|  0.66230845|
| 0.04329942581647413|    z|false|  0.08659885|
|-0.44997653600849435|    z| true| -0.89

In [54]:
# E. Add a new column named n2 that is the n value multiplied by -1. 
# Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

n_var = (df.n.cast('float')*-1)

newest_df = df.select('*', n_var.alias('n_2')).show(4)

+-------------------+-----+-----+-----------+
|                  n|group|abool|        n_2|
+-------------------+-----+-----+-----------+
|-0.8850620992868307|    x|false|  0.8850621|
|0.07272674611277782|    x| true|-0.07272675|
|  -0.82751910119974|    x|false|  0.8275191|
| -0.591550921883219|    y|false| 0.59155095|
+-------------------+-----+-----+-----------+
only showing top 4 rows



In [56]:
# F. Add a new column named n3 that is the n value squared. 
# Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

n2 = (df.n.cast('float')*-1)
n3 = (df.n.cast('float')**2)

f_df = df.select('*', n2.alias('n2'), n3.alias('n3')).show(5)

+-------------------+-----+-----+-----------+--------------------+
|                  n|group|abool|         n2|                  n3|
+-------------------+-----+-----+-----------+--------------------+
|-0.8850620992868307|    x|false|  0.8850621|  0.7833349182067337|
|0.07272674611277782|    x| true|-0.07272675|0.005289180003196281|
|  -0.82751910119974|    x|false|  0.8275191|  0.6847878919561232|
| -0.591550921883219|    y|false| 0.59155095| 0.34993252199230085|
| -2.186215625579764|    y| true|  2.1862156|  4.7795388207082965|
+-------------------+-----+-----+-----------+--------------------+
only showing top 5 rows



In [57]:
# G. What happens when you run the code below?

df.group + df.abool

Column<'(group + abool)'>

In [59]:
# H. What happens when you run the code below? 
# What is the difference between this and the previous code sample?

df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [(cast(group#422 as double) + abool#423) AS (group + abool)#595]
+- LogicalRDD [n#421, group#422, abool#423], false


In [65]:
# I. Try adding various other columns together. What are the results of combining the different data types?

df.select(df.group + df.n)

DataFrame[(group + n): double]

1. Type casting

    1. Use the starter code above to re-create a spark dataframe.

    1. Use `.printSchema` to view the datatypes in your dataframe.

    1. Use `.dtypes` to view the datatypes in your dataframe.

    1. What is the difference between the two code samples below?

        ```python
        df.abool.cast('int')
        ```

        ```python
        df.select(df.abool.cast('int')).show()
        ```

    1. Use `.select` and `.cast` to convert the `abool` column to an integer
       type. View the results.
    1. Convert the `group` column to a integer data type and view the results.
       What happens?
    1. Convert the `n` column to a integer data type and view the results. What
       happens?
    1. Convert the `abool` column to a string data type and view the results.
       What happens?