# Independent Lesson Work Through

In [1]:
# imports for Spark - not this is the 'simple' import from the previous page
# see Environment Setup for more complex import
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()
# Using .getOrCreate will only create a new spark session once. 
# Subsequent calls to that method will re-use the existing session.

In [2]:
# now create a simple pandas df for comparision to Spark
import pandas as pd
import numpy as np

np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [3]:
# conver the pandas df to a Spark Df
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [4]:
# to get Spark to actually show values
df.show(5)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
+---+-----+
only showing top 5 rows



In [5]:
# Spark version of describe
df.describe()

DataFrame[summary: string, n: string, group: string]

In [6]:
# because it is Spark we have to explicitly show the describe
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [7]:
# get a more robust dataset
from pydataset import data

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)
# the (5) will show just 5 rows, change the value to see more/less

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [8]:
# compare seeing a column to pandas
mpg.hwy
# in pandas this would return a series of the data in the column, Spark returns just a column object

Column<b'hwy'>

In [9]:
# to see the data use .select and .show() together, returns 20 rows
mpg.select(mpg.hwy, mpg.cty, mpg.model).show()

+---+---+------------------+
|hwy|cty|             model|
+---+---+------------------+
| 29| 18|                a4|
| 29| 21|                a4|
| 31| 20|                a4|
| 30| 21|                a4|
| 26| 16|                a4|
| 26| 18|                a4|
| 27| 18|                a4|
| 26| 18|        a4 quattro|
| 25| 16|        a4 quattro|
| 28| 20|        a4 quattro|
| 27| 19|        a4 quattro|
| 25| 15|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 15|        a4 quattro|
| 24| 15|        a6 quattro|
| 25| 17|        a6 quattro|
| 23| 16|        a6 quattro|
| 20| 14|c1500 suburban 2wd|
| 15| 11|c1500 suburban 2wd|
+---+---+------------------+
only showing top 20 rows



In [10]:
# Our column objects support a numer of operations, including the arithmetic operators
mpg.hwy + 1

Column<b'(hwy + 1)'>

In [11]:
# To actually see this data, we'd need to select it and show the dataframe
mpg.select(mpg.hwy, mpg.hwy + 1).show(5)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
| 31|       32|
| 30|       31|
| 26|       27|
+---+---------+
only showing top 5 rows



In [12]:
# Once we have a column object, we can use the .alias method to rename it:
mpg.select(mpg.hwy.alias("highway_mileage")).show(5)

+---------------+
|highway_mileage|
+---------------+
|             29|
|             29|
|             31|
|             30|
|             26|
+---------------+
only showing top 5 rows



In [13]:
# we can also store column objects in variables and reference them
col1 = mpg.hwy.alias("highway_mileage")
col2 = (mpg.hwy / 2).alias("highway_mileage_halved")
mpg.select(col1, col2).show(5)

+---------------+----------------------+
|highway_mileage|highway_mileage_halved|
+---------------+----------------------+
|             29|                  14.5|
|             29|                  14.5|
|             31|                  15.5|
|             30|                  15.0|
|             26|                  13.0|
+---------------+----------------------+
only showing top 5 rows



# Exercises
Within your codeup-data-science directory, create a new repo named spark-exercises. This will be where you do your work for this module. Create a repository on GitHub with the same name, and link your local repository to GitHub.

Save this work in your spark-exercises repo. Then add, commit, and push your changes.

Create a jupyter notebook or python script named spark101 for this exercise.

In [14]:
import pyspark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data

spark = pyspark.sql.SparkSession.builder.getOrCreate()

1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [15]:
# Create pandas dataframe by columns using dictionary-like object
pd_df = pd.DataFrame({'language': ['Python', 'JavaScript', 'HTML', 'Java', 'C']})
# convert the pandas df to a Spark df
sp_df = spark.createDataFrame(pd_df)

In [16]:
# .printSchema()
sp_df.printSchema()

root
 |-- language: string (nullable = true)



In [17]:
sp_df.describe()

DataFrame[summary: string, language: string]

In [18]:
# output the shape of the df
sp_df.count(), sp_df.columns

(5, ['language'])

In [19]:
# 1st 5 records
sp_df.show()

+----------+
|  language|
+----------+
|    Python|
|JavaScript|
|      HTML|
|      Java|
|         C|
+----------+



2. Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:
    - The 1999 audi a4 has a 4 cylinder engine.
        - For each vehicle.

- Transform the trans column so that it only contains either manual or auto.

In [20]:
# pandas df
mpg_pd = data("mpg")
mpg_pd.head(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [21]:
# spark df
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [22]:
from pyspark.sql.functions import round, concat, sum, min, max, count, avg, mean
from pyspark.sql.functions import lit

#The 1999 audi a4 has a 4 cylinder engine
mpg.select(concat(lit("The "), mpg.year, mpg.manufacturer, mpg.model, 
                  lit("has a "), mpg.cyl, lit(" cylinder engine")).alias("Description")).show(10, truncate=False)

+---------------------------------------------+
|Description                                  |
+---------------------------------------------+
|The 1999audia4has a 4 cylinder engine        |
|The 1999audia4has a 4 cylinder engine        |
|The 2008audia4has a 4 cylinder engine        |
|The 2008audia4has a 4 cylinder engine        |
|The 1999audia4has a 6 cylinder engine        |
|The 1999audia4has a 6 cylinder engine        |
|The 2008audia4has a 6 cylinder engine        |
|The 1999audia4 quattrohas a 4 cylinder engine|
|The 1999audia4 quattrohas a 4 cylinder engine|
|The 2008audia4 quattrohas a 4 cylinder engine|
+---------------------------------------------+
only showing top 10 rows



In [23]:
from pyspark.sql.functions import when
# Transform the trans column so that it only contains either manual or auto.
mpg.select(mpg.trans, when(mpg.trans.startswith("a"), "auto").otherwise("manual")).show(10)

+----------+--------------------------------------------------------+
|     trans|CASE WHEN startswith(trans, a) THEN auto ELSE manual END|
+----------+--------------------------------------------------------+
|  auto(l5)|                                                    auto|
|manual(m5)|                                                  manual|
|manual(m6)|                                                  manual|
|  auto(av)|                                                    auto|
|  auto(l5)|                                                    auto|
|manual(m5)|                                                  manual|
|  auto(av)|                                                    auto|
|manual(m5)|                                                  manual|
|  auto(l5)|                                                    auto|
|manual(m6)|                                                  manual|
+----------+--------------------------------------------------------+
only showing top 10 

In [24]:
mpg = mpg.withColumn("trans", when(mpg.trans.startswith("a"), "auto").otherwise("manual")).show(6)

+------------+-----+-----+----+---+------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|   a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
+------------+-----+-----+----+---+------+---+---+---+---+-------+
only showing top 6 rows



3. Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [25]:
tips = spark.createDataFrame(data("tips"))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [26]:
# What percentage of observations are smokers?
print(tips.count(), "rows", len(tips.columns), "columns")

244 rows 7 columns


In [27]:
# What percentage of observations are smokers?
tips.groupBy("smoker").count().show()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



In [28]:
# What percentage of observations are smokers?
tips.filter(tips.smoker == "Yes").count()/tips.count()

0.38114754098360654

In [29]:
# Create a column that contains the tip percentage
tips = tips.withColumn('tip_percentage', (tips.tip/tips.total_bill))

In [34]:
# Calculate the average tip percentage for each combination of sex and smoker.
# tips.show()
tips.groupby("sex").pivot("smoker").mean("tip_percentage").show()

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941032|
|  Male|0.1606687151291298|0.15277117520248512|
+------+------------------+-------------------+



4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to farenheight.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low tempurature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [35]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [31]:
# cory - 

# temp = temp.withColumn("temp_max", round(temp.temp_max*9/5 + 32, 2))
# temp = temp.withColumn("temp_min", round(temp.temp_min*9/5 + 32, 2))
# temp.show()

SyntaxError: invalid syntax (<ipython-input-31-6bf4a8fb5fe9>, line 1)