# Spark Fundamentals Exercise

In [1]:
import pandas as pd

import pyspark
from pyspark.sql.functions import col, exp, lit, concat, regexp_extract, regexp_replace, expr
from pyspark.sql.functions import round, sum, avg, min, max, count, mean
from pyspark.sql.functions import udf, year, month, quarter, asc, desc
from pyspark.sql.types import FloatType

from pydataset import data
from vega_datasets import data as vega_data

In [2]:
# Create a spark instance/session

spark = pyspark.sql.SparkSession.Builder().getOrCreate()

### 1. Create a spark data frame that contains your favorite programming languages.

    The name of the column should be language
    View the schema of the dataframe
    Output the shape of the dataframe
    Show the first 5 records in the dataframe

In [3]:
# Create a pandas dataframe object with a column named language.
df_langs = pd.DataFrame({'language':['Spark', 'Pandas', 'Numpy',
                                     'Python', 'PyTorch', 'Qiskit']})

# Pass the pandas dataframe as an argument to create a spark dataframe.
df = spark.createDataFrame(df_langs)

In [4]:
type(df)

pyspark.sql.dataframe.DataFrame

In [5]:
# View the schema of the programming language dataframe.
df.printSchema()

root
 |-- language: string (nullable = true)



In [6]:
# View the shape of the Spark dataframe.
df.count(), len(df.columns)

(6, 1)

In [7]:
# View the top 5 rows of the pandas dataframe.
df.show(5)

+--------+
|language|
+--------+
|   Spark|
|  Pandas|
|   Numpy|
|  Python|
| PyTorch|
+--------+
only showing top 5 rows



In [8]:
# Returns the first element from the column named 'language'
print(df.limit(1).collect())
print(df.head(1))

[Row(language='Spark')]
[Row(language='Spark')]


In [9]:
df.summary().show()

+-------+--------+
|summary|language|
+-------+--------+
|  count|       6|
|   mean|    null|
| stddev|    null|
|    min|   Numpy|
|    25%|    null|
|    50%|    null|
|    75%|    null|
|    max|   Spark|
+-------+--------+



In [10]:
df.toPandas()

Unnamed: 0,language
0,Spark
1,Pandas
2,Numpy
3,Python
4,PyTorch
5,Qiskit


### 2. Load the mpg dataset as a spark dataframe.

    Create 1 column of output that contains a message like the one below:

    The 1999 audi a4 has a 4 cylinder engine.

    Transform the `trans` column so it only contains 'manual' or 'auto'.

In [11]:
# Create a Spark dataframe using a pandas dataframe
# Data source is pydataset --- 'mpg'
mpg = spark.createDataFrame(data('mpg'))

In [12]:
# Look at the first few rows to understand which columns hold
# the information we need to make a descriptive sentence column.

mpg.show(2)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 2 rows



In [13]:
# Create a new column named `vehicle_summary` to store
# the summary of each vehicle.

# The .withColumn() function creates a new column
# for spark dataframes- similar to pandas .assign()
mpg = (
mpg.withColumn(colName='vehicle_summary',
               col=concat(lit('The '), # Strings that are not values in the spark dataframe need to be called with the lit() function
                          'year',  # Column names in a spark dataframe can be referenced by name. The function is called directly on the spark dataframe
                          lit(' '),
                          'manufacturer',
                          lit(' '),
                          'model',
                          lit(' has a '),
                          'cyl',
                          lit(' cylinder engine.'))
              )
)

In [14]:
# To display all the text in a column, we need to set truncate=False in .show()
# Truncate will truncate all strings longer than 20 characters by default.
mpg.select('vehicle_summary').show(n=5, truncate=False)

+-----------------------------------------+
|vehicle_summary                          |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



In [15]:
# Transform the trans column so that it only contains either manual or auto.
# Use the `.select()` function to grab the transmission column.

# I include the 'trans' column to compare the transformed column with the original.
mpg.select('trans',
           regexp_extract('trans', '(\w+)', 1).alias('transmission')
          ).show()

+----------+------------+
|     trans|transmission|
+----------+------------+
|  auto(l5)|        auto|
|manual(m5)|      manual|
|manual(m6)|      manual|
|  auto(av)|        auto|
|  auto(l5)|        auto|
|manual(m5)|      manual|
|  auto(av)|        auto|
|manual(m5)|      manual|
|  auto(l5)|        auto|
|manual(m6)|      manual|
|  auto(s6)|        auto|
|  auto(l5)|        auto|
|manual(m5)|      manual|
|  auto(s6)|        auto|
|manual(m6)|      manual|
|  auto(l5)|        auto|
|  auto(s6)|        auto|
|  auto(s6)|        auto|
|  auto(l4)|        auto|
|  auto(l4)|        auto|
+----------+------------+
only showing top 20 rows



In [16]:
# Use the `regexp_function()` to extract the transmission class: auto and manual
# from the `trans` column.

mpg = mpg.withColumn(colName='transmission',
                     col=regexp_extract('trans', '(\w+)', 1))

In [17]:
mpg.select('trans', 'transmission').show(3)

+----------+------------+
|     trans|transmission|
+----------+------------+
|  auto(l5)|        auto|
|manual(m5)|      manual|
|manual(m6)|      manual|
+----------+------------+
only showing top 3 rows



### 3. Load the tips dataset as a spark dataframe.

    What percentage of observations are smokers?
    Create a column that contains the tip percentage
    Calculate the average tip percentage for each combination of sex and smoker.

In [18]:
# Load the tips dataset from pydataset and pass the returned
# Dataframe into a spark dataframe
tips = spark.createDataFrame(data('tips'))

In [19]:
tips.count(), len(tips.columns)

(244, 7)

In [20]:
# Display information about the column dtypes
tips.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)



In [21]:
# Display the first 5 rows to understand what a single observation represents
# Row == Paying Customer
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [22]:
# Display descriptive statistics for each column.
tips.describe().show()

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950813|  2.99827868852459|  null|  null|null|  null| 2.569672131147541|
| stddev| 8.902411954856856|1.3836381890011817|  null|  null|null|  null|0.9510998047322345|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lunch|                 6|
+-------+------------------+------------------+------+------+----+------+------------------+



In [23]:
# number of smokers
tips.select(sum((tips.smoker == 'Yes').cast('int')).alias('num_smokers'),
            count(tips.smoker).alias('total_customers')).show()

+-----------+---------------+
|num_smokers|total_customers|
+-----------+---------------+
|         93|            244|
+-----------+---------------+



In [24]:
# What percentage of observations are smokers?

# To calcaulte the percentage of smokers, the output of
# the boolean expression  must be cast into an integer.
print("Percentage of customers who smoke")
tips.select(
    round(avg((col('smoker') == 'Yes').cast('int')), 2)
    .alias('pct_of_smokers')
).show()

Percentage of customers who smoke
+--------------+
|pct_of_smokers|
+--------------+
|          0.38|
+--------------+



In [25]:
# Use the withColumn function to create a new column using
# existing columns from the tips spark dataframe

tips = tips.withColumn(colName='tip_pct',
                       col=round(col('tip') / col('total_bill'), 4)
                      )

In [26]:
tips.show(2)

+----------+----+------+------+---+------+----+-------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_pct|
+----------+----+------+------+---+------+----+-------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 0.0594|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 0.1605|
+----------+----+------+------+---+------+----+-------+
only showing top 2 rows



In [27]:
# Calculate the average tip percentage for each combination of sex and smoker
# Groupby sex and smoker columns, and calculate the average tip per customer group
# Alias the calculation with an explicit name.
tips.groupBy('sex', 'smoker').agg(round(avg('tip_pct'),4).alias('avg_tip_pct')).sort('sex').show()

+------+------+-----------+
|   sex|smoker|avg_tip_pct|
+------+------+-----------+
|Female|    No|     0.1569|
|Female|   Yes|     0.1821|
|  Male|    No|     0.1607|
|  Male|   Yes|     0.1528|
+------+------+-----------+



### 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

    Convert the temperatures to fahrenheit.
    Which month has the most rain, on average?
    Which year was the windiest?
    What is the most frequent type of weather in January?
    What is the average high and low temperature on sunny days in July in 2013 and 2014?
    What percentage of days were rainy in q3 of 2015?
    For each year, find what percentage of days it rained (had non-zero precipitation).

In [28]:
# Assign the seattle weather dataset to a variable.
# Use .assign to change the type of the `date` column to a string.
weather = vega_data.seattle_weather().assign(date=lambda df: df.date.astype(str))

# Create a spark dataframe from the weather dataframe.
weather = spark.createDataFrame(weather)

In [29]:
# Display the shape of the spark dataframe.
weather.count(), len(weather.columns)

(1461, 6)

In [30]:
# Display the column types of the spark dataframe using .printSchema()
weather.printSchema()

root
 |-- date: string (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- temp_max: double (nullable = true)
 |-- temp_min: double (nullable = true)
 |-- wind: double (nullable = true)
 |-- weather: string (nullable = true)



In [31]:
# Display the first 3 rows of the dataframe to understand what an observation represents.
weather.show(3)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 3 rows



In [32]:
# Display descriptive statistics of each column using .describe().show()
weather.describe().show()

+-------+----------+-----------------+------------------+-----------------+------------------+-------+
|summary|      date|    precipitation|          temp_max|         temp_min|              wind|weather|
+-------+----------+-----------------+------------------+-----------------+------------------+-------+
|  count|      1461|             1461|              1461|             1461|              1461|   1461|
|   mean|      null| 3.02943189596167|16.439082819986307|8.234770704996578|3.2411362080766595|   null|
| stddev|      null|6.680194322314738| 7.349758097360178|5.023004179961266|1.4378250588746198|   null|
|    min|2012-01-01|              0.0|              -1.6|             -7.1|               0.4|drizzle|
|    max|2015-12-31|             55.9|              35.6|             18.3|               9.5|    sun|
+-------+----------+-----------------+------------------+-----------------+------------------+-------+



In [33]:
#C onvert the temperatures to farenheit.
# C = 5/9 x (F-32)

# Create a user defined function from pyspark.sql.functions.udf
# that converts temperatures from Celsius to Fahrenheit.
udf_temp_conversion = udf(lambda temp: temp * (9/5) + 32, FloatType())

# Transform the max temperature from Celsius to Fahrenheit
weather = weather.withColumn(colName='temp_max',
                             col=udf_temp_conversion(col('temp_max')))

# Transform the min temperature from Celsius to Fahrenheit
weather = weather.withColumn(colName='temp_min',
                             col=udf_temp_conversion(col('temp_min')))
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05|          1.3|   48.02|   37.04| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [34]:
# Which month has the most rain, on average?
print('October has the most rain on average.')
(
weather
.filter(col('weather') == lit('rain'))
.groupBy(month('date').alias('month'))
.agg(avg('precipitation').alias('avg_rain'))
.sort(col('avg_rain').desc())
.show(1)
)

October has the most rain on average.
+-----+--------+
|month|avg_rain|
+-----+--------+
|   10|   9.675|
+-----+--------+
only showing top 1 row



In [35]:
# Which year was the windiest?
print("2012 was the windiest year on average.")
(
weather
.groupBy(year('date').alias('year'))
.agg(avg('wind').alias('avg_wind'))
.sort(col('avg_wind').desc())
.show(1)
)

2012 was the windiest year on average.
+----+------------------+
|year|          avg_wind|
+----+------------------+
|2012|3.4008196721311483|
+----+------------------+
only showing top 1 row



In [36]:
# What is the most frequent type of weather in January?
print('The most frequent type of weather in January is Fog.')
(
weather.filter(month('date') == 1)
.groupBy('weather')
.agg(count('weather').alias('count'))
.sort(col('count').desc())
.show(1)
)

The most frequent type of weather in January is Fog.
+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
+-------+-----+
only showing top 1 row



In [37]:
# What is the average high and low temperature on sunny days in July in 2013 and 2014?
print("The average high and lows for 2013 and 2014.")
(
weather
    
# Filter the dataset for sunny weather conditions (.filter() is an alias for .where())
# .filter(col('weather') == 'sun')
.where(col('weather') == 'sun')
    
# in 2013 and 2014
.where(year('date').isin([2013, 2014]))

# in July 
.where(month('date') == 7)

# Group July weather data by year
.groupBy(year('date').alias('year'))
    
# Calculate the average max and min temperatures
.agg(round(avg('temp_max'), 2).alias('avg_max_temp'),
     round(avg('temp_min'), 2).alias('avg_min_temp'))
    
# Display the results.
.show()
)


The average high and lows for 2013 and 2014.
+----+------------+------------+
|year|avg_max_temp|avg_min_temp|
+----+------------+------------+
|2013|       79.85|       57.17|
|2014|       80.77|       57.92|
+----+------------+------------+



In [38]:
# What percentage of days were rainy in q3 of 2015?
print("The percentage of rainy days in Q3 of 2015.")
(
weather
    
# Filter 2015 temperature data
.filter(year('date') == 2015)
    
# In Q3 of 2015
.where(quarter('date') == 3)
    
# calculate the percentage of rain days
.select(round(avg((col('weather') == 'rain').cast('int')), 4)
        .alias('Q32015_pct_days_with_precipitation'))

# Display the results
.show()
)

The percentage of rainy days in Q3 of 2015.
+----------------------------------+
|Q32015_pct_days_with_precipitation|
+----------------------------------+
|                            0.0217|
+----------------------------------+



In [39]:
# For each year, find what percentage of days it rained (had non-zero precipitation).
print('The average percentage of days per year with non-zero precipitation.')
(
weather
    
# Select the years from the date column
.select(year('date').alias('year'),

# Create a Boolean series of non-zero precipitation days.
(col('precipitation') != 0).cast('int').alias('precipitation'))
    
# Group by the year column
.groupBy('year')
    
# Calculate the average number of days with percipitation for each year.
.agg(round(mean('precipitation'), 3).alias('avg_precipitation'))
    
# Display the results.
.show()
)

The average percentage of days per year with non-zero precipitation.
+----+-----------------+
|year|avg_precipitation|
+----+-----------------+
|2015|            0.395|
|2013|            0.416|
|2014|            0.411|
|2012|            0.484|
+----+-----------------+

