In [20]:
# importing and creating spark session
import pyspark
import pandas as pd
import numpy as np

from pydataset import data
from pyspark.sql.functions import lit
from pyspark.sql.functions import round, concat, sum, min, max, count, avg, mean, when, asc, desc, month, year, quarter
from pyspark.sql.functions import regexp_extract, regexp_replace
from vega_datasets import data

spark = pyspark.sql.SparkSession.builder.getOrCreate()

## Create a spark data frame that contains your favorite programming languages.
## The name of the column should be language.

In [17]:
# creating language dataframe
pd_df = pd.DataFrame({'Language':['python','ruby', 'java', 'c++', 'javascript']})

# converting to spark df
df = spark.createDataFrame(pd_df)

df

DataFrame[Language: string]

## View the schema of the dataframe

In [3]:
# printing schema
df.printSchema()

root
 |-- Language: string (nullable = true)



## Output the shape of the dataframe

In [4]:
# outputting df shape
print((df.count(), len(df.columns)))

(5, 1)


## Show the first 5 records in the dataframe

In [5]:
# displaying first 5 df records
df.show(5)

+----------+
|  Language|
+----------+
|    python|
|      ruby|
|      java|
|       c++|
|javascript|
+----------+



## Load the mpg dataset as a spark dataframe.

In [6]:
# loading mpg dataset
mpg = spark.createDataFrame(data("mpg"))

# displaying df
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



## Create 1 column of output that contains a message like the one below:

#### The 1999 audi a4 has a 4 cylinder engine.
####  For each vehicle.

In [7]:
# creating output column by concating strings with dat from df
mpg.select(concat(lit("The "), mpg.year, lit(' '), mpg.manufacturer, lit(' '), mpg.model, lit(' has a '), mpg.cyl, lit(' cylinder engine.')).alias("cylinders")).show(5, False)

+-----------------------------------------+
|cylinders                                |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



## Transform the trans column so that it only contains either manual or auto.

In [8]:
# transforming trans to only include specified characters
mpg.select("trans", 
              regexp_extract("trans", 
                             r"^(\w+)", 1).alias("trans_transformed")).show(truncate=False)

+----------+-----------------+
|trans     |trans_transformed|
+----------+-----------------+
|auto(l5)  |auto             |
|manual(m5)|manual           |
|manual(m6)|manual           |
|auto(av)  |auto             |
|auto(l5)  |auto             |
|manual(m5)|manual           |
|auto(av)  |auto             |
|manual(m5)|manual           |
|auto(l5)  |auto             |
|manual(m6)|manual           |
|auto(s6)  |auto             |
|auto(l5)  |auto             |
|manual(m5)|manual           |
|auto(s6)  |auto             |
|manual(m6)|manual           |
|auto(l5)  |auto             |
|auto(s6)  |auto             |
|auto(s6)  |auto             |
|auto(l4)  |auto             |
|auto(l4)  |auto             |
+----------+-----------------+
only showing top 20 rows



## Load the tips dataset as a spark dataframe.

In [9]:
# loading tips dataset
tips = spark.createDataFrame(data("tips"))

# displaying df
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



## What percentage of observations are smokers?

In [10]:
# calculating percentage of smokers 
tips.filter(tips.smoker == 'Yes').count() / tips.count()

0.38114754098360654

## Create a column that contains the tip percentage

In [11]:
# creating column that contains tip percentage
tips.select(tips.total_bill.alias('total_bill'), 
            tips.tip.alias('tip'),
            (tips.tip / tips.total_bill).alias('tip percentage')).show(3)

+----------+----+-------------------+
|total_bill| tip|     tip percentage|
+----------+----+-------------------+
|     16.99|1.01|0.05944673337257211|
|     10.34|1.66|0.16054158607350097|
|     21.01| 3.5|0.16658733936220846|
+----------+----+-------------------+
only showing top 3 rows



## Calculate the average tip percentage for each combination of sex and smoker.

In [12]:
# calculating avg tip percentage for each combination of sex and smoker
tips.groupBy("sex", "smoker").agg(avg(tips.tip)).show()

+------+------+------------------+
|   sex|smoker|          avg(tip)|
+------+------+------------------+
|  Male|    No|3.1134020618556697|
|  Male|   Yes|3.0511666666666666|
|Female|    No| 2.773518518518518|
|Female|   Yes|2.9315151515151516|
+------+------+------------------+



## Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [15]:
# from vega_datasets import data

# importing weather dataset 
weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))

# converting dataset to spark df
weather = spark.createDataFrame(weather)

# displaying df
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



## Convert the temperatures to farenheight.

In [16]:
# converting temp to farenheight using formula
weather = weather.withColumn("temp_max", round(weather.temp_max*9/5 + 32))
weater = weather.withColumn("temp_min", round(weather.temp_min*9/5 + 32))

# displaying df
weather.show()

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    55.0|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    51.0|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    53.0|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    54.0|     5.6| 4.7|   rain|
|2012-01-05|          1.3|    48.0|     2.8| 6.1|   rain|
|2012-01-06|          2.5|    40.0|     2.2| 2.2|   rain|
|2012-01-07|          0.0|    45.0|     2.8| 2.3|   rain|
|2012-01-08|          0.0|    50.0|     2.8| 2.0|    sun|
|2012-01-09|          4.3|    49.0|     5.0| 3.4|   rain|
|2012-01-10|          1.0|    43.0|     0.6| 3.4|   rain|
|2012-01-11|          0.0|    43.0|    -1.1| 5.1|    sun|
|2012-01-12|          0.0|    43.0|    -1.7| 1.9|    sun|
|2012-01-13|          0.0|    41.0|    -2.8| 1.3|    sun|
|2012-01-14|          4.1|    40.0|     0.6| 5.3|   snow|
|2012-01-15|  

## Which month has the most rain, on average?

In [29]:
# calculating average rainfall each month
# sorting by month
weather.withColumn("month", month("date")).groupBy("month")\
.agg(round(sum("precipitation"), 2).alias("total_rainfall")).sort("month").show()


+-----+--------------+
|month|total_rainfall|
+-----+--------------+
|    1|         466.0|
|    2|         422.0|
|    3|         606.2|
|    4|         375.4|
|    5|         207.5|
|    6|         132.9|
|    7|          48.2|
|    8|         163.7|
|    9|         235.5|
|   10|         503.4|
|   11|         642.5|
|   12|         622.7|
+-----+--------------+



## Which year was the windiest?

In [34]:
# calculating average wind each year
# sorting by year
weather.withColumn("year", year("date")).groupBy("year")\
.agg(round(sum("wind"), 2).alias("average_wind")).sort("year").show()

+----+------------+
|year|average_wind|
+----+------------+
|2012|      1244.7|
|2013|      1100.8|
|2014|      1236.5|
|2015|      1153.3|
+----+------------+



## What is the most frequent type of weather in January?

## What is the average high and low tempurature on sunny days in July in 2013 and 2014?

## What percentage of days were rainy in q3 of 2015?

## For each year, find what percentage of days it rained (had non-zero precipitation).