# Spark API

## Create a Spark DataFrame that contains your favorite programming languages.

In [19]:
import pandas as pd

import pyspark
from pyspark.sql.functions import *

from pydataset import data

# Obtain the SparkSession that every later cell uses to build DataFrames.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

### The name of the column should be `language`

In [2]:
# Favorite programming languages; each entry becomes one row of the DataFrame.
languages = [
    'Python',
    'SQL',
    'Javascript',
    'Java',
    'Go',
    'Julia',
]

In [3]:
# Stage the list in a single-column pandas frame, then convert it to Spark.
languages_pdf = pd.DataFrame({'language': languages})
df = spark.createDataFrame(languages_pdf)
df

DataFrame[language: string]

### View the schema of the dataframe


In [4]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- language: string (nullable = true)



### Output the shape of the dataframe

In [5]:
# The column count is read from df.columns; the row count comes from df.count().
n_cols = len(df.columns)
n_rows = df.count()

print('Number of Columns:', n_cols)
print('Number of Rows:', n_rows)

Number of Columns: 1
Number of Rows: 6


### Show the first 5 records in the dataframe

In [6]:
# Print only the first five rows of the languages DataFrame.
df.show(n=5)

+----------+
|  language|
+----------+
|    Python|
|       SQL|
|Javascript|
|      Java|
|        Go|
+----------+
only showing top 5 rows



## Load the `mpg` dataset as a Spark DataFrame.

In [7]:
# Fetch the mpg sample dataset from pydataset (presumably a pandas DataFrame --
# confirm), then convert it to a Spark DataFrame.
mpg_pdf = data('mpg')
mpg = spark.createDataFrame(mpg_pdf)
mpg

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

In [8]:
# Print a tabular preview of the mpg DataFrame.
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

### Create 1 column of output that contains a message like the one below:

`The 1999 audi a4 has a 4 cylinder engine.`

In [9]:
# Pieces of the sentence in reading order: literal text interleaved with the
# year, manufacturer, model and cylinder columns.
sentence_parts = [
    lit('The '), mpg.year,
    lit(' '), mpg.manufacturer,
    lit(' '), mpg.model,
    lit(' has a '), mpg.cyl,
    lit(' cylinder engine.'),
]
vehicle_info = concat(*sentence_parts).alias('vehicle_info')

# The second argument (False) disables truncation so the full sentence prints.
mpg.select(vehicle_info).show(5, False)

+-----------------------------------------+
|vehicle_info                             |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



### Transform the `trans` column so that it only contains either `manual` or `auto`.

In [13]:
# Capture the run of word characters that precedes the opening parenthesis,
# e.g. "auto(l5)" -> "auto" and "manual(m5)" -> "manual".
trans_type = regexp_extract("trans", r"^(\w+)\(", 1).alias('trans_type')

mpg.select(trans_type).show()

+----------+
|trans_type|
+----------+
|      auto|
|    manual|
|    manual|
|      auto|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|      auto|
|    manual|
|      auto|
|    manual|
|      auto|
|      auto|
|      auto|
|      auto|
|      auto|
+----------+
only showing top 20 rows



## Load the `tips` dataset as a Spark DataFrame.

In [14]:
# Fetch the tips sample dataset from pydataset (presumably a pandas DataFrame --
# confirm), convert it to a Spark DataFrame, and preview it.
tips_pdf = data('tips')
tips = spark.createDataFrame(tips_pdf)
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

### What percentage of observations are smokers?

In [24]:
# Total number of observations, used as the denominator for each group's share.
total_observations = tips.count()

# NOTE: `round` here is pyspark.sql.functions.round (brought in by the wildcard
# import), not the Python builtin -- it operates on a Column expression.
pct_of_customers = round(col('count') / total_observations * 100, 2)

(
    tips
    .groupBy('smoker')
    .count()
    .withColumn('pct_of_customers', pct_of_customers)
    .show()
)

+------+-----+----------------+
|smoker|count|pct_of_customers|
+------+-----+----------------+
|    No|  151|           61.89|
|   Yes|   93|           38.11|
+------+-----+----------------+



### Create a column that contains the tip percentage

In [27]:
# Tip as a fraction of the bill, then scaled to a percentage with two decimals.
# NOTE: `round` is pyspark.sql.functions.round (wildcard import), not the builtin.
tip_ratio = tips.tip / tips.total_bill
tips.withColumn('tip_pct', round(tip_ratio * 100, 2)).show()

+----------+----+------+------+---+------+----+-------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_pct|
+----------+----+------+------+---+------+----+-------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|   5.94|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|  16.05|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|  16.66|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|  13.98|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|  14.68|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|  18.62|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|  22.81|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|  11.61|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|  13.03|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|  21.85|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|  16.65|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|  14.18|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|  10.18|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|  16.28|
|     14.83|3.02|Female|    No|Sun|Dinner|   2| 