In [1]:
# Imports and create spark 

import pyspark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data

spark = pyspark.sql.SparkSession.builder.getOrCreate()

### 1. Create a spark data frame that contains your favorite programming languages.
- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
# Create pandas dataframe by columns using dictionary-like object

pd_df = pd.DataFrame({'language': ['Python', 'JavaScript', 'HTML', 'Java', 'C']})
# convert the pandas df to a Spark df
sp_df = spark.createDataFrame(pd_df)

In [3]:
# .printSchema()
sp_df.printSchema()

root
 |-- language: string (nullable = true)



In [4]:
sp_df.describe()

DataFrame[summary: string, language: string]

In [5]:
# 
def spark_shape(self):
    return (self.count(), len(self.columns))
pyspark.sql.dataframe.DataFrame.shape = spark_shape

In [6]:
# First 5 records
sp_df.show()

+----------+
|  language|
+----------+
|    Python|
|JavaScript|
|      HTML|
|      Java|
|         C|
+----------+



### 2. Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:
- The 1999 audi a4 has a 4 cylinder engine.
- For each vehicle.
- Transform the trans column so that it only contains either manual or auto.

In [7]:
# Create pandas df

mpg_pd = data("mpg")
mpg_pd.head(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [8]:
# Tranform to spark df

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import round, concat, sum, min, max, count, avg, mean
from pyspark.sql.functions import lit

# The 1999 audi a4 has a 4 cylinder engine
mpg.select(concat(lit("The "), mpg.year, mpg.manufacturer, mpg.model, 
                  lit("has a "), mpg.cyl, lit(" cylinder engine")).alias("Description")).show(10, truncate=False)

+---------------------------------------------+
|Description                                  |
+---------------------------------------------+
|The 1999audia4has a 4 cylinder engine        |
|The 1999audia4has a 4 cylinder engine        |
|The 2008audia4has a 4 cylinder engine        |
|The 2008audia4has a 4 cylinder engine        |
|The 1999audia4has a 6 cylinder engine        |
|The 1999audia4has a 6 cylinder engine        |
|The 2008audia4has a 6 cylinder engine        |
|The 1999audia4 quattrohas a 4 cylinder engine|
|The 1999audia4 quattrohas a 4 cylinder engine|
|The 2008audia4 quattrohas a 4 cylinder engine|
+---------------------------------------------+
only showing top 10 rows



In [10]:
from pyspark.sql.functions import when

# Transform the trans column so that it only contains either manual or auto.
mpg.select(mpg.trans, when(mpg.trans.startswith("a"), "auto").otherwise("manual")).show(10)

+----------+--------------------------------------------------------+
|     trans|CASE WHEN startswith(trans, a) THEN auto ELSE manual END|
+----------+--------------------------------------------------------+
|  auto(l5)|                                                    auto|
|manual(m5)|                                                  manual|
|manual(m6)|                                                  manual|
|  auto(av)|                                                    auto|
|  auto(l5)|                                                    auto|
|manual(m5)|                                                  manual|
|  auto(av)|                                                    auto|
|manual(m5)|                                                  manual|
|  auto(l5)|                                                    auto|
|manual(m6)|                                                  manual|
+----------+--------------------------------------------------------+
only showing top 10 

In [11]:
# 
mpg = mpg.withColumn("trans", when(mpg.trans.startswith("a"), "auto").otherwise("manual")).show(10)

+------------+----------+-----+----+---+------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|        a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|        a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|        a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|        a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
|        audi|        a4|  3.1|2008|  6|  auto|  f| 18| 27|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|  auto|  4| 16| 25|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual|  4| 20| 28|  p|compact|
+------------+----------+-----+----+---+------+---+---+---+---+-