In [47]:
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pydataset import data
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from vega_datasets import data
# `data` is imported from both pydataset and vega_datasets above, and the later
# import wins. Bind each loader to its own unambiguous alias as well.
from pydataset import data as pydataset_data
from vega_datasets import data as vega_data


# Question 1: Favorite Languages

In [2]:
# Favorite languages, one per row, in a single-column pandas frame.
langs = ['SQL', 'Python', 'R', 'html', 'javascript', 'php', 'css']
pandas_dataframe = pd.DataFrame(langs, columns=['language'])
pandas_dataframe


Unnamed: 0,language
0,SQL
1,Python
2,R
3,html
4,javascript
5,php
6,css


In [3]:
# Reuse the active Spark session if one exists, otherwise start one.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/19 10:10:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Hand the pandas frame to Spark to obtain a Spark DataFrame.
df = spark.createDataFrame(pandas_dataframe)

In [5]:
# Display the Spark DataFrame's rows as a text table.
df.show()

                                                                                

+----------+
|  language|
+----------+
|       SQL|
|    Python|
|         R|
|      html|
|javascript|
|       php|
|       css|
+----------+



In [6]:
# Print the column names and the types Spark assigned to them.
df.printSchema()

root
 |-- language: string (nullable = true)



In [7]:
# (row count, column count) for the Spark DataFrame.
(df.count(), len(df.columns))

(7, 1)

In [8]:
# Display only the first 5 rows.
df.show(5)

+----------+
|  language|
+----------+
|       SQL|
|    Python|
|         R|
|      html|
|javascript|
+----------+
only showing top 5 rows



# Question 2: MPG dataset — let's do this!

In [9]:
# Load the mpg dataset through pydataset's loader.
# The bare name `data` is imported from both pydataset and vega_datasets, and
# the later (vega_datasets) import wins on a fresh run, so the unambiguous
# alias is used here.
mpg1 = pydataset_data('mpg')

In [10]:
# Bind the loaded mpg data to `df` as a pandas DataFrame; later cells use it
# for pandas-side checks (df.trans.unique()).
# NOTE: this rebinds `df`, which previously held the Spark languages frame.
df = pd.DataFrame(mpg1)

In [11]:
# Spark DataFrame built from the loaded mpg data.
mpg = spark.createDataFrame(mpg1)

In [12]:
# Summary statistics (count, mean, stddev, min, ...) for each column.
mpg.describe().show()

23/05/19 10:11:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 11:>                                                         (0 + 1) / 1]

+-------+------------+-----------------+------------------+-----------------+-----------------+----------+---+------------------+-----------------+----+-------+
|summary|manufacturer|            model|             displ|             year|              cyl|     trans|drv|               cty|              hwy|  fl|  class|
+-------+------------+-----------------+------------------+-----------------+-----------------+----------+---+------------------+-----------------+----+-------+
|  count|         234|              234|               234|              234|              234|       234|234|               234|              234| 234|    234|
|   mean|        null|             null| 3.471794871794872|           2003.5|5.888888888888889|      null|4.0|16.858974358974358|23.44017094017094|null|   null|
| stddev|        null|             null|1.2919590310839348|4.509646313320436|1.611534484684289|      null|0.0| 4.255945678889394|5.954643441166448|null|   null|
|    min|        audi|      4runne

                                                                                

In [13]:
# Number of rows in the mpg frame.
mpg.count()

234

In [14]:
# Peek at the first 5 rows.
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [15]:
# List the column names.
mpg.columns

['manufacturer',
 'model',
 'displ',
 'year',
 'cyl',
 'trans',
 'drv',
 'cty',
 'hwy',
 'fl',
 'class']

In [16]:
# Build one sentence per car by concatenating literals with column values.
sentence_parts = [
    F.lit('The '), F.col('year'),
    F.lit(' '), F.col('manufacturer'),
    F.lit(' '), F.col('model'),
    F.lit(' has a '), F.col('cyl'),
    F.lit(' cylinder engine'),
]
mpg.select(F.concat(*sentence_parts).alias('summary')).show(5, truncate=False)

+----------------------------------------+
|summary                                 |
+----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 6 cylinder engine|
+----------------------------------------+
only showing top 5 rows



In [17]:
# Distinct transmission codes, taken from the pandas copy of the mpg data.
df['trans'].unique()

array(['auto(l5)', 'manual(m5)', 'manual(m6)', 'auto(av)', 'auto(s6)',
       'auto(l4)', 'auto(l3)', 'auto(l6)', 'auto(s5)', 'auto(s4)'],
      dtype=object)

In [18]:
# One-column pandas frame holding the distinct transmission codes.
df_trans = pd.DataFrame(df['trans'].unique())

In [19]:
# Display the frame of distinct transmission codes.
df_trans

Unnamed: 0,0
0,auto(l5)
1,manual(m5)
2,manual(m6)
3,auto(av)
4,auto(s6)
5,auto(l4)
6,auto(l3)
7,auto(l6)
8,auto(s5)
9,auto(s4)


In [20]:
# Transmission codes that count as manual; every other code is labelled AUTO.
manual = ['manual(m5)', 'manual(m6)']

# Derive a two-level category from the raw transmission code. The membership
# test uses the `manual` list so the codes are spelled out in one place only.
mpg.select(
    mpg.trans,
    F.when(mpg.trans.isin(manual), 'MAN')
    .otherwise('AUTO')
    .alias('trans_cat'),
).show(5)

+----------+---------+
|     trans|trans_cat|
+----------+---------+
|  auto(l5)|     AUTO|
|manual(m5)|      MAN|
|manual(m6)|      MAN|
|  auto(av)|     AUTO|
|  auto(l5)|     AUTO|
+----------+---------+
only showing top 5 rows



# Question 3: Tips dataset

In [21]:
# Load the tips dataset through pydataset's loader.
# The bare name `data` is imported from both pydataset and vega_datasets, and
# the later (vega_datasets) import wins on a fresh run, so the unambiguous
# alias is used here.
tips = pydataset_data('tips')

In [22]:
# Rebind `df` to a pandas DataFrame of the tips data.
# NOTE(review): `df` is not referenced again in the visible cells — confirm
# whether this assignment is still needed.
df = pd.DataFrame(tips)

In [23]:
# Replace the loaded tips data with its Spark DataFrame equivalent.
# NOTE: `tips` is both the input and the output here, so re-running this cell
# without re-running the load cell would pass the Spark DataFrame back into
# createDataFrame instead of the originally loaded data.
tips = spark.createDataFrame(tips)

In [24]:
# Summary statistics (count, mean, stddev, min, max) for each column.
tips.describe().show()



+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950813|2.9982786885245907|  null|  null|null|  null| 2.569672131147541|
| stddev| 8.902411954856856| 1.383638189001182|  null|  null|null|  null|0.9510998047322344|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lunch|                 6|
+-------+------------------+------------------+------+------+----+------+------------------+



                                                                                

In [25]:
# Number of rows in the tips frame.
tips.count()

244

In [26]:
# Peek at the first 5 rows.
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [27]:
# List the column names.
tips.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [28]:
# Number of observations per smoker status (aggregate keeps its default name).
(
    tips
    .groupBy('smoker')
    .agg(F.count('smoker'))
    .show()
)

+------+-------------+
|smoker|count(smoker)|
+------+-------------+
|    No|          151|
|   Yes|           93|
+------+-------------+



In [29]:
# Same per-status count, with the aggregate column renamed to `count`.
(
    tips
    .groupBy('smoker')
    .agg(F.count('smoker').alias('count'))
    .show()
)

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



In [30]:
# Count per smoker status plus that count's share of all rows.
# The `percentage` column holds a fraction (count / total rows), not a value
# scaled to 0-100.
total_rows = tips.count()  # eager Spark action, evaluated once up front
per_status = F.count('smoker')
(
    tips
    .groupby('smoker')
    .agg(
        per_status.alias('count'),
        (per_status / total_rows).alias('percentage'),
    )
    .show()
)

+------+-----+-------------------+
|smoker|count|         percentage|
+------+-----+-------------------+
|    No|  151| 0.6188524590163934|
|   Yes|   93|0.38114754098360654|
+------+-----+-------------------+



In [31]:
# Peek at the first 5 rows again before deriving new columns.
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [32]:
# Append the tip rate (tip / total_bill, rounded to 2 decimals) as `pct`.
tip_rate = F.round(tips.tip / tips.total_bill, 2)
tips.withColumn('pct', tip_rate).show(5)

+----------+----+------+------+---+------+----+----+
|total_bill| tip|   sex|smoker|day|  time|size| pct|
+----------+----+------+------+---+------+----+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.06|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.17|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|0.14|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.15|
+----------+----+------+------+---+------+----+----+
only showing top 5 rows



In [33]:
# Tip rate per row: every original column plus `pct` = tip / total_bill,
# rounded to 2 decimals. (This cell repeats the computation of the cell
# before it.)
(
    tips
    .withColumn('pct', F.round(tips.tip / tips.total_bill, 2))
    .show(5)
)

+----------+----+------+------+---+------+----+----+
|total_bill| tip|   sex|smoker|day|  time|size| pct|
+----------+----+------+------+---+------+----+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.06|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.17|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|0.14|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.15|
+----------+----+------+------+---+------+----+----+
only showing top 5 rows



In [38]:
# Mean tip rate (tip / total_bill) for each sex / smoker combination,
# rounded to 3 decimals.
row_tip_rate = tips.tip / tips.total_bill
(
    tips
    .groupby(tips.sex, tips.smoker)
    .agg(F.round(F.avg(row_tip_rate), 3).alias('pct'))
    .show()
)

+------+------+-----+
|   sex|smoker|  pct|
+------+------+-----+
|  Male|    No|0.161|
|Female|    No|0.157|
|  Male|   Yes|0.153|
|Female|   Yes|0.182|
+------+------+-----+



# Question 4: Seattle weather dataset

In [40]:
# Load the Seattle weather sample, convert its `date` column to strings on the
# pandas side, and hand the result to Spark.
weather = spark.createDataFrame(
    data.seattle_weather().assign(date=lambda pdf: pdf['date'].astype(str))
)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [42]:
# Print the column names and the types Spark assigned to them.
weather.printSchema()

root
 |-- date: string (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- temp_max: double (nullable = true)
 |-- temp_min: double (nullable = true)
 |-- wind: double (nullable = true)
 |-- weather: string (nullable = true)



In [41]:
# Summary statistics (count, mean, stddev, min, max) for each column.
weather.describe().show()

[Stage 50:>                                                         (0 + 8) / 8]

+-------+----------+-----------------+------------------+-----------------+------------------+-------+
|summary|      date|    precipitation|          temp_max|         temp_min|              wind|weather|
+-------+----------+-----------------+------------------+-----------------+------------------+-------+
|  count|      1461|             1461|              1461|             1461|              1461|   1461|
|   mean|      null| 3.02943189596167|16.439082819986314|8.234770704996578|3.2411362080766604|   null|
| stddev|      null|6.680194322314738| 7.349758097360176|5.023004179961266|1.4378250588746198|   null|
|    min|2012-01-01|              0.0|              -1.6|             -7.1|               0.4|drizzle|
|    max|2015-12-31|             55.9|              35.6|             18.3|               9.5|    sun|
+-------+----------+-----------------+------------------+-----------------+------------------+-------+



                                                                                

In [43]:
# Number of rows (one per recorded day).
weather.count()

1461

In [44]:
# Peek at the first 5 rows.
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [45]:
# Append the daily high converted from Celsius to Fahrenheit (1 decimal).
max_fahrenheit = F.round(weather.temp_max * 1.8 + 32, 1)
weather.withColumn('F_max', max_fahrenheit).show(5)

+----------+-------------+--------+--------+----+-------+-----+
|      date|precipitation|temp_max|temp_min|wind|weather|F_max|
+----------+-------------+--------+--------+----+-------+-----+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle| 55.0|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain| 51.1|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain| 53.1|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain| 54.0|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain| 48.0|
+----------+-------------+--------+--------+----+-------+-----+
only showing top 5 rows



In [46]:
# Append both temperature bounds converted from Celsius to Fahrenheit
# (1 decimal): temp_max -> F_max, temp_min -> F_min.
fahrenheit_cols = [
    F.round(weather[f'temp_{bound}'] * 1.8 + 32, 1).alias(f'F_{bound}')
    for bound in ('max', 'min')
]
weather.select('*', *fahrenheit_cols).show(5)

+----------+-------------+--------+--------+----+-------+-----+-----+
|      date|precipitation|temp_max|temp_min|wind|weather|F_max|F_min|
+----------+-------------+--------+--------+----+-------+-----+-----+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle| 55.0| 41.0|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain| 51.1| 37.0|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain| 53.1| 45.0|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain| 54.0| 42.1|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain| 48.0| 37.0|
+----------+-------------+--------+--------+----+-------+-----+-----+
only showing top 5 rows



In [49]:
# Append a real date column `d` (cast from the string `date`) followed by the
# Fahrenheit versions of the daily high and low.
(
    weather
    .withColumn('d', weather.date.cast('date'))
    .withColumn('F_max', F.round(weather.temp_max * 1.8 + 32, 1))
    .withColumn('F_min', F.round(weather.temp_min * 1.8 + 32, 1))
    .show(5)
)

+----------+-------------+--------+--------+----+-------+----------+-----+-----+
|      date|precipitation|temp_max|temp_min|wind|weather|         d|F_max|F_min|
+----------+-------------+--------+--------+----+-------+----------+-----+-----+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|2012-01-01| 55.0| 41.0|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|2012-01-02| 51.1| 37.0|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|2012-01-03| 53.1| 45.0|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|2012-01-04| 54.0| 42.1|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|2012-01-05| 48.0| 37.0|
+----------+-------------+--------+--------+----+-------+----------+-----+-----+
only showing top 5 rows



In [127]:
# Average daily precipitation per calendar month (total precipitation divided
# by the number of dated rows), listed from driest to wettest.
(
    weather
    .withColumn('month', F.month('date'))
    .groupBy('month')
    .agg(
        F.round(F.sum('precipitation') / F.count('date'), 2)
        .alias('daily_avg_rain')
    )
    .orderBy('daily_avg_rain')
    .show()
)

[Stage 181:>                                                        (0 + 8) / 8]

+-----+--------------+
|month|daily_avg_rain|
+-----+--------------+
|    7|          0.39|
|    6|          1.11|
|    8|          1.32|
|    5|          1.67|
|    9|          1.96|
|    4|          3.13|
|    2|          3.73|
|    1|          3.76|
|   10|          4.06|
|    3|          4.89|
|   12|          5.02|
|   11|          5.35|
+-----+--------------+



                                                                                

In [68]:
# Average daily wind per year (total wind divided by the number of dated
# rows), in chronological order.
(
    weather
    .withColumn('year', F.year('date'))
    .groupBy('year')
    .agg(
        F.round(F.sum('wind') / F.count('date'), 2)
        .alias('daily_avg_wind')
    )
    .orderBy('year')
    .show()
)

+----+--------------+
|year|daily_avg_wind|
+----+--------------+
|2012|           3.4|
|2013|          3.02|
|2014|          3.39|
|2015|          3.16|
+----+--------------+



In [68]:
# Yearly mean of the daily wind readings: sum of wind over count of dated
# rows, rounded to 2 decimals and ordered by year. (This cell repeats the
# computation of the cell before it.)
wind_per_day = F.round(F.sum('wind') / F.count('date'), 2)
by_year = weather.withColumn('year', F.year('date')).groupBy('year')
by_year.agg(wind_per_day.alias('daily_avg_wind')).orderBy('year').show()

+----+--------------+
|year|daily_avg_wind|
+----+--------------+
|2012|           3.4|
|2013|          3.02|
|2014|          3.39|
|2015|          3.16|
+----+--------------+



In [86]:
# How often each weather type occurs in January (all years pooled),
# most frequent first.
january = (
    weather
    .filter(F.month('date') == 1)
    .withColumn('month', F.month('date'))
)
(
    january
    .groupBy('month', 'weather')
    .agg(F.count('weather').alias('count'))
    .orderBy(F.col('count').desc())
    .show()
)



+-----+-------+-----+
|month|weather|count|
+-----+-------+-----+
|    1|    fog|   38|
|    1|   rain|   35|
|    1|    sun|   33|
|    1|drizzle|   10|
|    1|   snow|    8|
+-----+-------+-----+



                                                                                

In [97]:
# Average high and low temperature (in Fahrenheit, 1 decimal) on sunny July
# days, compared between 2013 and 2014.
date_col = F.col('date')
sunny_july = (
    (F.month(date_col) == 7)
    & F.year(date_col).isin(2013, 2014)
    & (F.col('weather') == 'sun')
)
(
    weather
    .filter(sunny_july)
    .withColumn('month', F.month(date_col))
    .withColumn('year', F.year(date_col))
    .groupby('month', 'year', 'weather')
    .agg(
        F.round(F.avg(F.col('temp_max') * 1.8 + 32), 1).alias('daily_avg_high'),
        F.round(F.avg(F.col('temp_min') * 1.8 + 32), 1).alias('daily_avg_low'),
    )
    # Every remaining row has month == 7, so sorting by month would not fix
    # the row order; sort by year to list 2013 before 2014 deterministically.
    .sort('year')
    .show()
)

[Stage 142:>                                                        (0 + 8) / 8]

+-----+----+-------+--------------+-------------+
|month|year|weather|daily_avg_high|daily_avg_low|
+-----+----+-------+--------------+-------------+
|    7|2013|    sun|          79.9|         57.2|
|    7|2014|    sun|          80.8|         57.9|
+-----+----+-------+--------------+-------------+



                                                                                

In [99]:
# How often each weather type occurs in the third quarter of 2015,
# most frequent first.
in_q3_2015 = (F.quarter('date') == 3) & (F.year('date') == 2015)
(
    weather
    .filter(in_q3_2015)
    .withColumn('q', F.quarter('date'))
    .groupBy('q', 'weather')
    .agg(F.count('weather').alias('count'))
    .orderBy(F.col('count').desc())
    .show()
)

[Stage 148:>                                                        (0 + 8) / 8]

+---+-------+-----+
|  q|weather|count|
+---+-------+-----+
|  3|    sun|   64|
|  3|    fog|   21|
|  3|drizzle|    5|
|  3|   rain|    2|
+---+-------+-----+



                                                                                

In [128]:
# Number of days per year with any recorded precipitation (> 0.0),
# in chronological order.
(
    weather
    .where(F.col('precipitation') > 0.0)
    .withColumn('year', F.year('date'))
    .groupBy('year')
    .agg(F.count('precipitation').alias('count'))
    .orderBy('year')
    .show()
)

[Stage 184:>                                                        (0 + 8) / 8]

+----+-----+
|year|count|
+----+-----+
|2012|  177|
|2013|  152|
|2014|  150|
|2015|  144|
+----+-----+



                                                                                