In [21]:
import pyspark
import pandas as pd
import numpy as np
from pydataset import data
from pyspark.sql import functions as F

# Question 1: Favorite Languages

In [2]:
# Favorite languages, stored one per row in a single 'language' column.
langs = ['SQL', 'Python', 'R', 'html', 'javascript', 'php', 'css']
pandas_dataframe = pd.DataFrame(langs, columns=['language'])
pandas_dataframe


Unnamed: 0,language
0,SQL
1,Python
2,R
3,html
4,javascript
5,php
6,css


In [4]:
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/18 16:30:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/18 16:30:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Convert the pandas languages DataFrame into a Spark DataFrame.
df = spark.createDataFrame(pandas_dataframe)

In [6]:
# Print the Spark DataFrame's rows as a text table.
df.show()

                                                                                

+----------+
|  language|
+----------+
|       SQL|
|    Python|
|         R|
|      html|
|javascript|
|       php|
|       css|
+----------+



In [7]:
# Print each column's name, Spark type and nullability.
df.printSchema()

root
 |-- language: string (nullable = true)



In [9]:
# Shape of the Spark DataFrame as (row count, column count).
(df.count(), len(df.columns))

(7, 1)

In [10]:
# Preview only the first five rows.
df.show(n=5)

+----------+
|  language|
+----------+
|       SQL|
|    Python|
|         R|
|      html|
|javascript|
+----------+
only showing top 5 rows



# Question 2: MPG dataset — let's do this!

In [34]:
# Load the 'mpg' dataset through pydataset's data() helper.
mpg1 = data('mpg')

In [39]:
# Wrap the mpg data in a pandas DataFrame for the pandas-only cells further
# down (df.trans.unique()).
# NOTE(review): this rebinds `df`, which held the Spark languages DataFrame in
# the Question 1 cells. This cell's execution count (39) is also higher than
# the cells that follow it, so confirm the notebook survives Restart & Run All.
df = pd.DataFrame(mpg1)

In [35]:
# Spark DataFrame built from the mpg data loaded above.
mpg = spark.createDataFrame(mpg1)

In [36]:
# Summary statistics (count, mean, stddev, min, ...) for each column.
mpg.describe().show()



+-------+------------+-----------------+------------------+-----------------+-----------------+----------+---+------------------+-----------------+----+-------+
|summary|manufacturer|            model|             displ|             year|              cyl|     trans|drv|               cty|              hwy|  fl|  class|
+-------+------------+-----------------+------------------+-----------------+-----------------+----------+---+------------------+-----------------+----+-------+
|  count|         234|              234|               234|              234|              234|       234|234|               234|              234| 234|    234|
|   mean|        null|             null| 3.471794871794872|           2003.5|5.888888888888889|      null|4.0|16.858974358974358|23.44017094017094|null|   null|
| stddev|        null|             null|1.2919590310839348|4.509646313320436|1.611534484684289|      null|0.0| 4.255945678889394|5.954643441166448|null|   null|
|    min|        audi|      4runne

                                                                                

In [15]:
# Number of rows in the mpg Spark DataFrame.
mpg.count()

234

In [37]:
# Preview the first five rows of the mpg Spark DataFrame.
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [17]:
# List the column names of the mpg Spark DataFrame.
mpg.columns

['manufacturer',
 'model',
 'displ',
 'year',
 'cyl',
 'trans',
 'drv',
 'cty',
 'hwy',
 'fl',
 'class']

In [38]:
# Build one sentence per car, e.g. "The 1999 audi a4 has a 4 cylinder engine",
# by concatenating literal text with the year, manufacturer, model and cyl columns.
summary_parts = [
    F.lit('The '),
    F.col('year'),
    F.lit(' '),
    F.col('manufacturer'),
    F.lit(' '),
    F.col('model'),
    F.lit(' has a '),
    F.col('cyl'),
    F.lit(' cylinder engine'),
]
car_summary = F.concat(*summary_parts).alias('summary')

# truncate=False keeps the full sentence visible in the printed table.
mpg.select(car_summary).show(5, truncate=False)

+----------------------------------------+
|summary                                 |
+----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 6 cylinder engine|
+----------------------------------------+
only showing top 5 rows



In [50]:
# Distinct transmission codes present in the pandas mpg data.
df['trans'].unique()

array(['auto(l5)', 'manual(m5)', 'manual(m6)', 'auto(av)', 'auto(s6)',
       'auto(l4)', 'auto(l3)', 'auto(l6)', 'auto(s5)', 'auto(s4)'],
      dtype=object)

In [48]:
# Put the distinct transmission codes into their own single-column DataFrame.
unique_trans = df['trans'].unique()
df_trans = pd.DataFrame(unique_trans)

In [49]:
# Display the table of distinct transmission codes.
df_trans

Unnamed: 0,0
0,auto(l5)
1,manual(m5)
2,manual(m6)
3,auto(av)
4,auto(s6)
5,auto(l4)
6,auto(l3)
7,auto(l6)
8,auto(s5)
9,auto(s4)


In [52]:
# Transmission codes that denote a manual gearbox; every other code
# (auto(l5), auto(av), ...) is labelled as automatic below.
manual = ['manual(m5)', 'manual(m6)']

mpg.select(
    mpg.trans,
    # isin(manual) replaces the hand-written OR of equality checks, so the
    # list above is the single place that defines what counts as manual.
    F.when(mpg.trans.isin(manual), 'MAN')
    .otherwise('AUTO')
    .alias('trans_cat'),
).show(5)

+----------+---------+
|     trans|trans_cat|
+----------+---------+
|  auto(l5)|     AUTO|
|manual(m5)|      MAN|
|manual(m6)|      MAN|
|  auto(av)|     AUTO|
|  auto(l5)|     AUTO|
+----------+---------+
only showing top 5 rows

