# Spark Basics

In [3]:
from pyspark.sql import SparkSession

In [9]:
# Build (or reuse) a Spark session running on a local master, with the
# driver bound to the loopback address.
spark = (
    SparkSession.builder
    .appName('sparkBasics')
    .master("local[*]")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

In [10]:
# Load the JSON file into a DataFrame; column types are inferred.
df = spark.read.format('json').load('people.json')

In [11]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# List of column names in the DataFrame.
df.columns

['age', 'name']

In [13]:
# Summary statistics per column: count, mean, stddev, min and max.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



# Defining Schema - For Strict Datatype Conformity

In [23]:
from pyspark.sql.types import StructField,IntegerType,StringType,StructType

# Explicit schema: one nullable StructField per column, so that 'age' is
# read as an integer rather than the long type Spark infers on its own.
data_schema = [
    StructField(col_name, col_type, True)
    for col_name, col_type in (
        ('age', IntegerType()),
        ('name', StringType()),
    )
]

final_struct = StructType(data_schema)

# Re-read the JSON file, this time enforcing the schema defined above.
df = spark.read.schema(final_struct).json('people.json')
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [29]:
# Project a single column out of the DataFrame.
df.select(df['age']).show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [31]:
# Project several columns at once; the output follows the order given here.
df.select('name', 'age').show()

+-------+----+
|   name| age|
+-------+----+
|Michael|null|
|   Andy|  30|
| Justin|  19|
+-------+----+



In [44]:
# Peek at the first two rows, returned as a list of Row objects.
df.take(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [40]:
# Derive a new column from an existing one; the result is a new DataFrame
# and `df` itself is left unchanged.
df.withColumn('newAge', df.age * 2).show()

+----+-------+------+
| age|   name|newAge|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    60|
|  19| Justin|    38|
+----+-------+------+



In [42]:
# Display the DataFrame with the 'age' column renamed to 'new_age'.
df.withColumnRenamed(existing='age', new='new_age').show()

+-------+-------+
|new_age|   name|
+-------+-------+
|   null|Michael|
|     30|   Andy|
|     19| Justin|
+-------+-------+



In [43]:
# Register the DataFrame as a temporary view named 'people' so that it can
# be queried with SQL.
df.createOrReplaceTempView('people')

# Run a SQL query against the view; the result is itself a DataFrame.
results = spark.sql('SELECT * from people')

results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# Working with CSVs

In [1]:
from pyspark.sql import SparkSession

In [13]:
# A plain `SparkSession.builder.appName('ops').getOrCreate()` ran into a
# binding-address issue when running locally, so the master and the driver
# bind address are set explicitly.
spark = (
    SparkSession.builder
    .appName('ops')
    .master("local[*]")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

In [20]:
# Load the stock CSV: the first line holds the column names and the column
# types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('appl_stock.csv')
)

In [28]:
# Look at the first row only. `first()` fetches a single Row (instead of
# fetching two and discarding one) and yields None rather than raising an
# IndexError when the DataFrame is empty.
df.first()

Row(Date='2010-01-04', Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)

In [27]:
# Inspect the column types that were inferred for the CSV.
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [32]:
# Keep only rows whose Close is below 500, then show just Close and Volume.
df.where(df.Close < 500).select('Close', 'Volume').show()

+------------------+---------+
|             Close|   Volume|
+------------------+---------+
|        214.009998|123432400|
|        214.379993|150476200|
|        210.969995|138040000|
|            210.58|119282800|
|211.98000499999998|111902700|
|210.11000299999998|115557400|
|        207.720001|148614900|
|        210.650002|151473000|
|            209.43|108223500|
|            205.93|148516900|
|        215.039995|182501900|
|            211.73|153038200|
|        208.069996|152038600|
|            197.75|220441900|
|        203.070002|266424900|
|        205.940001|466777500|
|        207.880005|430642100|
|        199.289995|293375600|
|        192.060003|311488100|
|        194.729998|187469100|
+------------------+---------+
only showing top 20 rows



In [37]:
# Combine several conditions in one filter: `&` is logical AND and `~` is
# logical NOT. Each comparison needs its own parentheses.
df.filter(
    (df.Close > 200) & ~(df.Open < 200)
).show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [40]:
# Get the matching Row as a dictionary.
# 'Low' is spelled as it appears in the schema, so the lookup does not rely
# on case-insensitive column resolution. The guard returns None instead of
# raising an IndexError when no row has exactly this Low value.
result = df.filter(df['Low'] == 197.16).collect()
result[0].asDict() if result else None

# Groupby and Aggregate Functions


In [45]:
# Load the sales CSV with a header row and inferred column types.
df = spark.read.options(header=True, inferSchema=True).csv('sales_info.csv')

In [46]:
# Inspect the inferred schema, then display the rows of the sales data.
df.printSchema()
df.show()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+

