In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

def IntegerSafe(value): # In case there are non-integer type to be converted.
    """Convert ``value`` to an ``int``, or return ``None`` if it cannot be converted.

    Args:
        value: Anything accepted by ``int()`` (typically a CSV field string).

    Returns:
        The integer value, or ``None`` when ``int(value)`` fails.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text such as "" or "abc".
        # TypeError: unsupported types such as None or a list.
        # OverflowError: float infinity.
        return None

# Reuse the active Spark context/session when one exists; otherwise start them.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()


# Business rows: split each line on commas, parse the first field with
# IntegerSafe and keep the next four fields as raw strings.
business = (
    sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv")
    .map(lambda line: line.split(','))
    .map(lambda cols: (IntegerSafe(cols[0]), cols[1], cols[2], cols[3], cols[4]))
)

# Supervisor rows: split each line on commas and parse the first two fields
# with IntegerSafe.
supervisor = (
    sc.textFile("../Data/SF_business/supervisor_sf.csv")
    .map(lambda line: line.split(","))
    .map(lambda cols: (IntegerSafe(cols[0]), IntegerSafe(cols[1])))
)
              

# Schema for the business tuples built above: only "name" is declared non-nullable.
business_schema = StructType([
    StructField("zip", IntegerType(), nullable=True),
    StructField("name", StringType(), nullable=False),
    StructField("street", StringType(), nullable=True),
    StructField("city", StringType(), nullable=True),
    StructField("state", StringType(), nullable=True),
])

# Schema for the supervisor tuples: both integer columns are declared non-nullable.
supervisor_schema = StructType([
    StructField("zip", IntegerType(), nullable=False),
    StructField("id", IntegerType(), nullable=False),
])

# Attach the explicit schemas to the RDDs to obtain DataFrames.
business_df = ss.createDataFrame(business, schema=business_schema)
supervisor_df = ss.createDataFrame(supervisor, schema=supervisor_schema)

## Save the Supervisor DataFrame as “Supervisor” and the Business DataFrame as “Business”.

In [2]:
# Persist the business DataFrame as the table 'Business'.
# The default save mode raises if the table already exists, so re-running this
# cell in the same session would fail; "overwrite" keeps the cell re-runnable.
business_df.write.mode('overwrite').saveAsTable('Business')

In [3]:
# Persist the supervisor DataFrame as the table 'Supervisor'.
# The default save mode raises if the table already exists, so re-running this
# cell in the same session would fail; "overwrite" keeps the cell re-runnable.
supervisor_df.write.mode('overwrite').saveAsTable('Supervisor')

## Find the supervisor id for "Holbert Deneice M"

In [4]:
# Preview the first five rows of the Business table.
ss.sql("select * from Business").show(n=5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94105| Barney & Barney Llc|1 Market St Steua...|San Francisco|   CA|
|94109|   Holbert Deneice M|  1426 California St|San Francisco|   CA|
| 6002|      Integralis Inc|310 West Newberry Rd|   Bloomfield|   CT|
|95603|       Mcadams Pat G|  10279 Mt Vernon Rd|       Auburn|   CA|
|95685|Young Gregory You...|14508 Shake Ridge Rd| Sutter+creek|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



In [5]:
# Preview the first five rows of the Supervisor table.
ss.sql("select * from Supervisor").show(n=5)

+-----+---+
|  zip| id|
+-----+---+
|94115|  5|
|94116|  7|
|94116|  4|
|94117|  1|
|94117|  7|
+-----+---+
only showing top 5 rows



In [6]:
# Join the two tables on zip and list the supervisor id(s) for the named business.
(
    ss.sql("select id from Business JOIN Supervisor ON Business.zip = Supervisor.zip where name = 'Holbert Deneice M'")
    .show()
)

+---+
| id|
+---+
|  5|
|  3|
|  6|
|  2|
+---+



## Kill the Spark context and re-read the tables

In [7]:
# Shut down the current SparkContext.
sc.stop()

In [9]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Obtain a SparkContext and SparkSession again after the earlier sc.stop().
# NOTE(review): tables written with saveAsTable are presumably only visible to a
# new session when the catalog is backed by a persistent metastore (e.g. a
# Hive-enabled session) — confirm whether the lookup below is expected to succeed.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [10]:
# Re-run the same join through the newly obtained session.
(
    ss.sql("select id from Business JOIN Supervisor ON Business.zip = Supervisor.zip where name = 'Holbert Deneice M'")
    .show()
)

AnalysisException: 'Table or view not found: Business; line 1 pos 15'