In [2]:
from pyspark.sql.session import SparkSession

from pyspark.sql.types import (ArrayType, BooleanType, FloatType, IntegerType, StringType, StructField, StructType, TimestampType, DoubleType)

import pyspark.sql.functions as F

spark = SparkSession.builder.appName('fisrtSession')\
    .config('spark.master', 'local[4]')\
    .config('spark.executor.memory', '1gb')\
    .config('spark.shuffle.partitions', 1)\
    .getOrCreate()

schema = StructType([
      StructField("RecordNumber",IntegerType()),
      StructField("Zipcode",IntegerType()),
      StructField("ZipCodeType",StringType()),
      StructField("City",StringType()),
      StructField("State",StringType()),
      StructField("LocationType",StringType()),
      StructField("Lat",DoubleType()),
      StructField("Long",DoubleType()),
      StructField("Xaxis",IntegerType()),
      StructField("Yaxis",DoubleType()),
      StructField("Zaxis",DoubleType()),
      StructField("WorldRegion",StringType()),
      StructField("Country",StringType()),
      StructField("LocationText",StringType()),
      StructField("Location",StringType()),
      StructField("Decommisioned",BooleanType()),
      StructField("TaxReturnsFiled",StringType()),
      StructField("EstimatedPopulation",IntegerType()),
      StructField("TotalWages",IntegerType()),
      StructField("Notes",StringType())
  ])

df = spark.read.schema(schema)\
    .json('zipcodes.json')

df.registerTempTable('zipcodes')
df.head()

Row(RecordNumber=1, Zipcode=704, ZipCodeType='STANDARD', City='PARC PARQUE', State='PR', LocationType='NOT ACCEPTABLE', Lat=17.96, Long=-66.22, Xaxis=None, Yaxis=-0.87, Zaxis=0.3, WorldRegion='NA', Country='US', LocationText='Parc Parque, PR', Location='NA-US-PR-PARC PARQUE', Decommisioned=False, TaxReturnsFiled=None, EstimatedPopulation=None, TotalWages=None, Notes=None)

In [3]:
output = spark.sql('SELECT * FROM zipcodes')
output.show()

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|   Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|        Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-------------+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96| -66.22| null|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        false|           null|               null|      null|         null|
|           2|    70

In [4]:
output = spark.sql('SELECT RecordNumber, Notes FROM zipcodes')
output.show()

+------------+-------------+
|RecordNumber|        Notes|
+------------+-------------+
|           1|         null|
|           2|         null|
|          10|         null|
|       61391|         null|
|       61392|         null|
|       61393|         null|
|           4|         null|
|       39827|no NWS data, |
|       39828|         null|
|       49345|         null|
|       49346|         null|
|       49347|         null|
|       49348|         null|
|          10|         null|
|           3|         null|
|       54354|         null|
|       54355|         null|
|       54356|         null|
|       76511|         null|
|       76512|         null|
+------------+-------------+
only showing top 20 rows



In [5]:

output = spark.sql('SELECT RecordNumber FROM zipcodes WHERE RecordNumber > 10')
output.show()

+------------+
|RecordNumber|
+------------+
|       61391|
|       61392|
|       61393|
|       39827|
|       39828|
|       49345|
|       49346|
|       49347|
|       49348|
|       54354|
|       54355|
|       54356|
|       76511|
|       76512|
|       76513|
+------------+

