In [None]:
# Import SparkSession
from pyspark.sql import SparkSession

# Build the session against the standalone cluster master; getOrCreate()
# hands back the already-running session when this cell is executed again.
spark = (
    SparkSession.builder
    .master('spark://10.0.0.4:7077')
    .appName('spark-basic')
    .getOrCreate()
)

In [6]:
# Load zipcode.csv, taking column names from its first row, and register the
# result as the temporary view "Zipcodes" for use in SQL queries.
(
    spark.read
    .option("header", True)
    .csv("zipcode.csv")
    .createOrReplaceTempView("Zipcodes")
)

## Select Columns

In [7]:
# Project four columns from the view and print only the first five rows.
spark.sql(
    "SELECT country, city, zipcode, state FROM ZIPCODES"
).show(n=5)

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|               HOLT|  32564|   FL|
|     US|          HOMOSASSA|  34487|   FL|
+-------+-------------------+-------+-----+
only showing top 5 rows



In [18]:
# Same projection with the default row limit, but truncate=False prints each
# value in full instead of shortening long strings.
spark.sql(
    "SELECT country, city, zipcode, state FROM ZIPCODES"
).show(truncate=False)

+-------+-------------------+-------+-----+
|country|city               |zipcode|state|
+-------+-------------------+-------+-----+
|US     |PARC PARQUE        |704    |PR   |
|US     |PASEO COSTA DEL SUR|704    |PR   |
|US     |BDA SAN LUIS       |709    |PR   |
|US     |HOLT               |32564  |FL   |
|US     |HOMOSASSA          |34487  |FL   |
|US     |CINGULAR WIRELESS  |76166  |TX   |
|US     |FORT WORTH         |76177  |TX   |
|US     |FT WORTH           |76177  |TX   |
|US     |SPRUCE PINE        |35585  |AL   |
|US     |ASH HILL           |27007  |NC   |
|US     |URB EUGENE RICE    |704    |PR   |
|US     |MESA               |85209  |AZ   |
|US     |MESA               |85210  |AZ   |
|US     |HILLIARD           |32046  |FL   |
|US     |HOLDER             |34445  |FL   |
|US     |SECT LANAUSSE      |704    |PR   |
|US     |SPRING GARDEN      |36275  |AL   |
|US     |SPRINGVILLE        |35146  |AL   |
|US     |ASHEBORO           |27203  |NC   |
|US     |ASHEBORO           |272

## Select Columns based on WHERE condition

In [16]:
# Restrict the projection to rows whose state is 'AZ'; show at most five.
spark.sql(
    "SELECT  country, city, zipcode, state FROM ZIPCODES WHERE state = 'AZ'"
).show(n=5)

+-------+----+-------+-----+
|country|city|zipcode|state|
+-------+----+-------+-----+
|     US|MESA|  85209|   AZ|
|     US|MESA|  85210|   AZ|
+-------+----+-------+-----+



In [17]:
# Keep the filtered query result as a DataFrame, then bring every matching
# row back to the driver as a list of Row objects (displayed as cell output).
df2 = spark.sql(
    "SELECT  country, city, zipcode, state FROM ZIPCODES WHERE state = 'AZ'"
)
df2.collect()

[Row(country='US', city='MESA', zipcode='85209', state='AZ'),
 Row(country='US', city='MESA', zipcode='85210', state='AZ')]

## Select Columns based on WHERE IN condition

In [13]:
# Keep rows whose state is one of PR, AZ or FL, sort them by state, and
# print the first ten.
spark.sql(
    "SELECT  country, city, zipcode, state FROM ZIPCODES WHERE state in ('PR','AZ','FL') order by state "
).show(n=10)

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|               MESA|  85209|   AZ|
|     US|               MESA|  85210|   AZ|
|     US|               HOLT|  32564|   FL|
|     US|             HOLDER|  34445|   FL|
|     US|          HOMOSASSA|  34487|   FL|
|     US|           HILLIARD|  32046|   FL|
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|    URB EUGENE RICE|    704|   PR|
+-------+-------------------+-------+-----+
only showing top 10 rows



## Aggregate data with GROUP BY

In [15]:
# Count the rows per state. The query has no ORDER BY, so the order of the
# printed groups is not guaranteed.
spark.sql(
    "SELECT state, count(*) as count FROM ZIPCODES GROUP BY state"
).show()

+-----+-----+
|state|count|
+-----+-----+
|   AZ|    2|
|   NC|    3|
|   AL|    3|
|   TX|    3|
|   FL|    4|
|   PR|    5|
+-----+-----+



In [4]:
# Read 123.csv using its header row for column names. No schema inference is
# requested, so every column is loaded as a string (see the printed schema).
df = (
    spark.read
    .option("header", True)
    .csv("123.csv")
)

# Inspect the column types first, then the rows themselves.
df.printSchema()
df.show()

root
 |-- Name: string (nullable = true)
 |--  SName: string (nullable = true)
 |-- TName: string (nullable = true)
 |-- DOB: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Sal: string (nullable = true)

+-------+------+--------+----------+------+----+
|   Name| SName|   TName|       DOB|Gender| Sal|
+-------+------+--------+----------+------+----+
|  James|  null|   Smith|1991-04-01|     M|3000|
|Michael|  Rose|    null|2000-05-19|     M|4000|
| Robert|  null|Williams|1978-09-05|     M|4000|
|  Maria|  Anne|   Jones|1967-12-01|     F|4000|
|    Jen|  Mary|   Brown|1980-02-17|     F|  -1|
+-------+------+--------+----------+------+----+

