In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook under a fixed application name;
# getOrCreate() hands back an existing session when one is already active.
spark = (
    SparkSession.builder
    .appName("Spark Basic Operations")
    .getOrCreate()
)

In [2]:
# Echo the session object as the cell result to confirm it was created.
spark

In [3]:
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType,LongType

In [4]:
# Explicit schema for the demo data, declared field by field (similar in spirit
# to declaring a BigQuery table schema): two nullable string columns followed
# by a non-nullable long column.
schema = (
    StructType()
    .add("city", StringType(), True)
    .add("country", StringType(), True)
    .add("counts", LongType(), False)
)

In [5]:
# Sample records as positional Rows, in (city, country, counts) order.
# NOTE(review): "New Zeland" looks like a misspelling of "New Zealand"; it is
# left as-is so the values keep matching the recorded cell outputs.
rows = [
    Row(*record)
    for record in (
        ("Auckland", "New Zeland", 10),
        ("Sydney", "Australia", 53),
        ("Wellington", "New Zeland", 5),
    )
]

In [6]:
# Hand the local list of Rows to the session's SparkContext to obtain an RDD.
parallelizeRows = spark.sparkContext.parallelize(rows)


In [7]:
# Inspect what parallelize() returned (an RDD, per the recorded output).
type(parallelizeRows)

pyspark.rdd.RDD

In [8]:
# Build a DataFrame from the RDD of Rows using the explicit schema, then
# print its contents.
df = spark.createDataFrame(
    data=parallelizeRows,
    schema=schema,
)
df.show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|  Auckland|New Zeland|    10|
|    Sydney| Australia|    53|
|Wellington|New Zeland|     5|
+----------+----------+------+



In [9]:
# Confirm the object type: this is a Spark DataFrame, not a pandas one.
type(df)

pyspark.sql.dataframe.DataFrame

In [10]:
# Other ways to obtain a DataFrame (not run in this notebook):
#   CSV file  -> spark.read.csv(...)
#   JSON file -> spark.read.json(...)

# Register df as a temporary view named "my_table" so it can be queried with
# Spark SQL, then keep every row whose city is not Auckland.
df.createOrReplaceTempView("my_table")
df_new = spark.sql(
    "select * from my_table where city != 'Auckland'"
)
type(df_new)

pyspark.sql.dataframe.DataFrame

In [11]:
# Print the SQL-filtered result; the Auckland row is excluded by the query.
df_new.show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|    Sydney| Australia|    53|
|Wellington|New Zeland|     5|
+----------+----------+------+



In [12]:
# Convert the Spark DataFrame into a pandas DataFrame and display it.
# NOTE: per the PySpark docs, toPandas() loads every row into driver memory,
# so it is only appropriate for small results such as this 3-row demo.
df_pandas = df.toPandas()
df_pandas

Unnamed: 0,city,country,counts
0,Auckland,New Zeland,10
1,Sydney,Australia,53
2,Wellington,New Zeland,5


In [13]:
# Some native spark_dataframe functions
import pyspark.sql.functions as F

In [14]:
# Select a single column by its name and preview only the first row.
df.select("country").show(n=1)

+----------+
|   country|
+----------+
|New Zeland|
+----------+
only showing top 1 row



In [15]:
# Select the column through a Column object built with F.col instead of a
# plain string name; preview only the first row.
df.select(F.col("country")).show(n=1)

+----------+
|   country|
+----------+
|New Zeland|
+----------+
only showing top 1 row



In [16]:
# Several columns can be selected at once; the result keeps the order in
# which they are listed (country before city here).
df.select("country", "city").show(n=1)

+----------+--------+
|   country|    city|
+----------+--------+
|New Zeland|Auckland|
+----------+--------+
only showing top 1 row



In [17]:
# F.expr parses a SQL expression string; here it selects `country` and
# renames it to `destination` with a SQL-style alias.
df.select(F.expr("country AS destination")).show()

+-----------+
|destination|
+-----------+
| New Zeland|
|  Australia|
| New Zeland|
+-----------+



In [18]:
# An alias applied with .alias() on the resulting Column takes precedence over
# the alias inside the SQL string: the displayed column is `country_again`,
# not `destination`.
df.select(F.expr("country AS destination").alias("country_again")).show()

+-------------+
|country_again|
+-------------+
|   New Zeland|
|    Australia|
|   New Zeland|
+-------------+



In [20]:
# selectExpr accepts SQL expression strings directly, one per output column,
# so no F.expr wrapper is needed. The same source column may appear twice:
# once under a new name and once unchanged.
df.selectExpr("country as new_country", "country").show()

+-----------+----------+
|new_country|   country|
+-----------+----------+
| New Zeland|New Zeland|
|  Australia| Australia|
| New Zeland|New Zeland|
+-----------+----------+



In [22]:
# Aggregate expressions inside selectExpr reduce the whole DataFrame to a
# single row: the mean of `counts` and the number of distinct countries.
# NOTE(review): the alias "country_occurance" is misspelled ("occurrence") and
# actually holds a distinct count; it is left unchanged because it is the
# output column name.
df.selectExpr("avg(counts) AS average_counts", "count(distinct(country)) as country_occurance").show()

+------------------+-----------------+
|    average_counts|country_occurance|
+------------------+-----------------+
|22.666666666666668|                2|
+------------------+-----------------+



In [23]:
# Keep every existing column ("*") and append a constant integer column
# named "One" whose value is 1 on every row.
df.select(F.expr("*"), F.lit(1).alias("One")).show()

+----------+----------+------+---+
|      city|   country|counts|One|
+----------+----------+------+---+
|  Auckland|New Zeland|    10|  1|
|    Sydney| Australia|    53|  1|
|Wellington|New Zeland|     5|  1|
+----------+----------+------+---+



In [29]:
# Append a constant string column "one" (value "One" on every row) and keep
# the widened DataFrame for the cells that follow.
df_more_column = df.withColumn(
    colName="one",
    col=F.lit("One"),
)
df_more_column.show()

+----------+----------+------+---+
|      city|   country|counts|one|
+----------+----------+------+---+
|  Auckland|New Zeland|    10|One|
|    Sydney| Australia|    53|One|
|Wellington|New Zeland|     5|One|
+----------+----------+------+---+



In [36]:
# withColumn adds a column, or replaces it when the name already exists:
# "two" is first created as a copy of "one", then overwritten with the
# literal 2. Nothing is renamed here; "one" is still present in the output.
df_more_column.withColumn("two", F.expr("one")).withColumn("two", F.lit(2)).show()

+----------+----------+------+---+---+
|      city|   country|counts|one|two|
+----------+----------+------+---+---+
|  Auckland|New Zeland|    10|One|  2|
|    Sydney| Australia|    53|One|  2|
|Wellington|New Zeland|     5|One|  2|
+----------+----------+------+---+---+



In [37]:
# Rename column "one" to "ONE"; only the column label changes, the values
# stay the same.
df_more_column.withColumnRenamed(existing="one", new="ONE").show()

+----------+----------+------+---+
|      city|   country|counts|ONE|
+----------+----------+------+---+
|  Auckland|New Zeland|    10|One|
|    Sydney| Australia|    53|One|
|Wellington|New Zeland|     5|One|
+----------+----------+------+---+

