In [1]:
from pyspark.sql import SparkSession

# Build the notebook's SparkSession (an already-running one is reused if present).
session_builder = SparkSession.builder.appName("Spark Basic Operations")
spark = session_builder.getOrCreate()

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [3]:
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType,LongType

In [4]:
# Explicit schema for the demo rows: two nullable string columns and one
# non-nullable long column. Declaring column names and types up front is
# similar in spirit to declaring a BigQuery table schema.
schema = StructType([
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("counts", LongType(), False),
])

In [5]:
# Demo records as positional Row objects, in schema order (city, country, counts).
# NOTE(review): "New Zeland" is kept verbatim (probably meant "New Zealand")
# because the recorded outputs in this notebook depend on it.
rows = [
    Row(*record)
    for record in (
        ("Auckland", "New Zeland", 10),
        ("Sydney", "Australia", 53),
        ("Wellington", "New Zeland", 5),
    )
]

In [6]:
# Hand the local list of Row objects to the SparkContext to obtain an RDD.
spark_context = spark.sparkContext
parallelizeRows = spark_context.parallelize(rows)


In [7]:
# Inspect what parallelize() returned (the recorded output shows an RDD).
type(parallelizeRows)

pyspark.rdd.RDD

In [8]:
# Turn the RDD into a DataFrame using the explicit schema, then print its rows.
df = spark.createDataFrame(data=parallelizeRows, schema=schema)
df.show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|  Auckland|New Zeland|    10|
|    Sydney| Australia|    53|
|Wellington|New Zeland|     5|
+----------+----------+------+



In [9]:
# Inspect what createDataFrame() returned.
type(df) # Spark DataFrame (not a pandas one)

pyspark.sql.dataframe.DataFrame

In [10]:
# Other ways to obtain a DataFrame (not executed here):
#   df = spark.read.csv()    # from a .csv file
#   df = spark.read.json()   # from JSON

# Register the DataFrame as a temporary (lazy) view so it can be queried
# with Spark SQL, then run a query that excludes Auckland.
df.createOrReplaceTempView('my_table')
non_auckland_query = "select * from my_table where city != 'Auckland'"
df_new = spark.sql(non_auckland_query)
type(df_new)

pyspark.sql.dataframe.DataFrame

In [11]:
# Print the result of the Spark SQL query (every row except Auckland).
df_new.show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|    Sydney| Australia|    53|
|Wellington|New Zeland|     5|
+----------+----------+------+



In [12]:
# Convert the Spark DataFrame into a pandas DataFrame
# (spark_dataframe => pandas_dataframe).
df_pandas = df.toPandas()
# Leave the frame as the last expression so the notebook renders it.
df_pandas

Unnamed: 0,city,country,counts
0,Auckland,New Zeland,10
1,Sydney,Australia,53
2,Wellington,New Zeland,5


In [13]:
# Some native spark_dataframe functions
import pyspark.sql.functions as F

In [14]:
# The helpers in F build Column expressions (SQL-like), not values.
# Printing them shows the expression each one represents, e.g.:
not_auckland = F.col('city') != "Auckland"
country_length = F.expr("length(country)")
print(not_auckland)
print(country_length)

Column<b'(NOT (city = Auckland))'>
Column<b'length(country)'>


In [15]:
# Column names of the DataFrame, returned as a plain Python list.
df.columns # type => list

['city', 'country', 'counts']

In [16]:
# Append a derived column computed by the SQL expression length(country).
country_len_expr = F.expr("length(country)")
df.withColumn('country_len', country_len_expr).show()

+----------+----------+------+-----------+
|      city|   country|counts|country_len|
+----------+----------+------+-----------+
|  Auckland|New Zeland|    10|         10|
|    Sydney| Australia|    53|          9|
|Wellington|New Zeland|     5|         10|
+----------+----------+------+-----------+



In [17]:
# Project a single column, referring to it by name, and print only the first row.
country_by_name = df.select("country")
country_by_name.show(1)

+----------+
|   country|
+----------+
|New Zeland|
+----------+
only showing top 1 row



In [18]:
# The same projection, with the column passed as a Column object
# instead of a name string.
country_column = F.col("country")
df.select(country_column).show(1)

+----------+
|   country|
+----------+
|New Zeland|
+----------+
only showing top 1 row



In [19]:
# Project several columns at once; the output follows the argument order.
country_and_city = df.select("country", "city")
country_and_city.show(1)

+----------+--------+
|   country|    city|
+----------+--------+
|New Zeland|Auckland|
+----------+--------+
only showing top 1 row



In [20]:
# Renaming, method 1 of 3: alias the column inside a SQL expression string.
destination_expr = F.expr("country AS destination")
df.select(destination_expr).show()

+-----------+
|destination|
+-----------+
| New Zeland|
|  Australia|
| New Zeland|
+-----------+



In [21]:
# Renaming, method 2 of 3: F.col() is just a simple expression, so the same
# rename can be written with Column.alias().
# Variant left commented out by the author:
# df.select(F.expr("country AS destination").alias("country_again")).show()
destination_column = F.col("country").alias("destination")
df.select(destination_column).show()

+-----------+
|destination|
+-----------+
| New Zeland|
|  Australia|
| New Zeland|
+-----------+



In [22]:
# Renaming, method 3 of 3: rename an existing column on the DataFrame itself.
(
    df.select("country")
    .withColumnRenamed("country", "destination")
    .show()
)

+-----------+
|destination|
+-----------+
| New Zeland|
|  Australia|
| New Zeland|
+-----------+



In [23]:
# Upper-case the country values; reusing the existing column name replaces
# that column instead of adding a new one.
upper_country = F.upper(F.col("country"))
df.select("country").withColumn("country", upper_country).show()

+----------+
|   country|
+----------+
|NEW ZELAND|
| AUSTRALIA|
|NEW ZELAND|
+----------+



In [24]:
# selectExpr takes SQL expression strings directly: an aliased copy of
# country shown next to the original column.
aliased_copy = df.selectExpr("country as new_country", "country")
aliased_copy.show()

+-----------+----------+
|new_country|   country|
+-----------+----------+
| New Zeland|New Zeland|
|  Australia| Australia|
| New Zeland|New Zeland|
+-----------+----------+



In [25]:
# Aggregate over the whole DataFrame: average of counts and number of
# distinct countries.
# NOTE(review): the alias "country_occurance" is kept verbatim (probably meant
# "occurrence") so the recorded output header still matches.
df.selectExpr(
    "avg(counts) AS average_counts",
    "count(distinct(country)) as country_occurance",
).show()

+------------------+-----------------+
|    average_counts|country_occurance|
+------------------+-----------------+
|22.666666666666668|                2|
+------------------+-----------------+



In [26]:
# Keep every existing column and append a constant integer column named "One".
every_column = F.expr("*")
constant_one = F.lit(1).alias("One")
df.select(every_column, constant_one).show()

+----------+----------+------+---+
|      city|   country|counts|One|
+----------+----------+------+---+
|  Auckland|New Zeland|    10|  1|
|    Sydney| Australia|    53|  1|
|Wellington|New Zeland|     5|  1|
+----------+----------+------+---+



In [27]:
# Add a constant string column "one" with withColumn and keep the result;
# the following cells build on df_more_column.
df_more_column = df.withColumn(colName="one", col=F.lit("One"))
df_more_column.show()

+----------+----------+------+---+
|      city|   country|counts|one|
+----------+----------+------+---+
|  Auckland|New Zeland|    10|One|
|    Sydney| Australia|    53|One|
|Wellington|New Zeland|     5|One|
+----------+----------+------+---+



In [28]:
# Copy a column under a new name, then change its content. Nothing is
# actually renamed here: "one" stays in place, as the output shows.
(
    df_more_column
    .withColumn("two", F.expr("one"))  # new column "two" copied from "one"
    .withColumn("two", F.lit(2))       # same name again -> content replaced by literal 2
    .show()
)

+----------+----------+------+---+---+
|      city|   country|counts|one|two|
+----------+----------+------+---+---+
|  Auckland|New Zeland|    10|One|  2|
|    Sydney| Australia|    53|One|  2|
|Wellington|New Zeland|     5|One|  2|
+----------+----------+------+---+---+



In [29]:
# Rename column "one" to "ONE"; only the header changes, the values stay "One".
renamed_upper = df_more_column.withColumnRenamed("one", "ONE")
renamed_upper.show()

+----------+----------+------+---+
|      city|   country|counts|ONE|
+----------+----------+------+---+
|  Auckland|New Zeland|    10|One|
|    Sydney| Australia|    53|One|
|Wellington|New Zeland|     5|One|
+----------+----------+------+---+



In [30]:
# Removing a column: drop "one" again, which leaves the original
# city / country / counts columns. df_old is reused by the cells that follow.
df_old = df_more_column.drop('one')
df_old.show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|  Auckland|New Zeland|    10|
|    Sydney| Australia|    53|
|Wellington|New Zeland|     5|
+----------+----------+------+



In [31]:
# Spark DataFrame filtering with Column expressions: build each condition
# separately, then require both with &.
is_small_count = F.col("counts") < 20
is_auckland = F.col("city") == "Auckland"
df_old.filter(is_small_count & is_auckland).show()

+--------+----------+------+
|    city|   country|counts|
+--------+----------+------+
|Auckland|New Zeland|    10|
+--------+----------+------+



In [32]:
# The same filter written as a single SQL predicate string.
sql_predicate = "counts < 20 and city = 'Auckland'"
df_old.where(sql_predicate).show()

+--------+----------+------+
|    city|   country|counts|
+--------+----------+------+
|Auckland|New Zeland|    10|
+--------+----------+------+



In [33]:
# Filters can be chained, mixing a Column condition with a SQL-string condition.
(
    df_old
    .filter(F.col("counts") < 20)
    .where("city = 'Auckland'")
    .show()
)

+--------+----------+------+
|    city|   country|counts|
+--------+----------+------+
|Auckland|New Zeland|    10|
+--------+----------+------+



In [34]:
# Select the distinct values of a single column.
distinct_cities = df_old.select("city").distinct()
distinct_cities.show()

+----------+
|      city|
+----------+
|Wellington|
|  Auckland|
|    Sydney|
+----------+



In [35]:
# Random sample without replacement, with an explicit seed passed to sample().
sampled_rows = df_old.sample(withReplacement=False, fraction=0.2, seed=7)
sampled_rows.show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|Wellington|New Zeland|     5|
+----------+----------+------+



In [36]:
# Sample roughly half of a 1000-row range and count the result.
# From the PySpark docstring:
#    .. note:: This is not guaranteed to provide exactly the fraction specified of the total
#        count of the given :class:`DataFrame`.
# The seed is fixed (same value as the seeded sample() cell earlier in this
# notebook) so that Restart & Run All reproduces the same count instead of a
# different number on every run.
df_test = spark.range(1000)
df_test.sample(fraction=0.5, seed=7).count()

488

In [37]:
# Print the docstring of DataFrame.randomSplit (its weights and seed parameters).
help(df_test.randomSplit)

Help on method randomSplit in module pyspark.sql.dataframe:

randomSplit(weights, seed=None) method of pyspark.sql.dataframe.DataFrame instance
    Randomly splits this :class:`DataFrame` with the provided weights.
    
    :param weights: list of doubles as weights with which to split the :class:`DataFrame`.
        Weights will be normalized if they don't sum up to 1.0.
    :param seed: The seed for sampling.
    
    >>> splits = df4.randomSplit([1.0, 2.0], 24)
    >>> splits[0].count()
    2
    
    >>> splits[1].count()
    2
    
    .. versionadded:: 1.4



In [38]:
# Split df_test into two DataFrames with 0.4 / 0.6 weights and print the size
# of each part. The seed is fixed (same value as the seeded sample() cell
# earlier in this notebook) so that Restart & Run All reproduces the same
# split instead of different counts on every run.
df_split_test = df_test.randomSplit([0.4, 0.6], seed=7)
print(df_split_test[0].count())
print(df_split_test[1].count())

387
613


In [39]:
# Sorting: order rows by the counts column (the output is ascending: 5, 10, 53).
df_old.sort("counts").show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|Wellington|New Zeland|     5|
|  Auckland|New Zeland|    10|
|    Sydney| Australia|    53|
+----------+----------+------+



In [40]:
# orderBy() with a column name yields the same ascending order as sort("counts").
df_old.orderBy("counts").show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|Wellington|New Zeland|     5|
|  Auckland|New Zeland|    10|
|    Sydney| Australia|    53|
+----------+----------+------+



In [41]:
# Descending order: wrap the column name in F.desc().
counts_descending = F.desc("counts")
df_old.orderBy(counts_descending).show()

+----------+----------+------+
|      city|   country|counts|
+----------+----------+------+
|    Sydney| Australia|    53|
|  Auckland|New Zeland|    10|
|Wellington|New Zeland|     5|
+----------+----------+------+



In [42]:
# Print the docstring of F.expr (parses an expression string into a Column).
help(F.expr)

Help on function expr in module pyspark.sql.functions:

expr(str)
    Parses the expression string into the column that it represents
    
    >>> df.select(expr("length(name)")).collect()
    [Row(length(name)=5), Row(length(name)=3)]
    
    .. versionadded:: 1.5



In [43]:
# Limit the result to 2 rows, then print them.
df_old.limit(2).show()
# Author's note: "this is faster than" the alternative below.
# NOTE(review): that performance claim is not demonstrated anywhere in this
# notebook -- measure (e.g. with %%time) before relying on it.
# df_old.show(2)

+--------+----------+------+
|    city|   country|counts|
+--------+----------+------+
|Auckland|New Zeland|    10|
|  Sydney| Australia|    53|
+--------+----------+------+

