# Manipulating DataFrames in the real world


In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
sc = SparkContext("local", "pyspark-shell")

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

## DataFrame column operations

Dataframes are modified through the use of transformations.

voter_df.filter(voter_df.name.like("M%")) Returns rows where name starts with "M"

filter() is analogous to the WHERE calause in SQL. Spark includes a where which can be used instead of filter.

Select() returns the columns requested from the DataFrame.

withColumn method creates a new column in the DataFrame.

drop() method to remove a column from a DataFrame.

Filtering lets us use only the data matching our desired result. We can remove null values, remove odd entries, split a dataframe containing combined data. You can negate results using ~ character.

To convert data type use the cast() function in the withColumn method to perform the conversion to an IntegerType().

ArrayType() columns are analogous to lists in python. You can use .size() to return length of arrayType() column, .getItem(index) to retrieve a specific item at index.

### Filtering column content with Python

In [61]:
import pyspark.sql.functions as F

voter_df = spark.read.csv("DallasCouncilVoters.csv", header = True)
print(voter_df.select("VOTER_NAME").distinct().show(5, truncate=False))

voter_df = voter_df.filter("length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20")
voter_df = voter_df.filter(~F.col("VOTER_NAME").contains("_")) # or ~voter_df.VOTER_NAME.contains("_")
voter_df.select("VOTER_NAME").distinct().show(40, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|VOTER_NAME                                                                                                                                                                                                                                                                                                                                                                                                   |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Modifying DataFrame columns

In [70]:
voter_df = voter_df.withColumn("splits", F.split(voter_df.VOTER_NAME, "\s+"))
voter_df = voter_df.withColumn("first_name", voter_df.splits.getItem(0))
voter_df = voter_df.withColumn("last_name", voter_df.splits.getItem(F.size("splits")-1))
voter_df = voter_df.drop("splits")
voter_df.show()

+----------+-------------+-------------------+----------+---------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|
+----------+-------------+-------------------+----------+---------+
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|
|02/08/2017|Councilmember| Philip T. Kingston|    Philip| Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings|   Michael| Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|      Adam|  Medrano|
|02/08/2017|Councilmember|       Casey Thomas|     Casey|   Thomas|
|02/08/2017|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|
|02/08/2017|Councilmember|       Scott Griggs|     Scott|   Griggs|
|02/08/2017|Councilmember|   B. Adam  McGough|        B.|  McGough|
|02/08/2017|Councilmember|       Lee Kleinman|       Lee| Kleinman|
|02/08/2017|Councilmember|      Sandy Greyson|     Sandy|  Greyson|
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|
|02/08/2017|Councilmember| Philip T. Kingston|  

## Conditional DataFrame column operations

Using when and otherwise while doing changes. We can actually add an extra argument to select select() method using the when() clause. You can chain multiple when statements together. Otherwise is analogous to the else statement.

### when() example

In [73]:
voter_df = spark.read.csv("DallasCouncilVoters.csv", header = True)

voter_df = voter_df.withColumn("random_val", F.when(voter_df.TITLE == "Councilmember", F.rand()))
voter_df.show()

+----------+-------------+-------------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|         random_val|
+----------+-------------+-------------------+-------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates| 0.6607567044188334|
|02/08/2017|Councilmember| Philip T. Kingston|0.24255626131383023|
|02/08/2017|        Mayor|Michael S. Rawlings|               null|
|02/08/2017|Councilmember|       Adam Medrano|0.39685642401523313|
|02/08/2017|Councilmember|       Casey Thomas|  0.701103007433525|
|02/08/2017|Councilmember|Carolyn King Arnold|  0.932568022920098|
|02/08/2017|Councilmember|       Scott Griggs| 0.5618972722455521|
|02/08/2017|Councilmember|   B. Adam  McGough| 0.4391268430174863|
|02/08/2017|Councilmember|       Lee Kleinman|0.20606861682352284|
|02/08/2017|Councilmember|      Sandy Greyson|0.27350620949942717|
|02/08/2017|Councilmember|  Jennifer S. Gates| 0.6611069475539054|
|02/08/2017|Councilmember| Philip T. Kingston| 0.9245051629882

### When / Otherwise


In [75]:
voter_df = spark.read.csv("DallasCouncilVoters.csv", header = True)

voter_df = voter_df.withColumn("random_val", F.when(voter_df.TITLE == "Councilmember", F.rand())
                               .when(voter_df.TITLE == "Mayor", 2)
                               .otherwise(0))
voter_df.show()
voter_df.filter(voter_df.random_val == 0).show()

+----------+-------------+-------------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|         random_val|
+----------+-------------+-------------------+-------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|0.09477909415548469|
|02/08/2017|Councilmember| Philip T. Kingston| 0.9330434703312289|
|02/08/2017|        Mayor|Michael S. Rawlings|                2.0|
|02/08/2017|Councilmember|       Adam Medrano|  0.617847477143871|
|02/08/2017|Councilmember|       Casey Thomas|0.28639496340976445|
|02/08/2017|Councilmember|Carolyn King Arnold| 0.5061212467016271|
|02/08/2017|Councilmember|       Scott Griggs| 0.8455172813050197|
|02/08/2017|Councilmember|   B. Adam  McGough|0.13108658585491573|
|02/08/2017|Councilmember|       Lee Kleinman| 0.2797485628763148|
|02/08/2017|Councilmember|      Sandy Greyson| 0.1790295223822026|
|02/08/2017|Councilmember|  Jennifer S. Gates| 0.4374651278435502|
|02/08/2017|Councilmember| Philip T. Kingston| 0.4738650530361

### User defined functions

The name of the method you defined and the Spark data type it will return.

### Using user defined functions in Spark

In [99]:
from pyspark.sql.types import StringType
def getFirstAndMiddle(names):
    return " ".join(names[:-1])

udfFirstAndMiddle = F.udf(getFirstAndMiddle, StringType())

voter_df = spark.read.csv("DallasCouncilVoters.csv", header = True)
voter_df = voter_df.filter("length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20")
voter_df = voter_df.filter(~F.col("VOTER_NAME").contains("_")) # or ~voter_df.VOTER_NAME.contains("_")
voter_df = voter_df.withColumn("splits", F.split(voter_df.VOTER_NAME, "\s+"))

voter_df = voter_df.withColumn("first_and_middle_name", udfFirstAndMiddle(voter_df.splits))
voter_df.show()

+----------+-------------+-------------------+--------------------+---------------------+
|      DATE|        TITLE|         VOTER_NAME|              splits|first_and_middle_name|
+----------+-------------+-------------------+--------------------+---------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|[Jennifer, S., Ga...|          Jennifer S.|
|02/08/2017|Councilmember| Philip T. Kingston|[Philip, T., King...|            Philip T.|
|02/08/2017|        Mayor|Michael S. Rawlings|[Michael, S., Raw...|           Michael S.|
|02/08/2017|Councilmember|       Adam Medrano|     [Adam, Medrano]|                 Adam|
|02/08/2017|Councilmember|       Casey Thomas|     [Casey, Thomas]|                Casey|
|02/08/2017|Councilmember|Carolyn King Arnold|[Carolyn, King, A...|         Carolyn King|
|02/08/2017|Councilmember|       Scott Griggs|     [Scott, Griggs]|                Scott|
|02/08/2017|Councilmember|   B. Adam  McGough| [B., Adam, McGough]|              B. Adam|
|02/08/201

### Partitioning and lazy processing

Transformations are lazy. It defines what should be done to a DataFrame rather than actually doing it. They are executed when an action is run.

Unique ID fields can create some undue bottlenecks due to distributed platform such as Spark. Spark can create unique ID's but they are not sequential.

### Adding an ID Field

In [107]:
df = spark.read.csv("DallasCouncilVotes.csv", header = True)

voter_df = df.select("VOTER NAME").distinct()
print("\nThere are %d rows in the voter_df DataFrame.\n" % voter_df.count())

voter_df = voter_df.withColumn("ROW_ID", F.monotonically_increasing_id())
voter_df.orderBy(voter_df.ROW_ID.desc()).show()


There are 36 rows in the voter_df DataFrame.

+--------------------+-------------+
|          VOTER NAME|       ROW_ID|
+--------------------+-------------+
|        Lee Kleinman|1709396983808|
|  the  final  201...|1700807049217|
|         Erik Wilson|1700807049216|
|  the  final   20...|1683627180032|
| Carolyn King Arnold|1632087572480|
| Rickey D.  Callahan|1597727834112|
|   the   final  2...|1443109011456|
|    Monica R. Alonzo|1382979469312|
|     Lee M. Kleinman|1228360646656|
|   Jennifer S. Gates|1194000908288|
| Philip T.  Kingston|1185410973696|
|   Dwaine R. Caraway|1142461300736|
|  Rickey D. Callahan|1125281431553|
|  Philip T. Kingston|1125281431552|
|        Omar Narvaez|1116691496960|
|    B. Adam  McGough|1056561954816|
|  the  final  201...|1047972020224|
|   Tiffinni A. Young|1005022347264|
|  Jennifer S.  Gates| 962072674304|
|        Mark Clayton| 944892805120|
+--------------------+-------------+
only showing top 20 rows



### IDs with different partitions

In [118]:
df = spark.read.csv("DallasCouncilVotes.csv", header = True)

voter_df = df.select("VOTER NAME").distinct()
voter_df_single = df.select("VOTER NAME").distinct().repartition(1)

print("\nThere are %d partitions in the voter_df DataFrame.\n" % voter_df.rdd.getNumPartitions())
print("\nThere are %d partitions in the voter_df DataFrame.\n" % voter_df_single.rdd.getNumPartitions())

voter_df = voter_df.withColumn("ROW_ID", F.monotonically_increasing_id())
voter_df_single = voter_df_single.withColumn("ROW_ID", F.monotonically_increasing_id())

voter_df.orderBy(voter_df.ROW_ID.desc()).show(10)
voter_df_single.orderBy(voter_df_single.ROW_ID.desc()).show(10)


There are 200 partitions in the voter_df DataFrame.


There are 1 partitions in the voter_df DataFrame.

+--------------------+-------------+
|          VOTER NAME|       ROW_ID|
+--------------------+-------------+
|        Lee Kleinman|1709396983808|
|  the  final  201...|1700807049217|
|         Erik Wilson|1700807049216|
|  the  final   20...|1683627180032|
| Carolyn King Arnold|1632087572480|
| Rickey D.  Callahan|1597727834112|
|   the   final  2...|1443109011456|
|    Monica R. Alonzo|1382979469312|
|     Lee M. Kleinman|1228360646656|
|   Jennifer S. Gates|1194000908288|
+--------------------+-------------+
only showing top 10 rows

+--------------------+------+
|          VOTER NAME|ROW_ID|
+--------------------+------+
|        Lee Kleinman|    35|
|  the  final  201...|    34|
|         Erik Wilson|    33|
|  the  final   20...|    32|
| Carolyn King Arnold|    31|
| Rickey D.  Callahan|    30|
|   the   final  2...|    29|
|    Monica R. Alonzo|    28|
|     Lee M. Kleinma

### More ID tricks

In [130]:
voter_df_march = spark.read.csv("DallasCouncilVoters.csv", header = True)
voter_df_march = voter_df_march.select("VOTER_NAME").distinct()
voter_df_march = voter_df_march.withColumn("ROW_ID", F.monotonically_increasing_id())

previous_max_ID = voter_df_march.select("ROW_ID").rdd.max()[0]

voter_df_april = spark.read.csv("DallasCouncilVoters.csv", header = True)
voter_df_april = voter_df_april.select("VOTER_NAME").distinct()
voter_df_april = voter_df_april.withColumn("ROW_ID", F.monotonically_increasing_id() + previous_max_ID)

voter_df_march.select('ROW_ID').show()
voter_df_april.select('ROW_ID').show()

+-------------+
|       ROW_ID|
+-------------+
|   8589934592|
|  25769803776|
|  34359738368|
|  42949672960|
|  51539607552|
| 103079215104|
| 111669149696|
| 231928233984|
| 240518168576|
| 352187318272|
| 360777252864|
| 395136991232|
| 601295421440|
| 670014898176|
| 824633720832|
| 850403524608|
| 944892805120|
| 962072674304|
|1005022347264|
|1047972020224|
+-------------+
only showing top 20 rows

+-------------+
|       ROW_ID|
+-------------+
|1717986918400|
|1735166787584|
|1743756722176|
|1752346656768|
|1760936591360|
|1812476198912|
|1821066133504|
|1941325217792|
|1949915152384|
|2061584302080|
|2070174236672|
|2104533975040|
|2310692405248|
|2379411881984|
|2534030704640|
|2559800508416|
|2654289788928|
|2671469658112|
|2714419331072|
|2757369004032|
+-------------+
only showing top 20 rows

