In [1]:
import numpy as np 
import pandas as pd 
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [2]:
# Create the SparkSession for this notebook, or reuse one that already exists
# (getOrCreate), running against the "local[*]" master under the app name below.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ShortNSimple")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

Data source: https://www.kaggle.com/danofer/zipcodes-county-fips-crosswalk

In [3]:
# Load the ZIP -> county crosswalk through pandas, then hand it to Spark.
# ZIP is an identifier, not a number: force it to be read as text so pandas'
# integer inference cannot strip leading zeros that may be present in the file
# (e.g. "00501" would otherwise become 501 and corrupt the keys built later).
zip_county_pdf = pd.read_csv(
    "datasets/ZIP-COUNTY-FIPS_2017-06.csv",
    dtype={"ZIP": str},
)

# Keep only ZIP, COUNTYNAME and STATE; the FIPS/class columns are not used below.
data = spark.createDataFrame(zip_county_pdf).drop("STCOUNTYFP", "CLASSFP")
data.show(5, False)

+-----+--------------+-----+
|ZIP  |COUNTYNAME    |STATE|
+-----+--------------+-----+
|36003|Autauga County|AL   |
|36006|Autauga County|AL   |
|36067|Autauga County|AL   |
|36066|Autauga County|AL   |
|36703|Autauga County|AL   |
+-----+--------------+-----+
only showing top 5 rows



In [4]:
# Strip every run of whitespace out of COUNTYNAME, overwriting the column
# (e.g. "Autauga County" -> "AutaugaCounty").
data = data.withColumn("COUNTYNAME", F.regexp_replace("COUNTYNAME", r'\s+', ''))

# Preview the first 10 rows without truncating cell contents.
data.show(n=10, truncate=False)

+-----+-------------+-----+
|ZIP  |COUNTYNAME   |STATE|
+-----+-------------+-----+
|36003|AutaugaCounty|AL   |
|36006|AutaugaCounty|AL   |
|36067|AutaugaCounty|AL   |
|36066|AutaugaCounty|AL   |
|36703|AutaugaCounty|AL   |
|36701|AutaugaCounty|AL   |
|36091|AutaugaCounty|AL   |
|36051|AutaugaCounty|AL   |
|36068|AutaugaCounty|AL   |
|36008|AutaugaCounty|AL   |
+-----+-------------+-----+
only showing top 10 rows



## concat

In [5]:
# Build a key by gluing ZIP, COUNTYNAME and STATE together with no separator
# (e.g. "36003AutaugaCountyAL").
concat_key_columns = ["ZIP", "COUNTYNAME", "STATE"]
data = data.withColumn("concat_primary_key", F.concat(*concat_key_columns))

# Preview the first 5 rows without truncating cell contents.
data.show(n=5, truncate=False)

+-----+-------------+-----+--------------------+
|ZIP  |COUNTYNAME   |STATE|concat_primary_key  |
+-----+-------------+-----+--------------------+
|36003|AutaugaCounty|AL   |36003AutaugaCountyAL|
|36006|AutaugaCounty|AL   |36006AutaugaCountyAL|
|36067|AutaugaCounty|AL   |36067AutaugaCountyAL|
|36066|AutaugaCounty|AL   |36066AutaugaCountyAL|
|36703|AutaugaCounty|AL   |36703AutaugaCountyAL|
+-----+-------------+-----+--------------------+
only showing top 5 rows



## concat_ws

In [6]:
# Build the same key with concat_ws, which joins the columns using "_" as the
# separator (e.g. "36003_AutaugaCounty_AL").
ws_key_separator = "_"
ws_key_columns = ["ZIP", "COUNTYNAME", "STATE"]
data = data.withColumn(
    "concat_ws_primary_key",
    F.concat_ws(ws_key_separator, *ws_key_columns),
)

# Preview the first 5 rows without truncating cell contents.
data.show(n=5, truncate=False)

+-----+-------------+-----+--------------------+----------------------+
|ZIP  |COUNTYNAME   |STATE|concat_primary_key  |concat_ws_primary_key |
+-----+-------------+-----+--------------------+----------------------+
|36003|AutaugaCounty|AL   |36003AutaugaCountyAL|36003_AutaugaCounty_AL|
|36006|AutaugaCounty|AL   |36006AutaugaCountyAL|36006_AutaugaCounty_AL|
|36067|AutaugaCounty|AL   |36067AutaugaCountyAL|36067_AutaugaCounty_AL|
|36066|AutaugaCounty|AL   |36066AutaugaCountyAL|36066_AutaugaCounty_AL|
|36703|AutaugaCounty|AL   |36703AutaugaCountyAL|36703_AutaugaCounty_AL|
+-----+-------------+-----+--------------------+----------------------+
only showing top 5 rows

