In [1]:
import datetime as dt
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.types as t
from pyspark.sql.functions import udf, col, when, isnan, count, upper, first, date_trunc
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType

In [2]:
def create_spark_session():
    
    """
    Get or Create a Spark Session
    
    return: Spark Session
    """
    
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11") \
        .enableHiveSupport().getOrCreate()
    return spark

spark = create_spark_session()

In [3]:
spark

In [4]:
addr_lookup_df = spark.read.csv("lookup/addr.csv", header='true', inferSchema='true')
addr_lookup_df.printSchema()

root
 |-- i94addr: string (nullable = true)
 |-- state: string (nullable = true)



In [5]:
addr_lookup_df.show(5)

+-------+----------+
|i94addr|     state|
+-------+----------+
|     AL|   ALABAMA|
|     AK|    ALASKA|
|     AZ|   ARIZONA|
|     AR|  ARKANSAS|
|     CA|CALIFORNIA|
+-------+----------+
only showing top 5 rows



In [6]:
visa_lookup_df = spark.read.csv("lookup/visa.csv", header='true', inferSchema='true')
visa_lookup_df.printSchema()

root
 |-- i94visa: integer (nullable = true)
 |-- visa: string (nullable = true)



In [7]:
visa_lookup_df.show()

+-------+---------+
|i94visa|     visa|
+-------+---------+
|      1| Business|
|      2| Pleasure|
|      3|  Student|
+-------+---------+



In [8]:
mode_lookup_df = spark.read.csv("lookup/mode.csv", header='true', inferSchema='true')
mode_lookup_df.printSchema()

root
 |-- i94mode: integer (nullable = true)
 |-- mode: string (nullable = true)



In [9]:
mode_lookup_df.show()

+-------+------------+
|i94mode|        mode|
+-------+------------+
|      1|         Air|
|      2|         Sea|
|      3|        Land|
|      9|Not reported|
+-------+------------+



In [10]:
port_lookup_df = spark.read.csv("lookup/port.csv", header='true', inferSchema='true')
port_lookup_df.printSchema()

root
 |-- i94port: string (nullable = true)
 |-- portname: string (nullable = true)
 |-- country: string (nullable = true)
 |-- addr: string (nullable = true)



In [11]:
port_lookup_df.show(5)

+-------+--------------------+-------------+----+
|i94port|            portname|      country|addr|
+-------+--------------------+-------------+----+
|    ALC|               ALCAN|UNITED STATES|  AK|
|    ANC|           ANCHORAGE|UNITED STATES|  AK|
|    BAR|BAKERAAF-BAKERISLAND|UNITED STATES|  AK|
|    DAC|        DALTONSCACHE|UNITED STATES|  AK|
|    PIZ|  DEWSTATIONPTLAYDEW|UNITED STATES|  AK|
+-------+--------------------+-------------+----+
only showing top 5 rows



In [12]:
country_lookup_df = spark.read.csv("lookup/country.csv", header='true', inferSchema='true')
country_lookup_df.printSchema()

root
 |-- i94country: integer (nullable = true)
 |-- country: string (nullable = true)



In [13]:
country_lookup_df.show(5)

+----------+-----------+
|i94country|    country|
+----------+-----------+
|       582|     MEXICO|
|       236|AFGHANISTAN|
|       101|    ALBANIA|
|       316|    ALGERIA|
|       102|    ANDORRA|
+----------+-----------+
only showing top 5 rows



In [14]:
# Process cities demographics file
cities_df = spark.read.csv("data/us-cities-demographics.csv", sep=';', header=True)

# Creating 'cities_pivot_df' dataset
cities_pivot_df = cities_df.select("city","state code","Race","count") \
                                  .groupby(cities_df.City, "state code") \
                                  .pivot("Race") \
                                  .agg(first("Count"))

drop_cols_list = ["Number of Veterans","Race","Count"]

# Drop columns we don't need and drop duplicate rows
cities_final_df = cities_df.drop(*drop_cols_list).dropDuplicates()

# Finally saving (committing) joined cities_final_df dataset
cities_final_df = cities_final_df.join(cities_pivot_df, ["City","state code"])

# Change `state code` column name to `state_code` and other similar problems to avoid parquet complications
cities_final_df = cities_final_df.withColumnRenamed("State Code", "State_Code") \
                                     .withColumnRenamed("Median Age", "Median_Age") \
                                     .withColumnRenamed("Male Population", "Male_Population") \
                                     .withColumnRenamed("Female Population", "Female_Population") \
                                     .withColumnRenamed("Total Population", "Total_Population") \
                                     .withColumnRenamed("Foreign-born", "Foreign_born") \
                                     .withColumnRenamed("Average Household Size", "Avg_Household_Size") \
                                     .withColumnRenamed("American Indian and Alaska Native", "Native_Population") \
                                     .withColumnRenamed("Asian", "Asian_Population") \
                                     .withColumnRenamed("Black or African-American", "African_American_Population") \
                                     .withColumnRenamed("Hispanic or Latino", "Hispanic_Population") \
                                     .withColumnRenamed("White", "White_population")

In [15]:
cities_final_df.printSchema()

root
 |-- City: string (nullable = true)
 |-- State_Code: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Median_Age: string (nullable = true)
 |-- Male_Population: string (nullable = true)
 |-- Female_Population: string (nullable = true)
 |-- Total_Population: string (nullable = true)
 |-- Foreign_born: string (nullable = true)
 |-- Avg_Household_Size: string (nullable = true)
 |-- Native_Population: string (nullable = true)
 |-- Asian_Population: string (nullable = true)
 |-- African_American_Population: string (nullable = true)
 |-- Hispanic_Population: string (nullable = true)
 |-- White_population: string (nullable = true)



In [16]:
cities_final_df.show(5)

+---------------+----------+-----------+----------+---------------+-----------------+----------------+------------+------------------+-----------------+----------------+---------------------------+-------------------+----------------+
|           City|State_Code|      State|Median_Age|Male_Population|Female_Population|Total_Population|Foreign_born|Avg_Household_Size|Native_Population|Asian_Population|African_American_Population|Hispanic_Population|White_population|
+---------------+----------+-----------+----------+---------------+-----------------+----------------+------------+------------------+-----------------+----------------+---------------------------+-------------------+----------------+
|Highlands Ranch|        CO|   Colorado|      39.6|          49186|            53281|          102467|        8827|              2.72|             1480|            5650|                       1779|               8393|           94499|
|           Kent|        WA| Washington|      33.4|         

In [17]:
cities_final_df.write.mode('overwrite').parquet("output/us_cities_demographics") 

In [18]:
cities_outputdf = spark.read.parquet("output/us_cities_demographics")
cities_outputdf.printSchema()

root
 |-- City: string (nullable = true)
 |-- State_Code: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Median_Age: string (nullable = true)
 |-- Male_Population: string (nullable = true)
 |-- Female_Population: string (nullable = true)
 |-- Total_Population: string (nullable = true)
 |-- Foreign_born: string (nullable = true)
 |-- Avg_Household_Size: string (nullable = true)
 |-- Native_Population: string (nullable = true)
 |-- Asian_Population: string (nullable = true)
 |-- African_American_Population: string (nullable = true)
 |-- Hispanic_Population: string (nullable = true)
 |-- White_population: string (nullable = true)



In [19]:
# Read i94 non-immigration dataset
immigration_df=spark.read.parquet("data/sas_data") \
            .select("cicid","i94yr","i94mon","i94cit","i94res","i94port","i94mode","i94addr","i94bir","i94visa","biryear","gender","visatype","arrdate","depdate")
immigration_df.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- biryear: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- visatype: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- depdate: double (nullable = true)



In [20]:
immigration_df.count()

879748

In [21]:
immigration_fmt_df = immigration_df.withColumn("cicid", immigration_df.cicid.cast(IntegerType())) \
                                   .withColumn("i94res", immigration_df.i94res.cast(IntegerType())) \
                                   .withColumn("i94yr", immigration_df.i94yr.cast(IntegerType())) \
                                   .withColumn("i94mon", immigration_df.i94mon.cast(IntegerType())) \
                                   .withColumn("i94mode", immigration_df.i94mode.cast(IntegerType())) \
                                   .withColumn("i94bir", immigration_df.i94bir.cast(IntegerType())) \
                                   .withColumn("i94visa", immigration_df.i94visa.cast(IntegerType())) \
                                   .withColumn("biryear", immigration_df.biryear.cast(IntegerType())) \
                                   .withColumn("i94cit", immigration_df.i94cit.cast(IntegerType())) \
                                   .withColumn("arrdate", immigration_df.arrdate.cast(IntegerType())) \
                                   .withColumn("depdate", immigration_df.depdate.cast(IntegerType())) \
                                   .withColumnRenamed("i94res", "i94country")

In [22]:
immigration_fmt_df.show(5)

+------+-----+------+------+----------+-------+-------+-------+------+-------+-------+------+--------+-------+-------+
| cicid|i94yr|i94mon|i94cit|i94country|i94port|i94mode|i94addr|i94bir|i94visa|biryear|gender|visatype|arrdate|depdate|
+------+-----+------+------+----------+-------+-------+-------+------+-------+-------+------+--------+-------+-------+
|459651| 2016|     4|   135|       135|    ATL|      1|     FL|    54|      2|   1962|  null|      WT|  20547|  20559|
|459652| 2016|     4|   135|       135|    ATL|      1|     FL|    74|      2|   1942|     F|      WT|  20547|  20555|
|459653| 2016|     4|   135|       135|    ATL|      1|     FL|    44|      2|   1972|     M|      B2|  20547|  20557|
|459654| 2016|     4|   135|       135|    ATL|      1|      G|    38|      2|   1978|  null|      WT|  20547|  20555|
|459655| 2016|     4|   135|       135|    ATL|      1|     GA|    64|      2|   1952|     F|      WT|  20547|   null|
+------+-----+------+------+----------+-------+-

In [23]:
immigration_fmt_df.count()

879748

In [24]:
immigration_ctry_df = immigration_fmt_df.join(country_lookup_df, "i94country", 'left_outer') \
                                        .drop("i94country") \
                                        .withColumnRenamed("country", "res_cnty")
immigration_ctry_df.printSchema()

root
 |-- cicid: integer (nullable = true)
 |-- i94yr: integer (nullable = true)
 |-- i94mon: integer (nullable = true)
 |-- i94cit: integer (nullable = true)
 |-- i94port: string (nullable = true)
 |-- i94mode: integer (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- i94bir: integer (nullable = true)
 |-- i94visa: integer (nullable = true)
 |-- biryear: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- visatype: string (nullable = true)
 |-- arrdate: integer (nullable = true)
 |-- depdate: integer (nullable = true)
 |-- res_cnty: string (nullable = true)



In [25]:
immigration_ctry_df.count()

879748

In [26]:
immigration_ctry_df.show(5)

+------+-----+------+------+-------+-------+-------+------+-------+-------+------+--------+-------+-------+--------------+
| cicid|i94yr|i94mon|i94cit|i94port|i94mode|i94addr|i94bir|i94visa|biryear|gender|visatype|arrdate|depdate|      res_cnty|
+------+-----+------+------+-------+-------+-------+------+-------+-------+------+--------+-------+-------+--------------+
|459651| 2016|     4|   135|    ATL|      1|     FL|    54|      2|   1962|  null|      WT|  20547|  20559|UNITED KINGDOM|
|459652| 2016|     4|   135|    ATL|      1|     FL|    74|      2|   1942|     F|      WT|  20547|  20555|UNITED KINGDOM|
|459653| 2016|     4|   135|    ATL|      1|     FL|    44|      2|   1972|     M|      B2|  20547|  20557|UNITED KINGDOM|
|459654| 2016|     4|   135|    ATL|      1|      G|    38|      2|   1978|  null|      WT|  20547|  20555|UNITED KINGDOM|
|459655| 2016|     4|   135|    ATL|      1|     GA|    64|      2|   1952|     F|      WT|  20547|   null|UNITED KINGDOM|
+------+-----+--

In [27]:
immigration_visa_df = immigration_ctry_df.join(visa_lookup_df, 'i94visa', 'left_outer').drop("i94visa")

In [28]:
immigration_visa_df.show(5)

+------+-----+------+------+-------+-------+-------+------+-------+------+--------+-------+-------+--------------+---------+
| cicid|i94yr|i94mon|i94cit|i94port|i94mode|i94addr|i94bir|biryear|gender|visatype|arrdate|depdate|      res_cnty|     visa|
+------+-----+------+------+-------+-------+-------+------+-------+------+--------+-------+-------+--------------+---------+
|459651| 2016|     4|   135|    ATL|      1|     FL|    54|   1962|  null|      WT|  20547|  20559|UNITED KINGDOM| Pleasure|
|459652| 2016|     4|   135|    ATL|      1|     FL|    74|   1942|     F|      WT|  20547|  20555|UNITED KINGDOM| Pleasure|
|459653| 2016|     4|   135|    ATL|      1|     FL|    44|   1972|     M|      B2|  20547|  20557|UNITED KINGDOM| Pleasure|
|459654| 2016|     4|   135|    ATL|      1|      G|    38|   1978|  null|      WT|  20547|  20555|UNITED KINGDOM| Pleasure|
|459655| 2016|     4|   135|    ATL|      1|     GA|    64|   1952|     F|      WT|  20547|   null|UNITED KINGDOM| Pleasure|


In [29]:
immigration_mod_df = immigration_visa_df.join(mode_lookup_df, 'i94mode', 'left_outer').drop("i94mode")

In [30]:
immigration_mod_df.show(5)

+------+-----+------+------+-------+-------+------+-------+------+--------+-------+-------+--------------+---------+----+
| cicid|i94yr|i94mon|i94cit|i94port|i94addr|i94bir|biryear|gender|visatype|arrdate|depdate|      res_cnty|     visa|mode|
+------+-----+------+------+-------+-------+------+-------+------+--------+-------+-------+--------------+---------+----+
|459651| 2016|     4|   135|    ATL|     FL|    54|   1962|  null|      WT|  20547|  20559|UNITED KINGDOM| Pleasure| Air|
|459652| 2016|     4|   135|    ATL|     FL|    74|   1942|     F|      WT|  20547|  20555|UNITED KINGDOM| Pleasure| Air|
|459653| 2016|     4|   135|    ATL|     FL|    44|   1972|     M|      B2|  20547|  20557|UNITED KINGDOM| Pleasure| Air|
|459654| 2016|     4|   135|    ATL|      G|    38|   1978|  null|      WT|  20547|  20555|UNITED KINGDOM| Pleasure| Air|
|459655| 2016|     4|   135|    ATL|     GA|    64|   1952|     F|      WT|  20547|   null|UNITED KINGDOM| Pleasure| Air|
+------+-----+------+---

In [31]:
immigration_addr_df = immigration_mod_df.join(addr_lookup_df, 'i94addr', 'left_outer').drop("i94addr")

In [32]:
immigration_addr_df.show(5)

+------+-----+------+------+-------+------+-------+------+--------+-------+-------+--------------+---------+----+-------+
| cicid|i94yr|i94mon|i94cit|i94port|i94bir|biryear|gender|visatype|arrdate|depdate|      res_cnty|     visa|mode|  state|
+------+-----+------+------+-------+------+-------+------+--------+-------+-------+--------------+---------+----+-------+
|459651| 2016|     4|   135|    ATL|    54|   1962|  null|      WT|  20547|  20559|UNITED KINGDOM| Pleasure| Air|FLORIDA|
|459652| 2016|     4|   135|    ATL|    74|   1942|     F|      WT|  20547|  20555|UNITED KINGDOM| Pleasure| Air|FLORIDA|
|459653| 2016|     4|   135|    ATL|    44|   1972|     M|      B2|  20547|  20557|UNITED KINGDOM| Pleasure| Air|FLORIDA|
|459654| 2016|     4|   135|    ATL|    38|   1978|  null|      WT|  20547|  20555|UNITED KINGDOM| Pleasure| Air|   null|
|459655| 2016|     4|   135|    ATL|    64|   1952|     F|      WT|  20547|   null|UNITED KINGDOM| Pleasure| Air|GEORGIA|
+------+-----+------+---

In [33]:
immigration_addr_df.count()

879748

In [34]:
# Add i94port city and state columns to immegration dataframe
immigration_port_df=immigration_addr_df.join(port_lookup_df, 'i94port', 'left') \
                                       .drop("i94port","addr") \
                                       .withColumnRenamed("country", "port_cnty")
immigration_port_df.printSchema()

root
 |-- cicid: integer (nullable = true)
 |-- i94yr: integer (nullable = true)
 |-- i94mon: integer (nullable = true)
 |-- i94cit: integer (nullable = true)
 |-- i94bir: integer (nullable = true)
 |-- biryear: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- visatype: string (nullable = true)
 |-- arrdate: integer (nullable = true)
 |-- depdate: integer (nullable = true)
 |-- res_cnty: string (nullable = true)
 |-- visa: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- state: string (nullable = true)
 |-- portname: string (nullable = true)
 |-- port_cnty: string (nullable = true)



In [35]:
immigration_port_df.count()

879748

In [36]:
immigration_port_df.show(5)

+------+-----+------+------+------+-------+------+--------+-------+-------+--------------+---------+----+-------+--------+-------------+
| cicid|i94yr|i94mon|i94cit|i94bir|biryear|gender|visatype|arrdate|depdate|      res_cnty|     visa|mode|  state|portname|    port_cnty|
+------+-----+------+------+------+-------+------+--------+-------+-------+--------------+---------+----+-------+--------+-------------+
|459651| 2016|     4|   135|    54|   1962|  null|      WT|  20547|  20559|UNITED KINGDOM| Pleasure| Air|FLORIDA| ATLANTA|UNITED STATES|
|459652| 2016|     4|   135|    74|   1942|     F|      WT|  20547|  20555|UNITED KINGDOM| Pleasure| Air|FLORIDA| ATLANTA|UNITED STATES|
|459653| 2016|     4|   135|    44|   1972|     M|      B2|  20547|  20557|UNITED KINGDOM| Pleasure| Air|FLORIDA| ATLANTA|UNITED STATES|
|459654| 2016|     4|   135|    38|   1978|  null|      WT|  20547|  20555|UNITED KINGDOM| Pleasure| Air|   null| ATLANTA|UNITED STATES|
|459655| 2016|     4|   135|    64|   195

In [37]:
get_date = udf(lambda x: (dt.datetime(1960, 1, 1).date() + dt.timedelta(x)).isoformat() if x else None)
immigrant_date_df = immigration_port_df.withColumn("arrival", get_date(immigration_port_df.arrdate)) \
                                       .withColumn("departure", get_date(immigration_port_df.depdate)) 

In [38]:
immigrant_date_df.count()

879748

In [39]:
immigrant_date_df.write.mode("overwrite").partitionBy("i94yr", "i94mon").parquet('output/immigrant_fact_table')

In [40]:
immigrant_date_df.select("arrdate","depdate","arrival","departure").show(5)

+-------+-------+----------+----------+
|arrdate|depdate|   arrival| departure|
+-------+-------+----------+----------+
|  20547|  20559|2016-04-03|2016-04-15|
|  20547|  20555|2016-04-03|2016-04-11|
|  20547|  20557|2016-04-03|2016-04-13|
|  20547|  20555|2016-04-03|2016-04-11|
|  20547|   null|2016-04-03|      null|
+-------+-------+----------+----------+
only showing top 5 rows



In [41]:
arrival_date = immigrant_date_df.select(col('arrdate').alias('arrival_sasdate'),
                                       col('arrival').alias('arrival_date'),
                                       date_format('arrival','M').alias('arrival_month'),
                                       date_format('arrival','E').alias('arrival_dayofweek'), 
                                       date_format('arrival', 'y').alias('arrival_year'), 
                                       date_format('arrival', 'd').alias('arrival_day'),
                                      date_format('arrival','w').alias('arrival_weekofyear')).dropDuplicates()

In [42]:
arrival_date.show(5)

+---------------+------------+-------------+-----------------+------------+-----------+------------------+
|arrival_sasdate|arrival_date|arrival_month|arrival_dayofweek|arrival_year|arrival_day|arrival_weekofyear|
+---------------+------------+-------------+-----------------+------------+-----------+------------------+
|          20553|  2016-04-09|            4|              Sat|        2016|          9|                15|
|          20550|  2016-04-06|            4|              Wed|        2016|          6|                15|
|          20554|  2016-04-10|            4|              Sun|        2016|         10|                16|
|          20546|  2016-04-02|            4|              Sat|        2016|          2|                14|
|          20551|  2016-04-07|            4|              Thu|        2016|          7|                15|
+---------------+------------+-------------+-----------------+------------+-----------+------------------+
only showing top 5 rows



In [43]:
arrival_date.createOrReplaceTempView("arrival_date")
arrival_season = spark.sql('''select cast(arrival_sasdate as int),
                             arrival_date,
                             arrival_month,
                             arrival_dayofweek,
                             arrival_year,
                             arrival_day,
                             arrival_weekofyear,
                             CASE WHEN arrival_month IN (12, 1, 2) THEN 'winter' 
                                    WHEN arrival_month IN (3, 4, 5) THEN 'spring' 
                                    WHEN arrival_month IN (6, 7, 8) THEN 'summer' 
                                    ELSE 'autumn' 
                             END AS date_season from arrival_date''')

In [44]:
arrival_season.write.mode("overwrite").partitionBy("arrival_year", "arrival_month").parquet('output/arrival_season')

In [45]:
arrival_output_df = spark.read.parquet("output/arrival_season")

In [46]:
arrival_output_df.show(5)

+---------------+------------+-----------------+-----------+------------------+-----------+------------+-------------+
|arrival_sasdate|arrival_date|arrival_dayofweek|arrival_day|arrival_weekofyear|date_season|arrival_year|arrival_month|
+---------------+------------+-----------------+-----------+------------------+-----------+------------+-------------+
|          20554|  2016-04-10|              Sun|         10|                16|     spring|        2016|            4|
|          20573|  2016-04-29|              Fri|         29|                18|     spring|        2016|            4|
|          20551|  2016-04-07|              Thu|          7|                15|     spring|        2016|            4|
|          20549|  2016-04-05|              Tue|          5|                15|     spring|        2016|            4|
|          20547|  2016-04-03|              Sun|          3|                15|     spring|        2016|            4|
+---------------+------------+-----------------+

In [47]:
immigrant_fact_df = spark.read.parquet("output/immigrant_fact_table")

In [48]:
immigrant_fact_df.show(5)

+------+------+------+-------+------+--------+-------+-------+--------------+---------+----+-------+--------+-------------+----------+----------+-----+------+
| cicid|i94cit|i94bir|biryear|gender|visatype|arrdate|depdate|      res_cnty|     visa|mode|  state|portname|    port_cnty|   arrival| departure|i94yr|i94mon|
+------+------+------+-------+------+--------+-------+-------+--------------+---------+----+-------+--------+-------------+----------+----------+-----+------+
|459651|   135|    54|   1962|  null|      WT|  20547|  20559|UNITED KINGDOM| Pleasure| Air|FLORIDA| ATLANTA|UNITED STATES|2016-04-03|2016-04-15| 2016|     4|
|459652|   135|    74|   1942|     F|      WT|  20547|  20555|UNITED KINGDOM| Pleasure| Air|FLORIDA| ATLANTA|UNITED STATES|2016-04-03|2016-04-11| 2016|     4|
|459653|   135|    44|   1972|     M|      B2|  20547|  20557|UNITED KINGDOM| Pleasure| Air|FLORIDA| ATLANTA|UNITED STATES|2016-04-03|2016-04-13| 2016|     4|
|459654|   135|    38|   1978|  null|      WT|

In [49]:
cities_out_df = spark.read.parquet("output/us_cities_demographics")

In [50]:
cities_out_df.show(5)

+-----------+----------+-----------+----------+---------------+-----------------+----------------+------------+------------------+-----------------+----------------+---------------------------+-------------------+----------------+
|       City|State_Code|      State|Median_Age|Male_Population|Female_Population|Total_Population|Foreign_born|Avg_Household_Size|Native_Population|Asian_Population|African_American_Population|Hispanic_Population|White_population|
+-----------+----------+-----------+----------+---------------+-----------------+----------------+------------+------------------+-----------------+----------------+---------------------------+-------------------+----------------+
|     Newark|        NJ| New Jersey|      34.6|         138040|           143873|          281913|       86253|              2.73|             2268|            7349|                     144961|             100432|           76402|
|Gainesville|        FL|    Florida|      26.0|          60803|            6

# Data Quality checks

Checking all cities details are loaded into fact table

In [52]:
cities_df.select("City").distinct().count()

567

In [55]:
cities_out_df.select("city").distinct().count()

567

In [53]:
immigration_df.count()

879748

In [54]:
immigrant_fact_df.count()

879748