In [37]:
import os
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from itertools import chain
from pyspark.sql.functions import udf

In [38]:
DATAFOLDER = '/Users/christian/Data/udacity_capstone/'
OUTPUTFOLDER = '/Users/christian/Data/udacity_capstone/output'

In [39]:
def get_immi_schema():
    """
    Map the column names to datatypes and return as a schema.
    """
    immi_schema = T.StructType([
        T.StructField('cicid', T.IntegerType()),
        T.StructField('i94yr', T.StringType()),
        T.StructField('i94mon', T.StringType()),
        T.StructField('i94cit', T.StringType()),
        T.StructField('i94res', T.IntegerType()),
        T.StructField('i94port', T.StringType()),
        T.StructField('arrdate', T.StringType()),
        T.StructField('i94mode', T.StringType()),
        T.StructField('i94addr', T.StringType()),
        T.StructField('depdate', T.StringType()),
        T.StructField('i94bir', T.StringType()),
        T.StructField('i94visa', T.StringType()),
        T.StructField('count',  T.IntegerType()),
        T.StructField('dtadfile', T.StringType()),
        T.StructField('visapost', T.IntegerType()),
        T.StructField('occup', T.StringType()),
        T.StructField('entdepa', T.StringType()),
        T.StructField('entdepd', T.IntegerType()),
        T.StructField('entdepu', T.StringType()),
        T.StructField('matflag', T.StringType()),
        T.StructField('biryear', T.StringType()),
        T.StructField('dtaddto', T.StringType()),
        T.StructField('gender', T.StringType()),
        T.StructField('insnum', T.StringType()),
        T.StructField('airline', T.StringType()),
        T.StructField('admnum', T.StringType()),
        T.StructField('fltno', T.StringType()),
        T.StructField('visatype', T.StringType()),
    ])
    return immi_schema


In [40]:
schema = get_immi_schema()


In [41]:

spark = SparkSession.builder.\
config("spark.jars.packages","saurfang:spark-sas7bdat:2.0.0-s_2.11")\
.enableHiveSupport().getOrCreate()
#df_spark = spark.read.format('com.github.saurfang.sas.spark').load('../../data/18-83510-I94-Data-2016/i94_apr16_sub.sas7bdat')
df_spark = spark.read.option("mergeSchema", "true").parquet(os.path.join(DATAFOLDER, 'sas_data'))

In [76]:
count_missings(df_spark)

Unnamed: 0,count
depdate,103422
i94mode,102
i94bir,19
biryear,19
cicid,0
i94yr,0
i94mon,0
i94cit,0
i94res,0
arrdate,0


In [42]:
df_spark.show()

+--------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|   cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|         admnum|fltno|visatype|
+--------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|459651.0|2016.0|   4.0| 135.0| 135.0|    ATL|20547.0|    1.0|     FL|20559.0|  54.0|    2.0|  1.0|20160403|    null| null|      O|      R|   null|      M| 1962.0|07012016|  null|  null|     VS|5.5556253633E10|00115|      WT|
|459652.0|2016.0|   4.0| 135.0| 135.0|    ATL|20547.0|    1.0|     FL|20555.0|  74.0|    2.0|  1

# Check uniqueness of admission number

In [10]:
df_spark.count()

2641028

In [26]:
df_spark.select('cicid').distinct().count()

2641028

In [12]:
df_spark.drop_duplicates().count()

2641028

In [13]:
df_spark = df_spark.drop_duplicates()

In [15]:
df_spark.count()

2641028

In [14]:
df_spark.count() == df_spark.select('admnum').distinct().count()

False

In [16]:
df_spark.select('admnum').distinct().count()

2637526

In [23]:
duplicate_admnum = df_spark.groupBy('admnum').count().filter('count > 2').orderBy(F.desc('count'))

In [24]:
duplicate_admnum.first()['admnum']

89977239030.0

In [25]:
df_spark.filter('admnum == 89977239030').show()

+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
|    cicid| i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|
+---------+------+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+
| 155049.0|2016.0|   4.0| 582.0| 582.0|    BRO|20545.0|    1.0|     TX|20552.0|  32.0|    1.0|  1.0|20160401|     MER| null|      H|      R|   null|      M| 1984.0|09062016|     M|  null|    *GA|8.997723903E10|N934C|      B1|
|3241942.0|2016.0|   4.0| 582.0| 582.0|    BRO|20561.0|    1.0|     TX|20567.0|  32.0|    1.0|  

# convert data types

In [None]:
df_spark = df_spark.withColumn('i94yr', df_spark['i94yr'].cast(T.IntegerType())).\
        withColumn('i94mon', df_spark['i94mon'].cast(T.IntegerType())).\
        withColumn('i94cit', df_spark['i94cit'].cast(T.IntegerType())).\
        withColumn('i94res', df_spark['i94res'].cast(T.IntegerType())).\
        withColumn('arrdate', df_spark['arrdate'].cast(T.IntegerType())).\
        withColumn('i94mode', df_spark['i94mode'].cast(T.IntegerType())).\
        withColumn('depdate', df_spark['depdate'].cast(T.IntegerType())).\
        withColumn('i94bir', df_spark['i94bir'].cast(T.IntegerType())).\
        withColumn('i94visa', df_spark['i94visa'].cast(T.IntegerType())).\
        withColumn('count', df_spark['count'].cast(T.IntegerType())).\
        withColumn('biryear', df_spark['biryear'].cast(T.IntegerType())).\
        withColumn('admnum', df_spark['admnum'].cast(T.IntegerType()))

In [None]:
df_sample = pd.read_csv(os.path.join(DATAFOLDER, 'immigration_data_sample.csv'))

In [None]:
df_spark.head()

In [None]:
df_time = df_spark
df_time = df_time.withColumn("arrival_date", F.expr("date_add(to_date('1960-01-01'), arrdate)"))
df_time = df_time.withColumn("depart_date", F.expr("date_add(to_date('1960-01-01'), depdate)"))


In [None]:
df_time.withColumn("diff_days", F.datediff("depart_date", "arrival_date")).show()

## map i94addrl - state

In [None]:

def map_col(df, map_col_name, df_col_name, new_col_name):
    """
    
    Parameters
    ----------
    df : spark dataframe
        The file containing the df_col_name to be used for mapping.
    map_col_name : str
        The column name of the mapping file.
    df_col_name : str
        The column name in the Spark dataframe to be used.
    new_col_name : str
        New column name of the mapping results.
    
    
    """
    df_map = pd.read_csv(os.path.join(DATAFOLDER, f'{map_col_name}.csv'), quotechar="'")
    id_col = f'{map_col_name}_id'
    dic_map = dict(zip(df_map[id_col], df_map[map_col_name]))
    mapping_expr = F.create_map([F.lit(x) for x in chain(*dic_map.items())])
    return df.withColumn(new_col_name, mapping_expr[F.col(df_col_name)])



@udf
def udf_city_name(city_full):
    splt = str.split(city_full, ',')
    if len(splt) == 0:
        return ''
    return splt[0].capitalize().strip()

@udf
def udf_state_short(city_full):
    splt = str.split(city_full, ',')
    if len(splt) < 2:
        return ''
    return splt[1].strip()

@udf
def udf_state_format(port_state):
    return port_state.capitalize()

In [None]:
df_test.select("i94addr").distinct().show(10)

# TODO replace mapping in i94addrl to have names in captialize!

In [None]:
#df, map_col_name, df_col_name, new_col_name
df_test = df_time
#df_test = map_col(df_test, 'i94prtl', 'i94port', 'port')

# city name of arrival port
#df_test = df_test.withColumn("port_city", udf_city_name("port"))
df_test = map_col(df_test, 'prtl_city', 'i94port', 'port_city')
#df_test = df_test.withColumn("port_state_short", udf_state_short("port"))
df_test = map_col(df_test, 'prtl_state', 'i94port', 'port_state_short')
df_test = map_col(df_test, 'addrl', 'port_state_short', 'port_state')
#df_test = df_test.withColumn('port_state_2', udf_state_format('port_state'))

# country of entry, country of citizenship
df_test = map_col(df_test, 'cntyl', 'i94port', 'state_cit')
df_test = map_col(df_test, 'cntyl', 'i94port', 'state_res')
#df_test = map_col(df_test, 'i94addrl', 'i94addr', 'port_state')
df_test = map_col(df_test, 'visa', 'i94visa', 'visa')
#df_test = df_test.withColumn("port_state", udf_state_format("port_state"))

Is arrdate ever null?

In [None]:
df_test.where(F.col("arrdate").isNull()).show()

# merge with demographic data

In [None]:
#demographic = pd.read_csv(os.path.join(DATAFOLDER, 'us-cities-demographics.csv'), sep=';')

In [None]:
demographics = spark.read.format('csv').options(header='true', sep=';', inferSchema=True).\
    load(os.path.join(DATAFOLDER, 'us-cities-demographics.csv'))

match airport with demographic table

In [None]:
# look up unique combinations of city and state in fact table
# 

In [None]:
# subset with unique combinations of city, state and 
# port_city, port_state, i94port (as key )

In [None]:
df_test.show(5)

In [None]:
def spark_read_csv(spark, folder, filename, **kwargs):

    return spark.read.format('csv').options(header='true', inferSchema=True, **kwargs).\
        load(os.path.join(folder, filename))

def format_column_names(s):
    s = s.casefold()
    s = s.replace(' ', '_')
    s = s.replace('-', '_')
    return s

def rename_columns(df):
    old_names = df.schema.names
    new_names = [format_column_names(s) for s in old_names]
    df = reduce(lambda df, idx: 
                df.withColumnRenamed(old_names[idx], new_names[idx]), range(len(old_names)), df)
    return df

def create_demographic_table(spark, datafolder, outputfolder): 
    # select a subset of original table
    #df_demo = df.select(['i94port']).dropDuplicates()
    df_demo = spark_read_csv(spark, datafolder, 'prtl_city.csv')
    #pd.read_csv(os.path.join(datafolder, 'prtl.csv'), quotechar="'")
    demographics = spark_read_csv(spark, datafolder, 'us-cities-demographics.csv', sep=';')
    #pd.read_csv(os.path.join(datafolder, 'us-cities-demographics.csv'), quotechar="'")
    
    # do the preprocessing, append columns
    #df_demo = map_col(df_demo, 'prtl_city', 'prtl_city', 'port_city')
    df_demo = df_demo.withColumnRenamed('prtl_city', 'port_city')
    #df_test = df_test.withColumn("port_state_short", udf_state_short("port"))
    df_demo = map_col(df_demo, 'prtl_state', 'prtl_city_id', 'port_state_short')
    df_demo = map_col(df_demo, 'addrl', 'port_state_short', 'port_state')
    
    # we are deleting the ones, for which no states were found, since we only focus on the US here
    # later expand world wide
    df_demo = df_demo.dropna(subset=['port_state'])
    df_demo = df_demo.join(demographics, (df_demo.port_city == demographics.City) & (df_demo.port_state_short == demographics['State Code']))
    columns_to_drop = ['port_city', 'port_state', 'port_state_short']
    df_demo = df_demo.drop(*columns_to_drop)
    
    # reformat column names
    df_demo = rename_columns(df_demo)
    df_demo.write.parquet(os.path.join(outputfolder, 'city_demographics.parquet'), 'overwrite')
    return df_demo

In [None]:
df_demo = create_demographic_table(spark, DATAFOLDER, OUTPUTFOLDER)

In [None]:
df_demo.show()

# create single tables for everything

## country table

In [None]:
def spark_read_csv(folder, filename, **kwargs):

    return spark.read.format('csv').options(header='true', inferSchema=True, **kwargs).\
        load(os.path.join(folder, filename))

def csv_to_parquet(datafolder, outputfolder, csv_name, table_name): 
    df = spark_read_csv(datafolder, f'{csv_name}.csv', sep=',', quotechar="'")
    df = df.withColumnRenamed('value', 'id')
    df.write.parquet(os.path.join(outputfolder, f'{table_name}.parquet'), 'overwrite')
    return df

In [None]:
cntyl = csv_to_parquet(DATAFOLDER, OUTPUTFOLDER, 'i94cntyl', 'country')

In [None]:
map_col_name = 'prtl_state'
df_map = spark_read_csv(spark, datafolder, f'{map_col_name}.csv')
#pd.read_csv(os.path.join(DATAFOLDER, f'{map_col_name}.csv'), quotechar="'")
df_map = df_map.toPandas()
id_col = f'{map_col_name}_id'
dic_map = dict(zip(df_map[id_col], df_map[map_col_name]))
mapping_expr = F.create_map([F.lit(x) for x in chain(*dic_map.items())])

In [None]:
list(df_map[id_col])

# model table i94model

In [None]:
transport = csv_to_parquet(DATAFOLDER, OUTPUTFOLDER, 'i94model', 'transport')

In [None]:
transport.show()

# visa table i94visa

In [None]:
visa = csv_to_parquet(DATAFOLDER, OUTPUTFOLDER, 'i94visa', 'visa')

In [None]:
visa.show()

## global annual

In [None]:
#TODO
# also aggregate by year
folder = os.path.join(DATAFOLDER, 'climate-change-earth-surface-temperature-data')
global_annual = csv_to_parquet(folder, OUTPUTFOLDER, 'GlobalTemperatures', 'visa')

# Time Table

In [None]:
def create_full_time_table(outputfolder, daysafter=36525):

    days_till_2060 = range(daysafter)
    all_dates = [(t,) for t in days_till_2060]
    
    t_schema = T.StructType([T.StructField('i94_date', T.IntegerType())])
    timeframe = spark.createDataFrame(all_dates, t_schema)
    timeframe = timeframe.withColumn("dt_date", F.expr("date_add(to_date('1960-01-01'), i94_date)"))
    timeframe = timeframe.select('i94_date', 'dt_date',
                    F.year('dt_date').alias('year'),
                    F.month('dt_date').alias('month'),
                    F.dayofmonth('dt_date').alias('day'),
                    F.dayofweek('dt_date').alias('weekday'))
    
    timeframe.write.partitionBy('year', 'month').parquet(os.path.join(outputfolder, 'dates.parquet'), 'overwrite')
    return timeframe

In [None]:
timeframe = create_full_time_table(OUTPUTFOLDER)

In [None]:
timeframe.dtypes

In [None]:
timeframe.show()

In [None]:
def create_full_time_table(outputfolder, daysafter=36525):

    future_days = range(daysafter)
    all_dates = [(t,) for t in future_days]
    
    t_schema = T.StructType([T.StructField('i_date', T.IntegerType())])
    timeframe = spark.createDataFrame(all_dates, t_schema)
    timeframe = timeframe.withColumn("dt_date", F.expr("date_add(to_date('1960-01-01'), i_date)"))
    timeframe = timeframe.select('i_date', 'dt_date',
                    F.year('dt_date').alias('year'),
                    F.month('dt_date').alias('month'),
                    F.dayofmonth('dt_date').alias('day'),
                    F.dayofweek('dt_date').alias('weekday'))
    
    timeframe.write.partitionBy('year', 'month').parquet(os.path.join(outputfolder, 'dates.parquet'), 'overwrite')
    return timeframe

In [None]:
tf = create_full_time_table(OUTPUTFOLDER)

In [None]:
tf.head()

# Climate data

In [None]:
# TODO apply rolling window function on average of last 10 years

In [None]:
folder = os.path.join(DATAFOLDER, 'climate-change-earth-surface-temperature-data')
filename = 'GlobalLandTemperaturesByCountry.csv'

In [None]:
climate = spark_read_csv(folder, filename, sep=',')

## convert timestamp to date
already done

In [None]:
climate.dtypes

In [None]:
climate = climate.withColumn('dt', F.to_date(F.col('dt')))

merge with country id from country table

In [None]:
climate.agg({"dt": "min"}).collect()[0][0]

In [None]:
climate.agg({"dt": "max"}).collect()[0][0]

In [None]:
country = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'country.parquet'))

In [None]:
climate = climate.join(country, on=country['i94cntyl'] == climate['Country'], how='leftouter')

In [None]:
climate = climate.drop('i94cntyl')

In [None]:
def generate_climate_country(folder, filename, outputfolder):
    climate = spark_read_csv(folder, filename, sep=',')
    climate = climate.withColumn('dt', F.to_date(F.col('dt')))
    country = spark.read.option("mergeSchema", "true").parquet(os.path.join(outputfolder, 'country.parquet'))
    climate = climate.join(country, on=country['i94cntyl'] == climate['Country'], how='leftouter')
    climate = climate.drop('i94cntyl')
    climate = rename_columns(climate)
    climate = climate.withColumn('year', F.year('dt').alias('year'))
    climate = climate.withColumnRenamed('id', 'country_id').\
        withColumnRenamed('averagetemperatureuncertainty', 'avg_uncertainty').\
        withColumnRenamed('averagetemperature', 'avg_temperature')
    climate.write.partitionBy('year', 'country').parquet(os.path.join(outputfolder, 'climate_country.parquet'), 'overwrite')
    return climate

In [None]:
folder = os.path.join(DATAFOLDER, 'climate-change-earth-surface-temperature-data')
filename = 'GlobalLandTemperaturesByCountry.csv'
climate_country = generate_climate_country(folder, filename, OUTPUTFOLDER)

In [None]:
climate_country = climate_country.withColumnRenamed('id', 'country_id').\
    withColumnRenamed('averagetemperatureuncertainty', 'avg_uncertainty')

## aggregate annual average temperature

In [None]:
annual = climate.withColumn('year', F.year('dt').alias('year'))

In [None]:
climate_country.groupby([F.col('country'), F.col('country_id'), F.col('year')]).agg(F.avg('averagetemperature').alias('avg_temperature')).show()

In [None]:
annual = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'climate_country.parquet'))

In [None]:
def create_annual_temp_table(outputfolder):
    annual = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'climate_country.parquet'))
    annual.groupby([F.col('country'), F.col('country_id'), F.col('year')]).agg(F.avg('avg_temperature').alias('avg_temperature'))
    annual.write.partitionBy('country').parquet(os.path.join(outputfolder, 'annual_climate_country.parquet'), 'overwrite')
    return annual

In [None]:
annual = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'climate_country.parquet'))
annual.groupby([F.col('country'), F.col('country_id'), F.col('year')]).agg(F.avg('averagetemperature').alias('avg_temperature'))

In [None]:
#todo when storing partition by country and year

# Global Temperatures

# Asylum Report

In [None]:
# https://www.kaggle.com/dhs/refugee-report

# Fact Table

- Calculate length of stay

In [None]:
df_test.head()

In [None]:
timeframe.filter("i94_date = 20573").select('dt_date').show()

In [None]:
df_fact = df_test.withColumn("arrival_dt", F.expr("date_add(to_date('1960-01-01'), arrdate)"))
df_fact = df_fact.withColumn("depart_dt", F.expr("date_add(to_date('1960-01-01'), depdate)"))

In [None]:
keep_columns = ['cicid', 'i94yr', 'i94mon', 'arrival_dt', 'arrdate', 'depdate', 'i94cit', 'i94res', 'i94port', 'i94mode', 'i94addr', 'i94bir',
               'i94visa', 'visatype', 'biryear', 'gender', 'airline', 'fltno', 'length_stay']

#maybe = ['entdepa', 'entdepd', 'matflag']

In [None]:
df_fact = df_fact.withColumn("length_stay", F.datediff("depart_date", "arrival_date"))


In [None]:
df_fact.select(keep_columns).show()

In [None]:
df_fact.dtypes

In [None]:
def process_facts_table()

In [None]:
# df_test.join(timeframe, on=df_test['arrdate']==timeframe['i94_date'], how='leftouter').show() 
# too complicated
#df_test.join(timeframe, on=df_test['arrdate']==timeframe['i94_date'], how='left').dropDuplicates().select(timeframe['dt_date']).show()

In [None]:
df_test.count()

In [None]:
df_test.withColumn("length_stay", F.datediff("depart_date", "arrival_date")).show()


# TODO

- copy my csv files to S3 manually
with a DAG
- write some csvs to redshift 
- copy the stuff to redshift
- preprocessed tables as parquet files?
- maybe split it up into two DAGs
-- one does the intialization of the dimension tables, which won't change that often
-- the other one does the updating of the fact table

- add IATA code to city ?

- or maybe the nodes can be confiured to only run once?


1. csv von S3 -> redshift
2. parquet von S3 -> Spark -> redshift, maybe like this

https://sonra.io/2018/01/01/using-apache-airflow-to-build-a-data-pipeline-on-aws/

In [None]:
COPY listing
FROM 's3://mybucket/data/listings/parquet/'
IAM_ROLE 'arn:aws:iam::0123456789012:role/MyRedshiftRole'
FORMAT AS PARQUET;

If nothing works use Docker
https://towardsdatascience.com/getting-started-with-airflow-using-docker-cd8b44dbff98

Additional databases
- https://www.kaggle.com/open-flights/flight-route-database
- https://www.kaggle.com/dhs/refugee-report
- https://www.dhs.gov/immigration-statistics

In [None]:
OUTPUTFOLDER

In [74]:
import pyspark.sql.functions as F
def count_missings(spark_df,sort=True):
    """
    Counts number of nulls and nans in each column
    """
    df = spark_df.select([F.count(F.when(F.isnan(c) | F.isnull(c), c)).alias(c) for (c,c_type) in spark_df.dtypes if c_type not in ('timestamp', 'string', 'date')]).toPandas()

    if len(df) == 0:
        print("There are no any missing values!")
        return None

    if sort:
        return df.rename(index={0: 'count'}).T.sort_values("count",ascending=False)

    return df

# Reviewing all the tables

In [43]:
df_country = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'country.parquet')) 

In [44]:
df_country.columns

['cntyl_id', 'cntyl']

In [45]:
df_country.show()

+--------+--------------------+
|cntyl_id|               cntyl|
+--------+--------------------+
|     582|Mexico Air Sea, A...|
|     236|         Afghanistan|
|     101|             Albania|
|     316|             Algeria|
|     102|             Andorra|
|     324|              Angola|
|     529|            Anguilla|
|     518|     Antigua-barbuda|
|     687|          Argentina |
|     151|             Armenia|
|     532|               Aruba|
|     438|           Australia|
|     103|             Austria|
|     152|          Azerbaijan|
|     512|             Bahamas|
|     298|             Bahrain|
|     274|          Bangladesh|
|     513|            Barbados|
|     104|             Belgium|
|     581|              Belize|
+--------+--------------------+
only showing top 20 rows



## temperature_country

In [46]:
df_temp_country = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'temperature_country.parquet')) 

In [47]:
df_temp_country.columns

['dt',
 'average_temperature',
 'average_temperature_uncertainty',
 'cntyl_id',
 'year',
 'country']

In [48]:
df_temp_country.printSchema()

root
 |-- dt: date (nullable = true)
 |-- average_temperature: double (nullable = true)
 |-- average_temperature_uncertainty: double (nullable = true)
 |-- cntyl_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- country: string (nullable = true)



In [49]:
df_temp_country.show()

+----------+-------------------+-------------------------------+--------+----+---------+
|        dt|average_temperature|average_temperature_uncertainty|cntyl_id|year|  country|
+----------+-------------------+-------------------------------+--------+----+---------+
|1967-01-01|             25.488|                          0.227|     386|1967|    Benin|
|1967-02-01|             28.826|                          0.309|     386|1967|    Benin|
|1967-03-01|             29.586|                          0.304|     386|1967|    Benin|
|1967-04-01|             29.474|                          0.271|     386|1967|    Benin|
|1967-05-01|             28.676|                          0.349|     386|1967|    Benin|
|1967-06-01|               26.5|                          0.309|     386|1967|    Benin|
|1967-07-01|             25.449|                          0.395|     386|1967|    Benin|
|1967-08-01|              24.73|                          0.299|     386|1967|    Benin|
|1967-09-01|         

## temperature_annual_country

In [50]:
df_temp_country = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'temperature_annual_country.parquet')) 

In [51]:
df_temp_country.columns

['cntyl_id', 'year', 'average_temperature', 'country']

In [52]:
df_temp_country.printSchema()

root
 |-- cntyl_id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- average_temperature: double (nullable = true)
 |-- country: string (nullable = true)



In [53]:
df_temp_country.show()

+--------+----+-------------------+-------+
|cntyl_id|year|average_temperature|country|
+--------+----+-------------------+-------+
|     109|1819|              5.059|Estonia|
|     109|1921|              5.187|Estonia|
|     109|2011|              6.865|Estonia|
|     109|1764|              5.116|Estonia|
|     109|1984|              5.747|Estonia|
|     109|1830|              4.195|Estonia|
|     109|1750|              5.802|Estonia|
|     116|1837|              8.951|Ireland|
|     116|1820|              8.829|Ireland|
|     116|1984|              9.785|Ireland|
|     116|1976|              9.862|Ireland|
|     116|1927|              9.415|Ireland|
|     116|1935|               9.67|Ireland|
|     116|1772|              9.038|Ireland|
|     165|1893|             10.541|Croatia|
|     165|1952|             11.888|Croatia|
|     165|1899|             11.196|Croatia|
|     165|1770|             11.085|Croatia|
|     165|1786|             10.581|Croatia|
|     165|1907|             11.0

In [79]:
df_transport = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'transport.parquet')) 

In [80]:
df_transport.columns

['model_id', 'model']

In [81]:
df_transport.show()

+--------+------------+
|model_id|       model|
+--------+------------+
|       1|         Air|
|       2|         Sea|
|       3|        Land|
|       9|Not reported|
+--------+------------+



In [82]:
df_time = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'dates.parquet')) 

In [83]:
df_time.columns

['i_date', 'dt_date', 'day', 'weekday', 'year', 'month']

In [84]:
df_time.show()

+------+----------+---+-------+----+-----+
|i_date|   dt_date|day|weekday|year|month|
+------+----------+---+-------+----+-----+
|  9709|1986-08-01|  1|      6|1986|    8|
|  9710|1986-08-02|  2|      7|1986|    8|
|  9711|1986-08-03|  3|      1|1986|    8|
|  9712|1986-08-04|  4|      2|1986|    8|
|  9713|1986-08-05|  5|      3|1986|    8|
|  9714|1986-08-06|  6|      4|1986|    8|
|  9715|1986-08-07|  7|      5|1986|    8|
|  9716|1986-08-08|  8|      6|1986|    8|
|  9717|1986-08-09|  9|      7|1986|    8|
|  9718|1986-08-10| 10|      1|1986|    8|
|  9719|1986-08-11| 11|      2|1986|    8|
|  9720|1986-08-12| 12|      3|1986|    8|
|  9721|1986-08-13| 13|      4|1986|    8|
|  9722|1986-08-14| 14|      5|1986|    8|
|  9723|1986-08-15| 15|      6|1986|    8|
|  9724|1986-08-16| 16|      7|1986|    8|
|  9725|1986-08-17| 17|      1|1986|    8|
|  9726|1986-08-18| 18|      2|1986|    8|
|  9727|1986-08-19| 19|      3|1986|    8|
|  9728|1986-08-20| 20|      4|1986|    8|
+------+---

## city demographics

In [58]:
city_demographics = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'city_demographics.parquet')) 

In [85]:
city_demographics.columns

['prtl_city_id',
 'city',
 'median_age',
 'male_population',
 'female_population',
 'total_population',
 'foreignborn',
 'average_household_size',
 'state_code',
 'state']

In [59]:
city_demographics.printSchema()

root
 |-- prtl_city_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- median_age: double (nullable = true)
 |-- male_population: integer (nullable = true)
 |-- female_population: integer (nullable = true)
 |-- total_population: integer (nullable = true)
 |-- foreignborn: integer (nullable = true)
 |-- average_household_size: double (nullable = true)
 |-- state_code: string (nullable = true)
 |-- state: string (nullable = true)



In [60]:
city_demographics.show()

+------------+----------------+----------+---------------+-----------------+----------------+-----------+----------------------+----------+--------------+
|prtl_city_id|            city|median_age|male_population|female_population|total_population|foreignborn|average_household_size|state_code|         state|
+------------+----------------+----------+---------------+-----------------+----------------+-----------+----------------------+----------+--------------+
|         COS|Colorado Springs|      34.8|         225544|           231018|          456562|      35320|                  2.48|        CO|      Colorado|
|         WPB| West Palm Beach|      39.6|          49262|            57520|          106782|      30675|    2.5300000000000002|        FL|       Florida|
|         FTL| Fort Lauderdale|      42.8|          93948|            84639|          178587|      47582|                  2.38|        FL|       Florida|
|         SLC|  Salt Lake City|      32.1|          98364|            

## dates

In [61]:
dates = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'dates.parquet')) 
dates.show()

+------+----------+---+-------+----+-----+
|i_date|   dt_date|day|weekday|year|month|
+------+----------+---+-------+----+-----+
|  9709|1986-08-01|  1|      6|1986|    8|
|  9710|1986-08-02|  2|      7|1986|    8|
|  9711|1986-08-03|  3|      1|1986|    8|
|  9712|1986-08-04|  4|      2|1986|    8|
|  9713|1986-08-05|  5|      3|1986|    8|
|  9714|1986-08-06|  6|      4|1986|    8|
|  9715|1986-08-07|  7|      5|1986|    8|
|  9716|1986-08-08|  8|      6|1986|    8|
|  9717|1986-08-09|  9|      7|1986|    8|
|  9718|1986-08-10| 10|      1|1986|    8|
|  9719|1986-08-11| 11|      2|1986|    8|
|  9720|1986-08-12| 12|      3|1986|    8|
|  9721|1986-08-13| 13|      4|1986|    8|
|  9722|1986-08-14| 14|      5|1986|    8|
|  9723|1986-08-15| 15|      6|1986|    8|
|  9724|1986-08-16| 16|      7|1986|    8|
|  9725|1986-08-17| 17|      1|1986|    8|
|  9726|1986-08-18| 18|      2|1986|    8|
|  9727|1986-08-19| 19|      3|1986|    8|
|  9728|1986-08-20| 20|      4|1986|    8|
+------+---

In [86]:
dates.columns

['i_date', 'dt_date', 'day', 'weekday', 'year', 'month']

## visa

In [62]:
visa = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'visa.parquet')) 

In [63]:
visa.show()

+-------+--------+
|visa_id|    visa|
+-------+--------+
|      1|Business|
|      2|Pleasure|
|      3| Student|
+-------+--------+



In [64]:
visa = spark.read.option("mergeSchema", "true").parquet(os.path.join('/Users/christian/Data/udacity_capstone/output2/output_test/visa.parquet')) 

In [65]:
visa.show()

+-------+--------+
|visa_id|    visa|
+-------+--------+
|      1|Business|
|      2|Pleasure|
|      3| Student|
+-------+--------+



## temperature_global

In [66]:
temperature_global = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'temperature_global.parquet')) 

In [67]:
temperature_global.columns

['year',
 'land_average_temperature',
 'land_average_temperature_uncertainty',
 'land_max_temperature',
 'land_max_temperature_uncertainty',
 'land_min_temperature',
 'land_min_temperature_uncertainty',
 'land_and_ocean_average_temperature',
 'land_and_ocean_average_temperature_uncertainty']

In [68]:
temperature_global.printSchema()

root
 |-- year: integer (nullable = true)
 |-- land_average_temperature: double (nullable = true)
 |-- land_average_temperature_uncertainty: double (nullable = true)
 |-- land_max_temperature: double (nullable = true)
 |-- land_max_temperature_uncertainty: double (nullable = true)
 |-- land_min_temperature: double (nullable = true)
 |-- land_min_temperature_uncertainty: double (nullable = true)
 |-- land_and_ocean_average_temperature: double (nullable = true)
 |-- land_and_ocean_average_temperature_uncertainty: double (nullable = true)



In [92]:
temperature_global.where('year = 1990').show()

+----+------------------------+------------------------------------+--------------------+--------------------------------+--------------------+--------------------------------+----------------------------------+----------------------------------------------+
|year|land_average_temperature|land_average_temperature_uncertainty|land_max_temperature|land_max_temperature_uncertainty|land_min_temperature|land_min_temperature_uncertainty|land_and_ocean_average_temperature|land_and_ocean_average_temperature_uncertainty|
+----+------------------------+------------------------------------+--------------------+--------------------------------+--------------------+--------------------------------+----------------------------------+----------------------------------------------+
|1990|                    0.13|                               3.659|               0.086|                           0.057|              15.629|                          14.958|                             9.234|            

# Fact Table

In [70]:
immi = spark.read.option("mergeSchema", "true").parquet(os.path.join(OUTPUTFOLDER, 'immigration.parquet')) 

In [71]:
immi.columns

['cicid',
 'i_yr',
 'i_mon',
 'arrdate',
 'depdate',
 'i_cit',
 'i_res',
 'i_port',
 'i_mode',
 'i_addr',
 'i_bir',
 'i_visa',
 'visatype',
 'gender',
 'airline',
 'fltno',
 'length_stay']

In [72]:
immi.show()

+------+----+-----+-------+-------+-----+-----+------+------+------+-----+------+--------+------+-------+-----+-----------+
| cicid|i_yr|i_mon|arrdate|depdate|i_cit|i_res|i_port|i_mode|i_addr|i_bir|i_visa|visatype|gender|airline|fltno|length_stay|
+------+----+-----+-------+-------+-----+-----+------+------+------+-----+------+--------+------+-------+-----+-----------+
|459651|2016|    4|  20547|  20559|  135|  135|   ATL|     1|    FL|   54|     2|      WT|  null|     VS|00115|         12|
|459652|2016|    4|  20547|  20555|  135|  135|   ATL|     1|    FL|   74|     2|      WT|     F|     VS|  103|          8|
|459653|2016|    4|  20547|  20557|  135|  135|   ATL|     1|    FL|   44|     2|      B2|     M|     VS|  109|         10|
|459654|2016|    4|  20547|  20555|  135|  135|   ATL|     1|     G|   38|     2|      WT|  null|     VS|00103|          8|
|459655|2016|    4|  20547|   null|  135|  135|   ATL|     1|    GA|   64|     2|      WT|     F|     VS|00103|       null|
|459656|

In [73]:
immi.printSchema()

root
 |-- cicid: integer (nullable = true)
 |-- i_yr: integer (nullable = true)
 |-- i_mon: integer (nullable = true)
 |-- arrdate: integer (nullable = true)
 |-- depdate: integer (nullable = true)
 |-- i_cit: integer (nullable = true)
 |-- i_res: integer (nullable = true)
 |-- i_port: string (nullable = true)
 |-- i_mode: integer (nullable = true)
 |-- i_addr: string (nullable = true)
 |-- i_bir: integer (nullable = true)
 |-- i_visa: integer (nullable = true)
 |-- visatype: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- fltno: string (nullable = true)
 |-- length_stay: integer (nullable = true)



In [75]:
count_missings(immi)

Unnamed: 0,count
depdate,103422
length_stay,103422
i_mode,102
i_bir,19
cicid,0
i_yr,0
i_mon,0
arrdate,0
i_cit,0
i_res,0


In [78]:
103422 / immi.count()

0.039159751430124935