In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit
import pyspark.sql.functions as psf
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
from pyspark.sql.functions import col
from pyspark.sql.functions import *

In [2]:
def create_spark_session():
    """
    Build the SparkSession used by the loaders in this notebook.

    Prints a start-up message, configures the ``spark.jars.packages``
    setting with the hadoop-aws artifact and returns whatever
    ``getOrCreate()`` yields.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    print("Start the application")

    session_builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.8.5"
    )
    return session_builder.getOrCreate()

In [36]:
# Input data locations on the local filesystem (these are not S3 URIs).
# The data directory can be overridden through the CAPSTONE_DATA_DIR environment
# variable; when it is not set, the original development directory is used.
# Joining with "" guarantees a trailing separator, so the plain string
# concatenations below stay valid even if the override omits it.
root_path = os.path.join(
    os.environ.get(
        "CAPSTONE_DATA_DIR",
        "/home/alison/curso/ED-Udacity/capstone-athena/projeto-udacity/data/",
    ),
    "",
)

# Temperature datasets live in the "temperatures" sub-directory.
global_temperatures_path = root_path + "temperatures/GlobalTemperatures.csv"
global_land_temperatures_by_state_path = root_path + "temperatures/GlobalLandTemperaturesByState.csv"
global_land_temperatures_by_major_city_path = root_path + "temperatures/GlobalLandTemperaturesByMajorCity.csv"
global_land_temperatures_by_country_path = root_path + "temperatures/GlobalLandTemperaturesByCountry.csv"
global_land_temperatures_by_city_path = root_path + "temperatures/GlobalLandTemperaturesByCity.csv"

# Remaining datasets sit directly in the data directory.
us_cities_demographics_path = root_path + "us_cities_demographics.csv"
airport_codes_csv_path = root_path + "airport_codes_csv.csv"
country_path = root_path + "country.csv"
transport_vehicle_path = root_path + "transport_vehicle.csv"
state_usa_path = root_path + 'state_usa.csv'
motivation_path = root_path + 'motivation.csv'
immigration_path = root_path + 'immigration_data_sample.csv'

In [4]:
def load_glob_temp_state(spark):
    """
    Read the per-state land temperature CSV into a DataFrame.

    The file at ``global_land_temperatures_by_state_path`` is read as
    comma-separated UTF-8 text with a header line, using an explicit schema
    (dt, average_temperature, average_temperature_uncertainty, state, country).

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # (column name, Spark type) pairs in file order; every column is declared
    # with nullable=False.
    column_specs = [
        ("dt", DateType()),
        ('average_temperature', DoubleType()),
        ('average_temperature_uncertainty', DoubleType()),
        ('state', StringType()),
        ('country', StringType()),
    ]
    state_schema = StructType(
        [StructField(field_name, field_type, False) for field_name, field_type in column_specs]
    )

    # NOTE(review): 'com.databricks.spark.csv' is the legacy spark-csv source name;
    # presumably it resolves to the built-in csv reader on Spark 2+ — confirm.
    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(state_schema)
    )
    return reader.load(global_land_temperatures_by_state_path)

In [5]:
def load_glob_temp(spark):
    """
    Read the global temperature CSV into a DataFrame.

    The file at ``global_temperatures_path`` is read as comma-separated
    UTF-8 text with a header line. The explicit schema is a date column
    ``dt`` followed by eight double-typed measurement columns.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    measurement_names = [
        "land_average_temperature",
        "land_average_temperature_uncertainty",
        "land_max_temperature",
        "land_max_temperature_uncertainty",
        "land_min_temperature",
        "land_min_temperature_uncertainty",
        "land_and_ocean_average_temperature",
        "land_and_ocean_average_temperature_uncertainty",
    ]

    # All fields are declared with nullable=False.
    fields = [StructField("dt", DateType(), False)]
    for measurement in measurement_names:
        fields.append(StructField(measurement, DoubleType(), False))

    reader = spark.read.format('com.databricks.spark.csv')
    reader = reader.option('sep', ',').option("header", "true").option("encoding", "UTF-8")
    return reader.schema(StructType(fields)).load(global_temperatures_path)

In [6]:
def load_global_land_temp_major_city(spark):
    """
    Read the major-city land temperature CSV into a DataFrame.

    The file at ``global_land_temperatures_by_major_city_path`` is read as
    comma-separated UTF-8 text with a header line. Latitude and longitude
    are kept as strings by the schema.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # (column name, Spark type) pairs in file order; all declared nullable=False.
    column_specs = [
        ("dt", DateType()),
        ("average_temperature", DoubleType()),
        ("average_temperature_uncertainty", DoubleType()),
        ("city", StringType()),
        ("country", StringType()),
        ("latitude", StringType()),
        ("longitude", StringType()),
    ]
    major_city_schema = StructType(
        [StructField(field_name, field_type, False) for field_name, field_type in column_specs]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "UTF-8")
        .schema(major_city_schema)
    )
    return reader.load(global_land_temperatures_by_major_city_path)

In [7]:
def load_global_land_temp_by_country(spark):
    """
    Read the per-country land temperature CSV into a DataFrame.

    The file at ``global_land_temperatures_by_country_path`` is read as
    comma-separated UTF-8 text with a header line, using an explicit schema
    (dt, average_temperature, average_temperature_uncertainty, country).

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # All fields are declared with nullable=False.
    country_temp_schema = StructType([
        StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("dt", DateType()),
            ("average_temperature", DoubleType()),
            ("average_temperature_uncertainty", DoubleType()),
            ("country", StringType()),
        )
    ])

    reader = spark.read.format('com.databricks.spark.csv')
    reader = reader.option('sep', ',').option('header', 'true').option("encoding", "UTF-8")
    return reader.schema(country_temp_schema).load(global_land_temperatures_by_country_path)

In [8]:
def load_global_land_temp_by_city(spark):
    """
    Read the per-city land temperature CSV into a DataFrame.

    The file at ``global_land_temperatures_by_city_path`` is read as
    comma-separated utf-8 text with a header line. Latitude and longitude
    are kept as strings by the schema.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    text_columns = ["city", "country", "latitude", "longitude"]

    # Date column first, then the two double measurements, then the string
    # columns; every field is declared with nullable=False.
    fields = [
        StructField("dt", DateType(), False),
        StructField("average_temperature", DoubleType(), False),
        StructField("average_temperature_uncertainty", DoubleType(), False),
    ]
    fields.extend(StructField(text_column, StringType(), False) for text_column in text_columns)
    city_temp_schema = StructType(fields)

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(city_temp_schema)
    )
    return reader.load(global_land_temperatures_by_city_path)

In [9]:
def load_us_cities_demographics(spark):
    """
    Read the US cities demographics CSV into a DataFrame.

    The file at ``us_cities_demographics_path`` is read as semicolon-separated
    utf-8 text with a header line, using an explicit twelve-column schema.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # (column name, Spark type) pairs in file order; all declared nullable=False.
    # NOTE(review): "total_polulation" looks like a typo for "total_population";
    # it is left unchanged because other code may select the column by this name.
    column_specs = [
        ("city", StringType()),
        ("state", StringType()),
        ("median_age", DoubleType()),
        ("male_population", IntegerType()),
        ("female_population", IntegerType()),
        ("total_polulation", IntegerType()),
        ("number_veterans", IntegerType()),
        ("foreign_born", IntegerType()),
        ("average_household_size", DoubleType()),
        ("state_code", StringType()),
        ("race", StringType()),
        ("quant", IntegerType()),
    ]
    demographics_schema = StructType(
        [StructField(field_name, field_type, False) for field_name, field_type in column_specs]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ';')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(demographics_schema)
    )
    return reader.load(us_cities_demographics_path)

In [10]:
def load_airport_codes(spark):
    """
    Read the airport codes CSV into a DataFrame.

    The file at ``airport_codes_csv_path`` is read as comma-separated utf-8
    text with a header line, using an explicit twelve-column schema. Only
    ``elevation_ft`` is numeric; every other column is a string.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # (column name, Spark type) pairs in file order.
    column_specs = [
        ("ident", StringType()),
        ("type", StringType()),
        ("name", StringType()),
        ("elevation_ft", IntegerType()),
        ("continent", StringType()),
        ("iso_country", StringType()),
        ("iso_region", StringType()),
        ("municipality", StringType()),
        ("gps_code", StringType()),
        ("iata_code", StringType()),
        ("local_code", StringType()),
        ("coordinates", StringType()),
    ]

    # Fields are declared nullable: the data really contains missing values
    # (the notebook output shows null iata_code entries), so a non-nullable
    # declaration would misdescribe the dataset.
    airport_codes_schema = StructType(
        [StructField(field_name, field_type, True) for field_name, field_type in column_specs]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(airport_codes_schema)
    )
    return reader.load(airport_codes_csv_path)

In [11]:
def load_country(spark):
    """
    Read the country lookup CSV into a DataFrame.

    The file at ``country_path`` is read as semicolon-separated utf-8 text
    with a header line, using a two-column schema: integer ``code`` and
    string ``name``.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # Both columns are declared with nullable=False.
    lookup_schema = (
        StructType()
        .add("code", IntegerType(), False)
        .add("name", StringType(), False)
    )

    reader = spark.read.format('com.databricks.spark.csv')
    reader = reader.option('sep', ';').option('header', 'true').option("encoding", "utf-8")
    return reader.schema(lookup_schema).load(country_path)

In [12]:
def load_transport_vehicle(spark):
    """
    Read the transport vehicle lookup CSV into a DataFrame.

    The file at ``transport_vehicle_path`` is read as semicolon-separated
    utf-8 text with a header line, using a two-column schema: integer
    ``code`` and string ``name``.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # Both columns are declared with nullable=False.
    lookup_schema = (
        StructType()
        .add("code", IntegerType(), False)
        .add("name", StringType(), False)
    )

    reader = spark.read.format('com.databricks.spark.csv')
    reader = reader.option('sep', ';').option('header', 'true').option("encoding", "utf-8")
    return reader.schema(lookup_schema).load(transport_vehicle_path)

In [13]:
def load_state_usa(spark):
    """
    Read the US state lookup CSV into a DataFrame.

    The file at ``state_usa_path`` is read as semicolon-separated utf-8 text
    with a header line, using a two-column schema: integer ``code`` and
    string ``name``.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # Both columns are declared with nullable=False.
    # NOTE(review): ``code`` is typed as an integer; confirm that state_usa.csv
    # really holds numeric codes rather than state abbreviations.
    lookup_schema = (
        StructType()
        .add("code", IntegerType(), False)
        .add("name", StringType(), False)
    )

    reader = spark.read.format('com.databricks.spark.csv')
    reader = reader.option('sep', ';').option('header', 'true').option('encoding', 'utf-8')
    return reader.schema(lookup_schema).load(state_usa_path)

In [14]:
def load_motivation(spark):
    """
    Read the motivation lookup CSV into a DataFrame.

    The file at ``motivation_path`` is read as semicolon-separated utf-8 text
    with a header line, using a two-column schema: integer ``code`` and
    string ``name``.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # Both columns are declared with nullable=False.
    lookup_schema = (
        StructType()
        .add("code", IntegerType(), False)
        .add("name", StringType(), False)
    )

    reader = spark.read.format('com.databricks.spark.csv')
    reader = reader.option('sep', ';').option('header', 'true').option('encoding', 'utf-8')
    return reader.schema(lookup_schema).load(motivation_path)

In [43]:
def load_immigration(spark):
    """
    Read the immigration sample CSV into a DataFrame.

    The file at ``immigration_path`` is read as comma-separated utf-8 text
    with a header line, using an explicit 29-column schema. Apart from the
    integer ``passender_id``, numeric columns are typed as doubles.

    Args:
        spark: SparkSession used to build the reader.

    Returns:
        The DataFrame produced by the reader's ``load`` call.
    """
    # (column name, Spark type) pairs in file order; all declared nullable=False.
    # NOTE(review): "passender_id" looks like a typo for "passenger_id"; it is
    # left unchanged because other code may select the column by this name.
    column_specs = [
        ("passender_id", IntegerType()),
        ("cicid", DoubleType()),
        ("i94yr", DoubleType()),
        ("i94mon", DoubleType()),
        ("i94cit", DoubleType()),
        ("i94res", DoubleType()),
        ("i94port", StringType()),
        ("arrdate", DoubleType()),
        ("i94mode", DoubleType()),
        ("i94addr", StringType()),
        ("depdate", DoubleType()),
        ("i94bir", DoubleType()),
        ("i94visa", DoubleType()),
        ("count", DoubleType()),
        ("dtadfile", StringType()),
        ("visapost", StringType()),
        ("occup", StringType()),
        ("entdepa", StringType()),
        ("entdepd", StringType()),
        ("entdepu", StringType()),
        ("matflag", StringType()),
        ("biryear", DoubleType()),
        ("dtaddto", StringType()),
        ("gender", StringType()),
        ("insnum", StringType()),
        ("airline", StringType()),
        ("admnum", DoubleType()),
        ("fltno", StringType()),
        ("visatype", StringType()),
    ]
    immigration_schema = StructType(
        [StructField(field_name, field_type, False) for field_name, field_type in column_specs]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option('encoding', 'utf-8')
        .schema(immigration_schema)
    )
    return reader.load(immigration_path)

In [46]:
def main():
    """
    Load every source dataset and preview the immigration and airport data.

    Creates the Spark session, builds one DataFrame per input file through
    the ``load_*`` helpers and prints the first rows of two selections:
    the immigration origin/port columns and the airport code columns.

    Returns:
        None.
    """
    spark = create_spark_session()

    # Temperature datasets.
    df_glob_temp = load_glob_temp(spark).distinct()
    df_glob_temp_state = load_glob_temp_state(spark).select("state", "country").distinct()
    df_glob_temp_major_city = load_global_land_temp_major_city(spark).limit(1000)
    df_glob_temp_country = load_global_land_temp_by_country(spark).limit(1000)
    df_glob_temp_city = load_global_land_temp_by_city(spark).select("country", "city").distinct()

    # Demographics and airports.
    df_us_cities_demog = load_us_cities_demographics(spark).select("city").distinct()
    df_airport_codes = load_airport_codes(spark).limit(1000)

    # Lookup tables.
    df_country = load_country(spark).select("name", "code").distinct()
    df_transport_vehicle = load_transport_vehicle(spark).limit(1000)
    df_state_usa = load_state_usa(spark).select("name").distinct()
    df_motivation = load_motivation(spark)

    # Immigration sample.
    df_immigration = load_immigration(spark)

    # TODO: several of the frames above are not used yet; they are kept for the
    # planned case-insensitive joins (state name -> temperature-by-state ->
    # temperature-by-city -> demographics city -> airport municipality ->
    # country name) that were previously sketched here as commented-out code.

    df_immigration.select("i94cit", "i94res", "i94port").show()
    df_airport_codes.select("iata_code","iso_country", "iso_region","local_code", "coordinates").show()

    # The stray trailing expression `df_immigraion` (a misspelled, undefined name)
    # was removed: it raised NameError after the previews had been printed.

In [47]:
# Run the load-and-preview pipeline defined in main(); the tables below are its printed output.
main()

Start the application
+------+------+-------+
|i94cit|i94res|i94port|
+------+------+-------+
| 209.0| 209.0|    HHW|
| 582.0| 582.0|    MCA|
| 148.0| 112.0|    OGG|
| 297.0| 297.0|    LOS|
| 111.0| 111.0|    CHM|
| 577.0| 577.0|    ATL|
| 245.0| 245.0|    SFR|
| 113.0| 135.0|    NYC|
| 131.0| 131.0|    CHI|
| 116.0| 116.0|    LOS|
| 438.0| 438.0|    LOS|
| 209.0| 209.0|    PHI|
| 148.0| 112.0|    FTL|
| 260.0| 260.0|    LOS|
| 148.0| 112.0|    BOS|
| 245.0| 245.0|    SAI|
| 512.0| 512.0|    NAS|
| 689.0| 689.0|    FTL|
| 746.0| 158.0|    SEA|
| 260.0| 260.0|    FTL|
+------+------+-------+
only showing top 20 rows

+---------+-----------+----------+----------+--------------------+
|iata_code|iso_country|iso_region|local_code|         coordinates|
+---------+-----------+----------+----------+--------------------+
|     null|         US|     US-PA|       00A|-74.9336013793945...|
|     null|         US|     US-KS|      00AA|-101.473911, 38.7...|
|     null|         US|     US-AK|      0