In [21]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit
import pyspark.sql.functions as F
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
from pyspark.sql.functions import col
from pyspark.sql.functions import *

import pandas as pd
# Show every column when a pandas DataFrame is displayed.
# The fully qualified key is used on purpose: the bare 'max_columns' pattern
# is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where it raises OptionError.
pd.set_option('display.max_columns', None)

In [2]:
def create_spark_session():
    """
    Build (or reuse) the SparkSession for this notebook.

    Prints a start-up message, then obtains a session whose
    ``spark.jars.packages`` setting requests the
    ``org.apache.hadoop:hadoop-aws:2.8.5`` package.

    :return: the SparkSession returned by ``getOrCreate()``.
    """
    print("Start the application")
    session_builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.8.5"
    )
    return session_builder.getOrCreate()

In [3]:
# Locations of the input CSV files on the local filesystem.
#
# The data directory defaults to the original development path but can be
# overridden by setting the CAPSTONE_DATA_ROOT environment variable, so the
# notebook is not tied to one machine. os.path.join(..., "") appends a
# trailing separator when the chosen directory lacks one, which keeps the
# plain string concatenations below valid for either form.
root_path = os.path.join(
    os.environ.get("CAPSTONE_DATA_ROOT")
    or "/home/alison/curso/ED-Udacity/capstone-athena/projeto-udacity/data/",
    "",
)

# Temperature datasets.
global_temperatures_path = root_path + "temperatures/GlobalTemperatures.csv"
global_land_temperatures_by_state_path = root_path + "temperatures/GlobalLandTemperaturesByState.csv"
global_land_temperatures_by_major_city_path = root_path + "temperatures/GlobalLandTemperaturesByMajorCity.csv"
global_land_temperatures_by_country_path = root_path + "temperatures/GlobalLandTemperaturesByCountry.csv"
global_land_temperatures_by_city_path = root_path + "temperatures/GlobalLandTemperaturesByCity.csv"

# Demographics, airports and immigration data.
us_cities_demographics_path = root_path + "us_cities_demographics.csv"
airport_codes_csv_path = root_path + "airport_codes_csv.csv"
immigration_path = root_path + 'immigration_data_sample.csv'

# Small code/name lookup tables.
country_path = root_path + "country.csv"
transport_vehicle_path = root_path + "transport_vehicle.csv"
state_usa_path = root_path + 'state_usa.csv'
motivation_path = root_path + 'motivation.csv'
port_path = root_path + 'port.csv'


In [4]:
def load_glob_temp_state(spark):
    """
    Read the per-state land temperature CSV into a Spark DataFrame.

    The file at ``global_land_temperatures_by_state_path`` is read as a
    comma-separated, UTF-8 encoded CSV with a header row. The explicit
    schema below supplies the column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with columns dt, average_temperature,
        average_temperature_uncertainty, state and country.
    """
    # Fields are declared nullable: nothing in this reader guarantees that
    # the CSV has no empty cells, so nullable=False would be a promise the
    # loader cannot back up.
    global_temperatures_by_state_schema = StructType([
        StructField("dt", DateType(), True),
        StructField("average_temperature", DoubleType(), True),
        StructField("average_temperature_uncertainty", DoubleType(), True),
        StructField("state", StringType(), True),
        StructField("country", StringType(), True),
    ])

    # "csv" is Spark's built-in CSV source; "com.databricks.spark.csv" is
    # only a legacy alias for it.
    return (
        spark.read
        .format("csv")
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(global_temperatures_by_state_schema)
        .load(global_land_temperatures_by_state_path)
    )

In [5]:
def load_glob_temp(spark):
    """
    Read the global temperature CSV into a Spark DataFrame.

    The file at ``global_temperatures_path`` is read as a comma-separated,
    UTF-8 encoded CSV with a header row. The explicit schema below supplies
    the column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with a ``dt`` date column followed by eight double
        columns (land average/max/min and land-and-ocean average
        temperatures, each paired with an uncertainty column).
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    global_temperatures_schema = StructType([
        StructField("dt", DateType(), True),
        StructField("land_average_temperature", DoubleType(), True),
        StructField("land_average_temperature_uncertainty", DoubleType(), True),
        StructField("land_max_temperature", DoubleType(), True),
        StructField("land_max_temperature_uncertainty", DoubleType(), True),
        StructField("land_min_temperature", DoubleType(), True),
        StructField("land_min_temperature_uncertainty", DoubleType(), True),
        StructField("land_and_ocean_average_temperature", DoubleType(), True),
        StructField("land_and_ocean_average_temperature_uncertainty", DoubleType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(global_temperatures_schema)
        .load(global_temperatures_path)
    )

In [6]:
def load_global_land_temp_major_city(spark):
    """
    Read the major-city land temperature CSV into a Spark DataFrame.

    The file at ``global_land_temperatures_by_major_city_path`` is read as a
    comma-separated, UTF-8 encoded CSV with a header row. The explicit
    schema below supplies the column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with columns dt, average_temperature,
        average_temperature_uncertainty, city, country, latitude and
        longitude (latitude/longitude are kept as strings).
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    global_temp_major_city_schema = StructType([
        StructField("dt", DateType(), True),
        StructField("average_temperature", DoubleType(), True),
        StructField("average_temperature_uncertainty", DoubleType(), True),
        StructField("city", StringType(), True),
        StructField("country", StringType(), True),
        StructField("latitude", StringType(), True),
        StructField("longitude", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(global_temp_major_city_schema)
        .load(global_land_temperatures_by_major_city_path)
    )

In [7]:
def load_global_land_temp_by_country(spark):
    """
    Read the per-country land temperature CSV into a Spark DataFrame.

    The file at ``global_land_temperatures_by_country_path`` is read as a
    comma-separated, UTF-8 encoded CSV with a header row. The explicit
    schema below supplies the column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with columns dt, average_temperature,
        average_temperature_uncertainty and country.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    global_land_temp_by_country_schema = StructType([
        StructField("dt", DateType(), True),
        StructField("average_temperature", DoubleType(), True),
        StructField("average_temperature_uncertainty", DoubleType(), True),
        StructField("country", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(global_land_temp_by_country_schema)
        .load(global_land_temperatures_by_country_path)
    )

In [8]:
def load_global_land_temp_by_city(spark):
    """
    Read the per-city land temperature CSV into a Spark DataFrame.

    The file at ``global_land_temperatures_by_city_path`` is read as a
    comma-separated, utf-8 encoded CSV with a header row. The explicit
    schema below supplies the column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with columns dt, average_temperature,
        average_temperature_uncertainty, city, country, latitude and
        longitude (latitude/longitude are kept as strings).
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    global_land_temp_by_city_schema = StructType([
        StructField("dt", DateType(), True),
        StructField("average_temperature", DoubleType(), True),
        StructField("average_temperature_uncertainty", DoubleType(), True),
        StructField("city", StringType(), True),
        StructField("country", StringType(), True),
        StructField("latitude", StringType(), True),
        StructField("longitude", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(global_land_temp_by_city_schema)
        .load(global_land_temperatures_by_city_path)
    )

In [9]:
def load_us_cities_demographics(spark):
    """
    Read the US cities demographics CSV into a Spark DataFrame.

    The file at ``us_cities_demographics_path`` is read as a
    semicolon-separated, utf-8 encoded CSV with a header row. The explicit
    schema below supplies the column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with columns city, state, median_age,
        male_population, female_population, total_polulation,
        number_veterans, foreign_born, average_household_size, state_code,
        race and quant.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    us_cities_demog_schema = StructType([
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("median_age", DoubleType(), True),
        StructField("male_population", IntegerType(), True),
        StructField("female_population", IntegerType(), True),
        # NOTE(review): "total_polulation" looks like a typo for
        # "total_population"; kept as-is because renaming would change the
        # column name seen by callers.
        StructField("total_polulation", IntegerType(), True),
        StructField("number_veterans", IntegerType(), True),
        StructField("foreign_born", IntegerType(), True),
        StructField("average_household_size", DoubleType(), True),
        StructField("state_code", StringType(), True),
        StructField("race", StringType(), True),
        StructField("quant", IntegerType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ";")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(us_cities_demog_schema)
        .load(us_cities_demographics_path)
    )

In [10]:
def load_airport_codes(spark):
    """
    Read the airport codes CSV into a Spark DataFrame.

    The file at ``airport_codes_csv_path`` is read as a comma-separated,
    utf-8 encoded CSV with a header row. The explicit schema below supplies
    the column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with columns ident, type, name, elevation_ft,
        continent, iso_country, iso_region, municipality, gps_code,
        iata_code, local_code and coordinates.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    airport_codes_schema = StructType([
        StructField("ident", StringType(), True),
        StructField("type", StringType(), True),
        StructField("name", StringType(), True),
        StructField("elevation_ft", IntegerType(), True),
        StructField("continent", StringType(), True),
        StructField("iso_country", StringType(), True),
        StructField("iso_region", StringType(), True),
        StructField("municipality", StringType(), True),
        StructField("gps_code", StringType(), True),
        StructField("iata_code", StringType(), True),
        StructField("local_code", StringType(), True),
        StructField("coordinates", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(airport_codes_schema)
        .load(airport_codes_csv_path)
    )

In [11]:
def load_country(spark):
    """
    Read the country lookup CSV into a Spark DataFrame.

    The file at ``country_path`` is read as a semicolon-separated, utf-8
    encoded CSV with a header row.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with an integer ``code`` column and a string
        ``name`` column.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    country_schema = StructType([
        StructField("code", IntegerType(), True),
        StructField("name", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ";")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(country_schema)
        .load(country_path)
    )

In [12]:
def load_transport_vehicle(spark):
    """
    Read the transport vehicle lookup CSV into a Spark DataFrame.

    The file at ``transport_vehicle_path`` is read as a
    semicolon-separated, utf-8 encoded CSV with a header row.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with an integer ``code`` column and a string
        ``name`` column.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    transport_vehicle_schema = StructType([
        StructField("code", IntegerType(), True),
        StructField("name", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ";")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(transport_vehicle_schema)
        .load(transport_vehicle_path)
    )

In [13]:
def load_state_usa(spark):
    """
    Read the US state lookup CSV into a Spark DataFrame.

    The file at ``state_usa_path`` is read as a semicolon-separated, utf-8
    encoded CSV with a header row.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with string columns ``code`` and ``name``.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    state_usa_schema = StructType([
        StructField("code", StringType(), True),
        StructField("name", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ";")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(state_usa_schema)
        .load(state_usa_path)
    )

In [14]:
def load_motivation(spark):
    """
    Read the motivation lookup CSV into a Spark DataFrame.

    The file at ``motivation_path`` is read as a semicolon-separated, utf-8
    encoded CSV with a header row.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with an integer ``code`` column and a string
        ``name`` column.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    motivation_schema = StructType([
        StructField("code", IntegerType(), True),
        StructField("name", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ";")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(motivation_schema)
        .load(motivation_path)
    )

In [15]:
def load_immigration(spark):
    """
    Read the immigration sample CSV into a Spark DataFrame.

    The file at ``immigration_path`` is read as a comma-separated, utf-8
    encoded CSV with a header row. The explicit schema below supplies the
    column names and types of the result.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with the 29 columns listed in the schema below, from
        ``passender_id`` through ``visatype``.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    immigration_schema = StructType([
        # NOTE(review): "passender_id" looks like a typo for "passenger_id";
        # kept as-is because renaming would change the column name seen by
        # callers.
        StructField("passender_id", IntegerType(), True),
        StructField("cicid", DoubleType(), True),
        StructField("i94yr", DoubleType(), True),
        StructField("i94mon", DoubleType(), True),
        StructField("i94cit", DoubleType(), True),
        StructField("i94res", DoubleType(), True),
        StructField("i94port", StringType(), True),
        StructField("arrdate", DoubleType(), True),
        StructField("i94mode", DoubleType(), True),
        StructField("i94addr", StringType(), True),
        StructField("depdate", DoubleType(), True),
        StructField("i94bir", DoubleType(), True),
        StructField("i94visa", DoubleType(), True),
        StructField("count", DoubleType(), True),
        StructField("dtadfile", StringType(), True),
        StructField("visapost", StringType(), True),
        StructField("occup", StringType(), True),
        StructField("entdepa", StringType(), True),
        StructField("entdepd", StringType(), True),
        StructField("entdepu", StringType(), True),
        StructField("matflag", StringType(), True),
        StructField("biryear", DoubleType(), True),
        StructField("dtaddto", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("insnum", StringType(), True),
        StructField("airline", StringType(), True),
        StructField("admnum", DoubleType(), True),
        StructField("fltno", StringType(), True),
        StructField("visatype", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    return (
        spark.read
        .format("csv")
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "utf-8")
        .schema(immigration_schema)
        .load(immigration_path)
    )

In [16]:
def load_port(spark):
    """
    Read the port lookup CSV and derive two columns from the port name.

    The file at ``port_path`` is read as a semicolon-separated, utf-8
    encoded CSV *without* a header row. The ``name`` column is then split
    on commas: the trimmed first piece becomes ``portal`` and the trimmed
    second piece becomes ``country_acronym``.

    :param spark: SparkSession used to create the reader.
    :return: DataFrame with columns code, name, portal and country_acronym.
    """
    # Fields are declared nullable: the reader cannot guarantee that every
    # CSV cell is populated, so nullable=False would be misleading.
    port_schema = StructType([
        StructField("code", StringType(), True),
        StructField("name", StringType(), True),
    ])

    # Use the built-in "csv" source rather than the legacy
    # "com.databricks.spark.csv" alias.
    df_port = (
        spark.read
        .format("csv")
        .option("sep", ";")
        .option("header", "false")
        .option("encoding", "utf-8")
        .schema(port_schema)
        .load(port_path)
    )

    # Split once and index the resulting array expression directly; this
    # avoids materialising (and then dropping) a temporary column.
    name_parts = F.split(df_port["name"], ",")
    return (
        df_port
        .withColumn("portal", F.trim(name_parts[0]))
        .withColumn("country_acronym", F.trim(name_parts[1]))
    )

In [34]:
def main():
    """
    Load every source table, join a subset of them and print a sample.

    Creates the Spark session, loads all datasets through the ``load_*``
    helpers, joins the state, temperature, demographics, airport, country
    and immigration tables on upper-cased keys, and prints at most five rows
    of the joined result as a pandas DataFrame.

    :return: None; the sample is written to standard output.
    """
    spark = create_spark_session()

    df_glob_temp = load_glob_temp(spark).distinct()

    df_glob_temp_state = load_glob_temp_state(spark).select("state", "country").distinct()

    df_glob_temp_major_city = load_global_land_temp_major_city(spark).limit(1000)

    df_glob_temp_country = load_global_land_temp_by_country(spark).limit(1000)

    df_glob_temp_city = load_global_land_temp_by_city(spark).select("country", "city").distinct()

    df_us_cities_demog = load_us_cities_demographics(spark).select("city").distinct()

    df_airport_codes = load_airport_codes(spark).limit(1000)

    df_country = load_country(spark).select("name", "code").distinct()

    df_transport_vehicle = load_transport_vehicle(spark).limit(1000)

    df_state_usa = load_state_usa(spark).distinct()

    df_motivation = load_motivation(spark)

    df_immigration = load_immigration(spark)

    df_port = load_port(spark)

    # NOTE: df_glob_temp, df_glob_temp_major_city, df_glob_temp_country,
    # df_transport_vehicle, df_motivation and df_port are loaded but not yet
    # part of the join below.

    # States matched to the temperature table by (upper-cased) state name.
    df_join_state_glob_temp = df_state_usa.join(
        df_glob_temp_state,
        upper(df_state_usa.name) == upper(df_glob_temp_state.state),
    )

    # Conditions reference the originating DataFrames explicitly because the
    # joined result contains repeated column names (e.g. country, city, name).
    df_join = (
        df_join_state_glob_temp
        .join(df_glob_temp_city, upper(df_join_state_glob_temp.country) == upper(df_glob_temp_city.country))
        .join(df_us_cities_demog, upper(df_glob_temp_city.city) == upper(df_us_cities_demog.city))
        .join(df_airport_codes, upper(df_airport_codes.municipality) == upper(df_us_cities_demog.city))
        .join(df_country, upper(df_country.name) == upper(df_glob_temp_state.country))
        .join(df_immigration, upper(df_immigration.i94addr) == upper(df_join_state_glob_temp.code))
        .limit(5)
    )

    # df_join is already capped at five rows, so no further limit()/head()
    # is needed before printing.
    print(df_join.toPandas())

    # Everything up to here works; the satellite tables still need to be
    # connected.

In [35]:
# Run the pipeline; main() prints its sample of the joined data to this cell's output.
main()

Start the application
   code   name  state        country        country              city  \
0    ID  IDAHO  Idaho  United States  United States  Huntington Beach   
1    ID  IDAHO  Idaho  United States  United States      Indianapolis   
2    ID  IDAHO  Idaho  United States  United States      Indianapolis   
3    ID  IDAHO  Idaho  United States  United States           Detroit   
4    ID  IDAHO  Idaho  United States  United States           Phoenix   

               city ident      type                                name  \
0  Huntington Beach  02CA  heliport  Swepi Beta Platform Ellen Heliport   
1      Indianapolis  0IN8    closed         Roto-Whirl/Vantage Heliport   
2      Indianapolis  0IN7    closed         Roto-Whirl/Holiday Heliport   
3           Detroit  0MI9  heliport        Henry Ford Hospital Heliport   
4           Phoenix  0AZ7  heliport                   Sunstate Heliport   

   elevation_ft continent iso_country iso_region      municipality gps_code  \
0        

In [None]:
# Module-level SparkSession used by the exploratory cells that follow.
spark = create_spark_session()

In [None]:
# Immigration sample loaded with the explicit schema from load_immigration().
df_immigration = load_immigration(spark)

In [None]:
# Preview: cap at 10 rows in Spark, convert to pandas, then show head().
df_immigration.limit(10).toPandas().head()

In [None]:
# Port lookup, including the portal / country_acronym columns derived in load_port().
df_port = load_port(spark)
# Preview: cap at 10 rows in Spark, convert to pandas, then show head().
df_port.limit(10).toPandas().head()

In [None]:
# Preview distinct (name, code) pairs. The row cap is applied in Spark so
# only the previewed rows are collected into pandas, matching the other
# preview cells.
load_country(spark).select("name", "code").distinct().limit(5).toPandas()

In [None]:
# Preview distinct major-city temperature rows. The row cap is applied in
# Spark so the whole dataset is not collected into pandas just to show
# five rows.
load_global_land_temp_major_city(spark).distinct().limit(5).toPandas()

In [None]:
# Preview up to five distinct per-city temperature rows.
load_global_land_temp_by_city(spark).distinct().limit(5).toPandas().head()

In [None]:
# Preview up to five distinct US city demographics rows.
load_us_cities_demographics(spark).distinct().limit(5).toPandas().head()

In [None]:
# De-duplicated state lookup table.
df_state_usa = load_state_usa(spark).distinct()

# Join immigration rows to states on the upper-cased state code and preview
# up to five matches.
(
    df_immigration
    .join(
        df_state_usa,
        upper(df_immigration.i94addr) == upper(df_state_usa.code),
    )
    .limit(5)
    .toPandas()
    .head()
)

In [None]:
# Per-state temperature table.
df_temp_state = load_glob_temp_state(spark)

# Join temperatures to the state lookup on the upper-cased state name and
# preview up to five matches.
(
    df_temp_state
    .join(
        df_state_usa,
        upper(df_temp_state.state) == upper(df_state_usa.name),
    )
    .limit(5)
    .toPandas()
    .head()
)