In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit
import pyspark.sql.functions as F
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
from pyspark.sql.functions import col
from pyspark.sql.functions import *

import pandas as pd
# Show every column when a pandas frame is displayed.
# The fully-qualified key is required: the short form 'max_columns' is matched
# as a pattern and is ambiguous on pandas versions that also define
# 'styler.render.max_columns', which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

In [2]:
def create_spark_session():
    """
    Announce start-up and return the SparkSession used by the notebook.

    The session builder is configured with the `spark.jars.packages` setting
    pointing at the hadoop-aws 2.8.5 artifact, then `getOrCreate()` is called.

    Returns:
        The SparkSession returned by `getOrCreate()`.
    """
    print("Start the application")
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.8.5"
    )
    return builder.getOrCreate()

In [3]:
# Local filesystem locations of the input datasets.
#
# The data directory can be overridden with the CAPSTONE_DATA_ROOT environment
# variable so the notebook is not tied to one machine; when the variable is
# unset (or empty) the original location is used.
DEFAULT_ROOT_PATH = "/home/alison/curso/ED-Udacity/capstone-athena/projeto-udacity/data/"

# Joining with "" guarantees a trailing separator, which the plain string
# concatenations below rely on.
root_path = os.path.join(os.environ.get("CAPSTONE_DATA_ROOT") or DEFAULT_ROOT_PATH, "")

# Temperature datasets
global_temperatures_path = root_path + "temperatures/GlobalTemperatures.csv"
global_land_temperatures_by_state_path = root_path + "temperatures/GlobalLandTemperaturesByState.csv"
global_land_temperatures_by_major_city_path = root_path + "temperatures/GlobalLandTemperaturesByMajorCity.csv"
global_land_temperatures_by_country_path = root_path + "temperatures/GlobalLandTemperaturesByCountry.csv"
global_land_temperatures_by_city_path = root_path + "temperatures/GlobalLandTemperaturesByCity.csv"

# Demographics, airports and lookup tables
us_cities_demographics_path = root_path + "us_cities_demographics.csv"
airport_codes_csv_path = root_path + "airport_codes_csv.csv"
country_path = root_path + "country.csv"
transport_vehicle_path = root_path + "transport_vehicle.csv"
state_usa_path = root_path + 'state_usa.csv'
motivation_path = root_path + 'motivation.csv'
immigration_path = root_path + 'immigration_data_sample.csv'
port_path = root_path + 'port.csv'


In [4]:
def load_temp_state(spark):
    """
    Read the per-state land temperature CSV into a DataFrame.

    The file at `global_land_temperatures_by_state_path` is parsed as a
    comma-separated, UTF-8 CSV with a header row, using an explicit schema
    whose columns carry the `t_state_` prefix.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # (column name, column type) in file order.
    columns = (
        ("t_state_dt", DateType()),
        ('t_state_average_temperature', DoubleType()),
        ('t_state_average_temperature_uncertainty', DoubleType()),
        ('t_state_state', StringType()),
        ('t_state_country', StringType()),
    )
    # Every column is declared with nullable=False.
    schema = StructType(
        [StructField(name, dtype, nullable=False) for name, dtype in columns]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(schema)
    )
    return reader.load(global_land_temperatures_by_state_path)

In [5]:
def load_temp_glob(spark):
    """
    Read the global temperature CSV into a DataFrame.

    The file at `global_temperatures_path` is parsed as a comma-separated,
    UTF-8 CSV with a header row, using an explicit schema: one date column
    followed by eight double columns, all prefixed with `t_glob_`.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # Names of the double-typed measurement columns, in file order.
    measurement_names = (
        "t_glob_land_average_temperature",
        "t_glob_land_average_temperature_uncertainty",
        "t_glob_land_max_temperature",
        "t_glob_land_max_temperature_uncertainty",
        "t_glob_land_min_temperature",
        "t_glob_land_min_temperature_uncertainty",
        "t_glob_land_and_ocean_average_temperature",
        "t_glob_land_and_ocean_average_temperature_uncertainty",
    )
    # Every column is declared with nullable=False.
    fields = [StructField("t_glob_dt", DateType(), nullable=False)]
    fields.extend(
        StructField(name, DoubleType(), nullable=False) for name in measurement_names
    )
    schema = StructType(fields)

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(schema)
    )
    return reader.load(global_temperatures_path)

In [6]:
def load_temp_major_city(spark):
    """
    Read the major-city land temperature CSV into a DataFrame.

    The file at `global_land_temperatures_by_major_city_path` is parsed as a
    comma-separated, UTF-8 CSV with a header row, using an explicit schema
    whose columns carry the `t_m_city_` prefix. Latitude and longitude are
    kept as strings.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # (column name, column type) in file order.
    columns = (
        ("t_m_city_dt", DateType()),
        ("t_m_city_average_temperature", DoubleType()),
        ("t_m_city_average_temperature_uncertainty", DoubleType()),
        ("t_m_city_city", StringType()),
        ("t_m_city_country", StringType()),
        ("t_m_city_latitude", StringType()),
        ("t_m_city_longitude", StringType()),
    )
    # Every column is declared with nullable=False.
    schema = StructType(
        [StructField(name, dtype, nullable=False) for name, dtype in columns]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "UTF-8")
        .schema(schema)
    )
    return reader.load(global_land_temperatures_by_major_city_path)

In [7]:
def load_temp_by_country(spark):
    """
    Read the per-country land temperature CSV into a DataFrame.

    The file at `global_land_temperatures_by_country_path` is parsed as a
    comma-separated, UTF-8 CSV with a header row, using an explicit schema
    whose columns carry the `t_country_` prefix.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # (column name, column type) in file order.
    columns = (
        ("t_country_dt", DateType()),
        ("t_country_average_temperature", DoubleType()),
        ("t_country_average_temperature_uncertainty", DoubleType()),
        ("t_country_country", StringType()),
    )
    # Every column is declared with nullable=False.
    schema = StructType(
        [StructField(name, dtype, nullable=False) for name, dtype in columns]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "UTF-8")
        .schema(schema)
    )
    return reader.load(global_land_temperatures_by_country_path)

In [8]:
def load_temp_by_city(spark):
    """
    Read the per-city land temperature CSV into a DataFrame.

    The file at `global_land_temperatures_by_city_path` is parsed as a
    comma-separated, utf-8 CSV with a header row, using an explicit schema
    whose columns carry the `t_city_` prefix. Latitude and longitude are kept
    as strings.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # (column name, column type) in file order.
    columns = (
        ("t_city_dt", DateType()),
        ("t_city_average_temperature", DoubleType()),
        ("t_city_average_temperature_uncertainty", DoubleType()),
        ("t_city_city", StringType()),
        ("t_city_country", StringType()),
        ("t_city_latitude", StringType()),
        ("t_city_longitude", StringType()),
    )
    # Every column is declared with nullable=False.
    schema = StructType(
        [StructField(name, dtype, nullable=False) for name, dtype in columns]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(schema)
    )
    return reader.load(global_land_temperatures_by_city_path)

In [9]:
def load_cities_demographics(spark):
    """
    Read the US cities demographics CSV into a DataFrame.

    The file at `us_cities_demographics_path` is parsed as a
    semicolon-separated, utf-8 CSV with a header row, using an explicit
    schema whose columns carry the `cit_demog_` prefix.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # (column name, column type) in file order.
    # NOTE: "cit_demog_total_polulation" is misspelt, but it is the column
    # name the rest of the pipeline sees, so it is kept as is.
    columns = (
        ("cit_demog_city", StringType()),
        ("cit_demog_state", StringType()),
        ("cit_demog_median_age", DoubleType()),
        ("cit_demog_male_population", IntegerType()),
        ("cit_demog_female_population", IntegerType()),
        ("cit_demog_total_polulation", IntegerType()),
        ("cit_demog_number_veterans", IntegerType()),
        ("cit_demog_foreign_born", IntegerType()),
        ("cit_demog_average_household_size", DoubleType()),
        ("cit_demog_state_code", StringType()),
        ("cit_demog_race", StringType()),
        ("cit_demog_quant", IntegerType()),
    )
    # Every column is declared with nullable=False.
    schema = StructType(
        [StructField(name, dtype, nullable=False) for name, dtype in columns]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ';')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(schema)
    )
    return reader.load(us_cities_demographics_path)

In [10]:
def load_airport_codes(spark):
    """
    Read the airport codes CSV into a DataFrame.

    The file at `airport_codes_csv_path` is parsed as a comma-separated,
    utf-8 CSV with a header row, using an explicit schema whose columns carry
    the `airp_` prefix. Only the elevation column is an integer; every other
    column is a string.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # (column name, column type) in file order.
    columns = (
        ("airp_ident", StringType()),
        ("airp_type", StringType()),
        ("airp_name", StringType()),
        ("airp_elevation_ft", IntegerType()),
        ("airp_continent", StringType()),
        ("airp_iso_country", StringType()),
        ("airp_iso_region", StringType()),
        ("airp_municipality", StringType()),
        ("airp_gps_code", StringType()),
        ("airp_iata_code", StringType()),
        ("airp_local_code", StringType()),
        ("airp_coordinates", StringType()),
    )
    # Every column is declared with nullable=False.
    schema = StructType(
        [StructField(name, dtype, nullable=False) for name, dtype in columns]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(schema)
    )
    return reader.load(airport_codes_csv_path)

In [11]:
def load_country(spark):
    """
    Read the country lookup CSV into a DataFrame.

    The file at `country_path` is parsed as a semicolon-separated, utf-8 CSV
    with a header row, using a two-column schema: an integer `country_code`
    and a string `country_name`.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # Both columns are declared with nullable=False.
    code_field = StructField("country_code", IntegerType(), nullable=False)
    name_field = StructField("country_name", StringType(), nullable=False)
    schema = StructType([code_field, name_field])

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ';')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(schema)
    )
    return reader.load(country_path)

In [12]:
def load_transport_vehicle(spark):
    """
    Read the transport vehicle lookup CSV into a DataFrame.

    The file at `transport_vehicle_path` is parsed as a semicolon-separated,
    utf-8 CSV with a header row, using a two-column schema: an integer
    `vehi_code` and a string `vehi_name`.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # Both columns are declared with nullable=False.
    code_field = StructField("vehi_code", IntegerType(), nullable=False)
    name_field = StructField("vehi_name", StringType(), nullable=False)
    schema = StructType([code_field, name_field])

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ';')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(schema)
    )
    return reader.load(transport_vehicle_path)

In [13]:
def load_state_usa(spark):
    """
    Read the US state lookup CSV into a DataFrame.

    The file at `state_usa_path` is parsed as a semicolon-separated, utf-8
    CSV with a header row, using a schema of two string columns:
    `state_usa_code` and `state_usa_name`.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # Both columns are strings declared with nullable=False.
    schema = StructType([
        StructField(name, StringType(), nullable=False)
        for name in ("state_usa_code", "state_usa_name")
    ])

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ';')
        .option('header', 'true')
        .option('encoding', 'utf-8')
        .schema(schema)
    )
    return reader.load(state_usa_path)

In [14]:
def load_motivation(spark):
    """
    Read the motivation lookup CSV into a DataFrame.

    The file at `motivation_path` is parsed as a semicolon-separated, utf-8
    CSV with a header row, using a two-column schema: an integer `motiv_code`
    and a string `motiv_name`.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # Both columns are declared with nullable=False.
    code_field = StructField("motiv_code", IntegerType(), nullable=False)
    name_field = StructField("motiv_name", StringType(), nullable=False)
    schema = StructType([code_field, name_field])

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ';')
        .option('header', 'true')
        .option('encoding', 'utf-8')
        .schema(schema)
    )
    return reader.load(motivation_path)

In [15]:
def load_immigration(spark):
    """
    Read the immigration sample CSV into a DataFrame.

    The file at `immigration_path` is parsed as a comma-separated, utf-8 CSV
    with a header row, using an explicit 29-column schema whose columns carry
    the `immig_` prefix. The first column is an integer; the remaining
    columns are doubles or strings.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        The DataFrame produced by the reader.
    """
    # (column name, column type) in file order.
    # NOTE: "immig_passender_id" is misspelt, but it is the column name the
    # rest of the pipeline sees, so it is kept as is.
    columns = (
        ("immig_passender_id", IntegerType()),
        ("immig_cicid", DoubleType()),
        ("immig_i94yr", DoubleType()),
        ("immig_i94mon", DoubleType()),
        ("immig_i94cit", DoubleType()),
        ("immig_i94res", DoubleType()),
        ("immig_i94port", StringType()),
        ("immig_arrdate", DoubleType()),
        ("immig_i94mode", DoubleType()),
        ("immig_i94addr", StringType()),
        ("immig_depdate", DoubleType()),
        ("immig_i94bir", DoubleType()),
        ("immig_i94visa", DoubleType()),
        ("immig_count", DoubleType()),
        ("immig_dtadfile", StringType()),
        ("immig_visapost", StringType()),
        ("immig_occup", StringType()),
        ("immig_entdepa", StringType()),
        ("immig_entdepd", StringType()),
        ("immig_entdepu", StringType()),
        ("immig_matflag", StringType()),
        ("immig_biryear", DoubleType()),
        ("immig_dtaddto", StringType()),
        ("immig_gender", StringType()),
        ("immig_insnum", StringType()),
        ("immig_airline", StringType()),
        ("immig_admnum", DoubleType()),
        ("immig_fltno", StringType()),
        ("immig_visatype", StringType()),
    )
    # Every column is declared with nullable=False.
    schema = StructType(
        [StructField(name, dtype, nullable=False) for name, dtype in columns]
    )

    reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option('encoding', 'utf-8')
        .schema(schema)
    )
    return reader.load(immigration_path)

In [18]:
def load_port(spark):
    """
    Read the port lookup CSV and split the port name into two extra columns.

    The file at `port_path` is parsed as a semicolon-separated, utf-8 CSV
    without a header row into two string columns, `port_code` and
    `port_name`. `port_name` is then split on commas: the trimmed first piece
    becomes `port_portal` and the trimmed second piece becomes
    `port_country_acronym`.

    Args:
        spark: Session whose `read` interface is used to load the file.

    Returns:
        A DataFrame with `port_code`, `port_name`, `port_portal` and
        `port_country_acronym`.
    """
    # Both columns are strings declared with nullable=False.
    schema = StructType([
        StructField(name, StringType(), nullable=False)
        for name in ("port_code", "port_name")
    ])

    raw_ports = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ';')
        .option('header', 'false')
        .option('encoding', 'utf-8')
        .schema(schema)
        .load(port_path)
    )

    # 'column_drop' is a scratch array column holding the comma-separated
    # pieces of port_name; it is removed again before returning.
    with_pieces = raw_ports.withColumn(
        'column_drop', F.split(raw_ports['port_name'], ',')
    )
    pieces = F.col('column_drop')
    enriched = with_pieces.withColumn('port_portal', trim(pieces[0]))
    enriched = enriched.withColumn('port_country_acronym', trim(pieces[1]))
    return enriched.drop('column_drop')

In [23]:
def big_table(spark):
    """
    Load every source dataset and join the core ones into one wide DataFrame.

    Join chain (no `how` argument is passed, so Spark's default join type
    applies; string keys are compared after upper-casing both sides):

      1. US states              <-> state temperatures    on state name
      2. result                 <-> city temperatures     on country
      3. result                 <-> city demographics     on city name
      4. result                 <-> airport codes         on municipality / city
      5. result                 <-> countries             on country name
      6. result                 <-> immigration           on i94addr / state code
      7. result                 <-> transport vehicles    on i94mode / vehicle code
      8. result                 <-> motivations           on i94visa / motivation code

    The global, major-city and per-country temperature frames and the port
    frame are loaded here but are not part of the join.

    Args:
        spark: Session passed to each `load_*` function.

    Returns:
        The joined DataFrame.
    """
    # Sources are loaded in the same order as before. Several are reduced to
    # their distinct key columns or capped at 1000 rows before joining.
    global_temps = load_temp_glob(spark).distinct()  # not joined below
    state_temps = load_temp_state(spark).select("t_state_state", "t_state_country").distinct()
    major_city_temps = load_temp_major_city(spark).limit(1000)  # not joined below
    country_temps = load_temp_by_country(spark).limit(1000)  # not joined below
    city_temps = load_temp_by_city(spark).select("t_city_country", "t_city_city").distinct()
    demographics = load_cities_demographics(spark).select("cit_demog_city").distinct()
    airports = load_airport_codes(spark).limit(1000)
    countries = load_country(spark).select("country_name", "country_code").distinct()
    vehicles = load_transport_vehicle(spark).limit(1000)
    usa_states = load_state_usa(spark).distinct()
    motivations = load_motivation(spark)
    immigration = load_immigration(spark)
    ports = load_port(spark)  # not joined below

    # Step 1: attach the (state, country) pairs to the US state lookup.
    states_with_temps = usa_states.join(
        state_temps,
        upper(usa_states.state_usa_name) == upper(state_temps.t_state_state),
    )

    # Steps 2-8: (right-hand frame, join condition), applied in order.
    join_steps = [
        (
            city_temps,
            upper(states_with_temps.t_state_country) == upper(city_temps.t_city_country),
        ),
        (
            demographics,
            upper(city_temps.t_city_city) == upper(demographics.cit_demog_city),
        ),
        (
            airports,
            upper(airports.airp_municipality) == upper(demographics.cit_demog_city),
        ),
        (
            countries,
            upper(countries.country_name) == upper(state_temps.t_state_country),
        ),
        (
            immigration,
            upper(immigration.immig_i94addr) == upper(states_with_temps.state_usa_code),
        ),
        (
            vehicles,
            immigration.immig_i94mode == vehicles.vehi_code,
        ),
        (
            motivations,
            motivations.motiv_code == immigration.immig_i94visa,
        ),
    ]

    joined = states_with_temps
    for right_frame, condition in join_steps:
        joined = joined.join(right_frame, condition)

    return joined

In [24]:
def main(save_path="/home/alison/curso/ED-Udacity/capstone-pyspark/results/graph"):
    """
    Run the pipeline: create the Spark session, build the joined table and
    write it out as CSV.

    Args:
        save_path: Directory the CSV output is written to. Defaults to the
            location that used to be hard-coded in this function, so calling
            `main()` with no arguments behaves as before.

    Returns:
        The joined DataFrame returned by `big_table`.
    """
    spark = create_spark_session()

    joined = big_table(spark)

    # mode('overwrite') replaces whatever already exists at save_path.
    (
        joined.write
        .format('com.databricks.spark.csv')
        .mode('overwrite')
        .option("header", "true")
        .save(save_path)
    )

    return joined
    
# TODO: everything works up to this point; the satellite tables still need to be joined in.

In [None]:
result = main()

# Preview a bounded number of rows only: calling toPandas() on the full join
# collects every row onto the driver just to render the cell output.
result.limit(20).toPandas()

Start the application
