In [32]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit
import pyspark.sql.functions as psf
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType
from pyspark.sql.functions import col
from pyspark.sql.functions import *

In [26]:
def create_spark_session():
    """
    Obtain the SparkSession used by the rest of the notebook.

    Prints a start-up message, then configures the session builder with the
    ``org.apache.hadoop:hadoop-aws:2.8.5`` package via ``spark.jars.packages``
    and calls ``getOrCreate()`` on it.

    Returns:
        The session object returned by ``getOrCreate()``.
    """
    print("Start the application")
    session_builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.8.5"
    )
    return session_builder.getOrCreate()

In [196]:
# Local filesystem locations of the input CSV files.
# The data root defaults to the original development directory, but it can be
# overridden with the CAPSTONE_DATA_ROOT environment variable so the notebook
# is not tied to a single machine.
_default_root_path = "/home/alison/curso/ED-Udacity/capstone-athena/projeto-udacity/data/"
# `or` falls back to the default when the variable is unset or empty. The
# trailing slash is normalized because every path below is built by plain
# string concatenation.
root_path = (os.environ.get("CAPSTONE_DATA_ROOT") or _default_root_path).rstrip("/") + "/"
global_temperatures_path = root_path + "temperatures/GlobalTemperatures.csv"
global_land_temperatures_by_state_path = root_path + "temperatures/GlobalLandTemperaturesByState.csv"
global_land_temperatures_by_major_city_path = root_path + "temperatures/GlobalLandTemperaturesByMajorCity.csv"
global_land_temperatures_by_country_path = root_path + "temperatures/GlobalLandTemperaturesByCountry.csv"
global_land_temperatures_by_city_path = root_path + "temperatures/GlobalLandTemperaturesByCity.csv"
us_cities_demographics_path = root_path + "us_cities_demographics.csv"
airport_codes_csv_path = root_path + "airport_codes_csv.csv"
country_path = root_path + "country.csv"

In [178]:
def load_glob_temp_state(spark):
    """
    Read the by-state land temperature CSV using an explicit schema.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading
        ``global_land_temperatures_by_state_path``.
    """
    # (column name, Spark type) pairs in the order they are handed to
    # StructType; every field is declared with nullable=False.
    column_specs = [
        ("dt", DateType()),
        ('average_temperature', DoubleType()),
        ('average_temperature_uncertainty', DoubleType()),
        ('state', StringType()),
        ('country', StringType()),
    ]
    state_schema = StructType(
        [StructField(col_name, col_type, False) for col_name, col_type in column_specs]
    )

    csv_reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option("sep", ",")
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(state_schema)
    )
    return csv_reader.load(global_land_temperatures_by_state_path)

In [179]:
def load_glob_temp(spark):
    """
    Read the global temperature CSV using an explicit schema.

    The schema is a ``dt`` date column followed by eight double-precision
    measurement columns.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading ``global_temperatures_path``.
    """
    # All measurement columns share the same type, so only their names vary.
    measurement_columns = [
        "land_average_temperature",
        "land_average_temperature_uncertainty",
        "land_max_temperature",
        "land_max_temperature_uncertainty",
        "land_min_temperature",
        "land_min_temperature_uncertainty",
        "land_and_ocean_average_temperature",
        "land_and_ocean_average_temperature_uncertainty",
    ]
    # NOTE(review): every field is declared non-nullable, yet the notebook
    # output shows null measurement values being loaded -- confirm whether
    # nullable should be True here.
    schema_fields = [StructField("dt", DateType(), False)]
    schema_fields += [
        StructField(col_name, DoubleType(), False) for col_name in measurement_columns
    ]
    temperatures_schema = StructType(schema_fields)

    csv_reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option("header", "true")
        .option("encoding", "UTF-8")
        .schema(temperatures_schema)
    )
    return csv_reader.load(global_temperatures_path)

In [180]:
def load_global_land_temp_major_city(spark):
    """
    Read the by-major-city land temperature CSV using an explicit schema.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading
        ``global_land_temperatures_by_major_city_path``.
    """
    # Columns grouped by type; concatenation keeps the original field order:
    # date, the two double measurements, then the four string columns
    # (latitude and longitude are declared as strings).
    double_columns = ["average_temperature", "average_temperature_uncertainty"]
    string_columns = ["city", "country", "latitude", "longitude"]

    schema_fields = (
        [StructField("dt", DateType(), False)]
        + [StructField(col_name, DoubleType(), False) for col_name in double_columns]
        + [StructField(col_name, StringType(), False) for col_name in string_columns]
    )
    major_city_schema = StructType(schema_fields)

    csv_reader = spark.read.format('com.databricks.spark.csv')
    csv_reader = csv_reader.option('sep', ',')
    csv_reader = csv_reader.option('header', 'true')
    csv_reader = csv_reader.option("encoding", "UTF-8")
    csv_reader = csv_reader.schema(major_city_schema)
    return csv_reader.load(global_land_temperatures_by_major_city_path)

In [181]:
def load_global_land_temp_by_country(spark):
    """
    Read the by-country land temperature CSV using an explicit schema.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading
        ``global_land_temperatures_by_country_path``.
    """
    # NOTE(review): the temperature fields are declared non-nullable, yet the
    # notebook output shows null readings being loaded -- confirm whether
    # nullable should be True here.
    global_land_temp_by_country_schema = StructType([
        StructField("dt", DateType(), False),
        StructField("average_temperature", DoubleType(), False),
        StructField("average_temperature_uncertainty", DoubleType(), False),
        StructField("country", StringType(), False),
    ])

    csv_reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "UTF-8")
        .schema(global_land_temp_by_country_schema)
    )
    # The module-level constant carries a `_path` suffix; the un-suffixed name
    # is never defined and raises NameError on a fresh kernel.
    return csv_reader.load(global_land_temperatures_by_country_path)

In [182]:
def load_global_land_temp_by_city(spark):
    """
    Read the by-city land temperature CSV using an explicit schema.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading
        ``global_land_temperatures_by_city_path``.
    """
    # Every field is declared with nullable=False.
    required = False
    by_city_fields = [
        StructField("dt", DateType(), required),
        StructField("average_temperature", DoubleType(), required),
        StructField("average_temperature_uncertainty", DoubleType(), required),
        StructField("city", StringType(), required),
        StructField("country", StringType(), required),
        # Coordinates are declared as strings.
        StructField("latitude", StringType(), required),
        StructField("longitude", StringType(), required),
    ]
    by_city_schema = StructType(by_city_fields)

    configured_reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(by_city_schema)
    )
    return configured_reader.load(global_land_temperatures_by_city_path)

In [183]:
def load_us_cities_demographics(spark):
    """
    Read the US cities demographics CSV using an explicit schema.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading ``us_cities_demographics_path``.
    """
    # (column name, Spark type) pairs in the order they are handed to
    # StructType; every field is declared with nullable=False.
    column_specs = [
        ("city", StringType()),
        ("state", StringType()),
        ("median_age", DoubleType()),
        ("male_population", IntegerType()),
        ("female_population", IntegerType()),
        # NOTE(review): "total_polulation" looks like a typo for
        # "total_population"; kept as is because downstream code may
        # reference this column name -- confirm before renaming.
        ("total_polulation", IntegerType()),
        ("number_veterans", IntegerType()),
        ("foreign_born", IntegerType()),
        ("average_household_size", DoubleType()),
        ("state_code", StringType()),
        ("race", StringType()),
        ("quant", IntegerType()),
    ]
    demographics_schema = StructType(
        [StructField(col_name, col_type, False) for col_name, col_type in column_specs]
    )

    csv_reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(demographics_schema)
    )
    return csv_reader.load(us_cities_demographics_path)

In [187]:
def load_airport_codes(spark):
    """
    Read the airport codes CSV using an explicit schema.

    Every column is declared as a string except ``elevation_ft``, which is an
    integer.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading ``airport_codes_csv_path``.
    """
    column_names = [
        "ident",
        "type",
        "name",
        "elevation_ft",
        "continent",
        "iso_country",
        "iso_region",
        "municipality",
        "gps_code",
        "iata_code",
        "local_code",
        "coordinates",
    ]
    # Only elevation_ft is numeric; all fields are declared with nullable=False.
    schema_fields = [
        StructField(
            col_name,
            IntegerType() if col_name == "elevation_ft" else StringType(),
            False,
        )
        for col_name in column_names
    ]
    airport_schema = StructType(schema_fields)

    csv_reader = (
        spark.read
        .format('com.databricks.spark.csv')
        .option('sep', ',')
        .option('header', 'true')
        .option("encoding", "utf-8")
        .schema(airport_schema)
    )
    return csv_reader.load(airport_codes_csv_path)

In [193]:
def load_country(spark):
    """
    Read the country lookup CSV (integer ``code``, string ``name``).

    Unlike the other loaders in this notebook, this file is read with ``;``
    as the field separator.

    Args:
        spark: session object whose ``read`` attribute supplies the reader.

    Returns:
        The DataFrame produced by loading ``country_path``.
    """
    code_field = StructField("code", IntegerType(), False)
    name_field = StructField("name", StringType(), False)
    lookup_schema = StructType([code_field, name_field])

    csv_reader = spark.read.format('com.databricks.spark.csv')
    csv_reader = csv_reader.option('sep', ';')
    csv_reader = csv_reader.option('header', 'true')
    csv_reader = csv_reader.option("encoding", "utf-8")
    return csv_reader.schema(lookup_schema).load(country_path)

In [197]:
def main():
    """
    Load every input dataset and preview it.

    Creates the Spark session, then for each loader (in the order listed
    below) loads the DataFrame, shows its first rows and prints its schema.

    Returns:
        None. The function only prints to standard output.
    """
    spark = create_spark_session()

    # (loader, truncate) pairs. `truncate` is forwarded to DataFrame.show();
    # the by-state preview keeps the default truncation (True), all others
    # display full cell contents.
    previews = [
        (load_glob_temp, False),
        (load_glob_temp_state, True),
        (load_global_land_temp_major_city, False),
        (load_global_land_temp_by_country, False),
        (load_global_land_temp_by_city, False),
        (load_us_cities_demographics, False),
        (load_airport_codes, False),
        (load_country, False),
    ]

    for loader, truncate in previews:
        frame = loader(spark)
        frame.show(truncate=truncate)
        # printSchema must be called: a bare `frame.printSchema` attribute
        # reference is a no-op and prints nothing.
        frame.printSchema()

In [199]:
# Run the load-and-preview pipeline defined in main().
main()

Start the application
+----------+------------------------+------------------------------------+--------------------+--------------------------------+--------------------+--------------------------------+----------------------------------+----------------------------------------------+
|dt        |land_average_temperature|land_average_temperature_uncertainty|land_max_temperature|land_max_temperature_uncertainty|land_min_temperature|land_min_temperature_uncertainty|land_and_ocean_average_temperature|land_and_ocean_average_temperature_uncertainty|
+----------+------------------------+------------------------------------+--------------------+--------------------------------+--------------------+--------------------------------+----------------------------------+----------------------------------------------+
|1750-01-01|3.0340000000000003      |3.574                               |null                |null                            |null                |null                            |n

+----------+-------------------+-------------------------------+-------+
|dt        |average_temperature|average_temperature_uncertainty|country|
+----------+-------------------+-------------------------------+-------+
|1743-11-01|4.3839999999999995 |2.294                          |Åland  |
|1743-12-01|null               |null                           |Åland  |
|1744-01-01|null               |null                           |Åland  |
|1744-02-01|null               |null                           |Åland  |
|1744-03-01|null               |null                           |Åland  |
|1744-04-01|1.53               |4.68                           |Åland  |
|1744-05-01|6.702000000000001  |1.789                          |Åland  |
|1744-06-01|11.609000000000002 |1.577                          |Åland  |
|1744-07-01|15.342             |1.41                           |Åland  |
|1744-08-01|null               |null                           |Åland  |
|1744-09-01|11.702             |1.517              

+----+---------------+
|code|name           |
+----+---------------+
|582 |MEXICO         |
|101 |ALBANIA        |
|316 |ALGERIA        |
|236 |AFGHANISTAN    |
|102 |ANDORRA        |
|324 |ANGOLA         |
|529 |ANGUILLA       |
|518 |ANTIGUA-BARBUDA|
|687 |ARGENTINA      |
|151 |ARMENIA        |
|532 |ARUBA          |
|438 |AUSTRALIA      |
|103 |AUSTRIA        |
|152 |AZERBAIJAN     |
|512 |BAHAMAS        |
|298 |BAHRAIN        |
|274 |BANGLADESH     |
|513 |BARBADOS       |
|104 |BELGIUM        |
|581 |BELIZE         |
+----+---------------+
only showing top 20 rows

