## Initializations

In [3]:
# Needed to enable %%sparksql magic (Enables SQL instructions execution)
%load_ext sparksql_magic

In [4]:
# Imports
from pyspark.sql import SparkSession

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col,lag, lit, round, mean as _mean, sum as _sum, expr, create_map, to_date, month, dayofmonth, date_format, min as _min

In [5]:
# Create SparkSession
spark = SparkSession.builder.appName("Meteor Showers") \
    .enableHiveSupport() \
    .getOrCreate()

In [6]:
# Config
cBaseDataPath = "file:///C:/Users/manso/LocalDocuments/10-TechProjects/meteor-showers/data/"

## Create DB structures (according to medallion architecture)

### Bronze Layer

In [7]:
%%sparksql

-- Create dbMeteorShowers Bronze Layer database

-- DROP section (if needed)
--DROP DATABASE dbBMeteorShowers

-- Create database
CREATE DATABASE IF NOT EXISTS dbBMeteorShowers;

### Silver Layer

In [8]:
%%sparksql

-- Create dbMeteorShowers Silver Layer database

-- DROP section (if needed)
--DROP DATABASE dbSMeteorShowers

-- Create database
CREATE DATABASE IF NOT EXISTS dbSMeteorShowers;

### Gold Layer

In [9]:
%%sparksql

-- Create dbMeteorShowers Gold Layer database

-- DROP section (if needed)
--DROP DATABASE dbGMeteorShowers

-- Create database
CREATE DATABASE IF NOT EXISTS dbGMeteorShowers;

In [10]:
# List existing databases
spark.sql('show databases').show()

# List train database tables
tablesdbb = spark.catalog.listTables('dbbmeteorshowers')

print("Table list:")
for table in tablesdbb:
    print(table.name)

+----------------+
|       namespace|
+----------------+
|dbbmeteorshowers|
|dbgmeteorshowers|
|dbsmeteorshowers|
|         default|
+----------------+

Table list:
cities
constellations
meteorshowers
moonphases


# Exercise - Cleanse meteor data

In [11]:
# Import all four .csv files

rawLoadDataEntitiesDict = {
    "meteor_showers": "meteorshowers.csv",
    "moon_phases" : "moonphases.csv",
    "constellations" : "constellations.csv",
    "cities" : "cities.csv"
}

dfmeteor_showers = spark.read.options(inferSchema="True", header= "True").csv(cBaseDataPath + rawLoadDataEntitiesDict["meteor_showers"])
dfmoon_phases = spark.read.options(inferSchema="True", header= "True").csv(cBaseDataPath + rawLoadDataEntitiesDict["moon_phases"])
dfconstellations = spark.read.options(inferSchema="True", header= "True").csv(cBaseDataPath + rawLoadDataEntitiesDict["constellations"])
dfcities = spark.read.options(inferSchema="True", header= "True").csv(cBaseDataPath + rawLoadDataEntitiesDict["cities"])


In [12]:
# Test list comprehension
dictChange = {'name':'Chang\'e','radiant':'Draco','bestmonth':'october','startmonth':'october','startday':1,'endmonth':'october','endday':31,'hemisphere':'northern','preferredhemisphere':'northern'}
listChange = list(dictChange.keys())

strChange = [(dictChange[idx1]) for idx1 in listChange]

print(listChange)
print(dictChange)
print(strChange)


#programming_languages.index("Python")


['name', 'radiant', 'bestmonth', 'startmonth', 'startday', 'endmonth', 'endday', 'hemisphere', 'preferredhemisphere']
{'name': "Chang'e", 'radiant': 'Draco', 'bestmonth': 'october', 'startmonth': 'october', 'startday': 1, 'endmonth': 'october', 'endday': 31, 'hemisphere': 'northern', 'preferredhemisphere': 'northern'}
["Chang'e", 'Draco', 'october', 'october', 1, 'october', 31, 'northern', 'northern']


In [13]:
# Create new entry for meteor shower

change_meteor_shower = {'name':'Chang\'e','radiant':'Draco','bestmonth':'october','startmonth':'october','startday':1,'endmonth':'october','endday':31,'hemisphere':'northern','preferredhemisphere':'northern'}

# Test schema
dfschema = dfmeteor_showers.where("1==2").select(*[(col(x.name).cast(x.dataType)) for x in dfmeteor_showers.schema.fields])
dfschema.printSchema()

# Build target dataframe bringing 1 record from source dataframe
# Note: This is a strategy to pick schema metadata
dfAdd_meteor_shower = dfmeteor_showers.limit(1)

# Dynamically populates target dataframe with dictionary field names:values structure
# Note: Overwrites record brought from source dataframe 
for field in dfmeteor_showers.schema.fields:

    # Get field value from dictionary
    value = change_meteor_shower[field.name]
    dfAdd_meteor_shower = dfAdd_meteor_shower.withColumn(field.name,lit(change_meteor_shower[field.name]).cast(field.dataType))

dfmeteor_showers = dfmeteor_showers.unionAll(dfAdd_meteor_shower)

dfmeteor_showers.show()


root
 |-- name: string (nullable = true)
 |-- radiant: string (nullable = true)
 |-- bestmonth: string (nullable = true)
 |-- startmonth: string (nullable = true)
 |-- startday: integer (nullable = true)
 |-- endmonth: string (nullable = true)
 |-- endday: integer (nullable = true)
 |-- hemisphere: string (nullable = true)
 |-- preferredhemisphere: string (nullable = true)

+------------+--------+---------+----------+--------+--------+------+------------------+-------------------+
|        name| radiant|bestmonth|startmonth|startday|endmonth|endday|        hemisphere|preferredhemisphere|
+------------+--------+---------+----------+--------+--------+------+------------------+-------------------+
|      Lyrids|    Lyra|    april|     april|      21|   april|    22|          northern|           northern|
|Eta Aquarids|Aquarius|      may|     april|      19|     may|    28|northern, southern|           southern|
|    Orionids|   Orion|  october|   october|       2|november|     7|northern,

In [14]:
# Create an entry for the new Draco constellation:
dfconstellations.show()

draco_constellation = {'constellation':'Draco','bestmonth':'july','latitudestart':90,'latitudeend':-15,'besttime':2100,'hemisphere':'northern'}

# Test schema
dfschema = dfconstellations.where("1==2").select(*[(col(x.name).cast(x.dataType)) for x in dfconstellations.schema.fields])
dfschema.printSchema()

# Build target dataframe bringing 1 record from source dataframe
# Note: This is a strategy to pick schema metadata
dfAdd_constellations = dfconstellations.limit(1)

# Dynamically populates target dataframe with dictionary field names:values structure
# Note: Overwrites record brought from source dataframe 
for field in dfconstellations.schema.fields:

    # Get field value from dictionary
    value = draco_constellation[field.name]
    dfAdd_constellations = dfAdd_constellations.withColumn(field.name,lit(draco_constellation[field.name]).cast(field.dataType))

dfconstellations = dfconstellations.unionAll(dfAdd_constellations)

dfconstellations.show()



+-------------+---------+-------------+-----------+-------------------+----------+
|constellation|bestmonth|latitudestart|latitudeend|           besttime|hemisphere|
+-------------+---------+-------------+-----------+-------------------+----------+
|         Lyra|   august|           90|        -40|2024-02-21 21:00:00|  northern|
|     Aquarius|  october|           65|        -90|2024-02-21 21:00:00|  southern|
|        Orion|  january|           85|        -75|2024-02-21 21:00:00|  northern|
|      Perseus| december|           90|        -35|2024-02-21 21:00:00|  northern|
|          Leo|    april|           90|         65|2024-02-21 21:00:00|  northern|
+-------------+---------+-------------+-----------+-------------------+----------+

root
 |-- constellation: string (nullable = true)
 |-- bestmonth: string (nullable = true)
 |-- latitudestart: integer (nullable = true)
 |-- latitudeend: integer (nullable = true)
 |-- besttime: timestamp (nullable = true)
 |-- hemisphere: string (nul

## Explore data

In [15]:
dfmeteor_showers.toPandas().info()
dfmeteor_showers.show(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 6 non-null      object
 1   radiant              6 non-null      object
 2   bestmonth            6 non-null      object
 3   startmonth           6 non-null      object
 4   startday             6 non-null      int32 
 5   endmonth             6 non-null      object
 6   endday               6 non-null      int32 
 7   hemisphere           6 non-null      object
 8   preferredhemisphere  6 non-null      object
dtypes: int32(2), object(7)
memory usage: 516.0+ bytes
+------------+--------+---------+----------+--------+--------+------+------------------+-------------------+
|        name| radiant|bestmonth|startmonth|startday|endmonth|endday|        hemisphere|preferredhemisphere|
+------------+--------+---------+----------+--------+--------+------+------------------+

In [16]:
#dfmoon_phases.toPandas().info()
dfmoon_phases.show(5)

+-------+---+-------------+------------+
|  month|day|    moonphase|specialevent|
+-------+---+-------------+------------+
|january|  1|         NULL|        NULL|
|january|  2|first quarter|        NULL|
|january|  3|         NULL|        NULL|
|january|  4|         NULL|        NULL|
|january|  5|         NULL|        NULL|
+-------+---+-------------+------------+
only showing top 5 rows



In [17]:
#dfconstellations.toPandas().info()
dfconstellations.show(5)

+-------------+---------+-------------+-----------+-------------------+----------+
|constellation|bestmonth|latitudestart|latitudeend|           besttime|hemisphere|
+-------------+---------+-------------+-----------+-------------------+----------+
|         Lyra|   august|           90|        -40|2024-02-21 21:00:00|  northern|
|     Aquarius|  october|           65|        -90|2024-02-21 21:00:00|  southern|
|        Orion|  january|           85|        -75|2024-02-21 21:00:00|  northern|
|      Perseus| december|           90|        -35|2024-02-21 21:00:00|  northern|
|          Leo|    april|           90|         65|2024-02-21 21:00:00|  northern|
+-------------+---------+-------------+-----------+-------------------+----------+
only showing top 5 rows



In [18]:
dfcities.toPandas().info()
dfcities.show(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   city      256 non-null    object 
 1   latitude  256 non-null    float64
 2   country   256 non-null    object 
dtypes: float64(1), object(2)
memory usage: 6.1+ KB
+----------------+--------+--------------------+
|            city|latitude|             country|
+----------------+--------+--------------------+
|       Abu Dhabi|   24.47|United Arab Emirates|
|           Abuja|    9.07|             Nigeria|
|           Accra|    5.55|               Ghana|
|       Adamstown|  -25.07|    Pitcairn Islands|
|     Addis Ababa|    9.02|            Ethiopia|
|         Algiers|   36.77|             Algeria|
|           Alofi|  -19.07|                Niue|
|           Amman|   31.93|              Jordan|
|       Amsterdam|   52.37|         Netherlands|
|Andorra la Vella|    42.5|             Andorra|
+-----------

 ## Load Raw data to Bronze Layer

In [19]:
# Write to (managed) tables on dbMeteorShowers database

dfmeteor_showers.write.mode("Overwrite")\
    .saveAsTable("dbBMeteorShowers.MeteorShowers")

dfmoon_phases.write.mode("Overwrite")\
    .saveAsTable("dbBMeteorShowers.MoonPhases")

dfconstellations.write.mode("Overwrite")\
    .saveAsTable("dbBMeteorShowers.Constellations")

dfcities.write.mode("Overwrite")\
    .saveAsTable("dbBMeteorShowers.Cities")


### Explore Bronze layer information

In [20]:
%%sparksql

-- Explore Bronze layer information

select * 
from dbBMeteorShowers.Cities
LIMIT 5
;


0,1,2
city,latitude,country
Abu Dhabi,24.47,United Arab Emirates
Abuja,9.07,Nigeria
Accra,5.55,Ghana
Adamstown,-25.07,Pitcairn Islands
Addis Ababa,9.02,Ethiopia


## Transform data

In [21]:
# Configure value transformation dictionary structures

trfMonths = {'january':1, 'february':2, 'march':3, 'april':4, 'may':5, 'june':6, 'july':7, 'august':8, 'september':9, 'october':10, 'november':11, 'december':12}
trfHemispheres = {'northern':0, 'southern':1, 'northern, southern':3}
trfPhases = {'new moon':0,'third quarter':0.5, 'first quarter':0.5,'full moon':1.0}

In [22]:
# Convert the month columns to numbers:

# Create map structure for months (check List Comprehension on Python)
mapMonths_col = create_map([lit(x) for i in trfMonths.items() for x in i])

dfmeteor_showers = dfmeteor_showers.withColumn("bestmonthnum", mapMonths_col[col('bestmonth')]) \
    .withColumn("startmonthnum", mapMonths_col[col('startmonth')]) \
    .withColumn("endmonthnum", mapMonths_col[col('endmonth')])

#dfmeteor_showers.show()


dfmoon_phases = dfmoon_phases.withColumn("monthnum", mapMonths_col[col('month')])

#dfmoon_phases.show()


dfconstellations = dfconstellations.withColumn("bestmonthnum", mapMonths_col[col('bestmonth')])

#dfconstellations.show()


In [23]:
dfmeteor_showers.toPandas().info()
dfmeteor_showers.show(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 6 non-null      object
 1   radiant              6 non-null      object
 2   bestmonth            6 non-null      object
 3   startmonth           6 non-null      object
 4   startday             6 non-null      int32 
 5   endmonth             6 non-null      object
 6   endday               6 non-null      int32 
 7   hemisphere           6 non-null      object
 8   preferredhemisphere  6 non-null      object
 9   bestmonthnum         6 non-null      int32 
 10  startmonthnum        6 non-null      int32 
 11  endmonthnum          6 non-null      int32 
dtypes: int32(5), object(7)
memory usage: 588.0+ bytes
+------------+--------+---------+----------+--------+--------+------+------------------+-------------------+------------+-------------+-----------+
|        n

In [24]:
# Create two new columns: startdate and enddate. These columns will contain a month and day in 2020:

dfmeteor_showers = dfmeteor_showers.withColumn("startdate", to_date(lit(2020)*lit(10000)+col("startmonthnum")*lit(100)+col("startday"), "yyyyMMdd")) \
                .withColumn("enddate", to_date(lit(2020)*lit(10000)+col("endmonthnum")*lit(100)+col("endday"), "yyyyMMdd")) 

# ... Or with use of SQL in expr() function:
#dfmeteor_showers = dfmeteor_showers.withColumn("startdate", expr("to_date(2020*10000+startmonthnum*100+startday,'yyyyMMdd')"))

dfmeteor_showers.printSchema()
dfmeteor_showers.show(5)


root
 |-- name: string (nullable = true)
 |-- radiant: string (nullable = true)
 |-- bestmonth: string (nullable = true)
 |-- startmonth: string (nullable = true)
 |-- startday: integer (nullable = true)
 |-- endmonth: string (nullable = true)
 |-- endday: integer (nullable = true)
 |-- hemisphere: string (nullable = true)
 |-- preferredhemisphere: string (nullable = true)
 |-- bestmonthnum: integer (nullable = true)
 |-- startmonthnum: integer (nullable = true)
 |-- endmonthnum: integer (nullable = true)
 |-- startdate: date (nullable = true)
 |-- enddate: date (nullable = true)

+------------+--------+---------+----------+--------+--------+------+------------------+-------------------+------------+-------------+-----------+----------+----------+
|        name| radiant|bestmonth|startmonth|startday|endmonth|endday|        hemisphere|preferredhemisphere|bestmonthnum|startmonthnum|endmonthnum| startdate|   enddate|
+------------+--------+---------+----------+--------+--------+------+---

In [25]:
# Follow the same pattern for moon_phases:

dfmoon_phases = dfmoon_phases.withColumn("date", to_date(lit(2020)*lit(10000)+col("monthnum")*lit(100)+col("day"), "yyyyMMdd"))

dfmoon_phases.printSchema()
dfmoon_phases.show()


root
 |-- month: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- moonphase: string (nullable = true)
 |-- specialevent: string (nullable = true)
 |-- monthnum: integer (nullable = true)
 |-- date: date (nullable = true)

+-------+---+-------------+------------+--------+----------+
|  month|day|    moonphase|specialevent|monthnum|      date|
+-------+---+-------------+------------+--------+----------+
|january|  1|         NULL|        NULL|       1|2020-01-01|
|january|  2|first quarter|        NULL|       1|2020-01-02|
|january|  3|         NULL|        NULL|       1|2020-01-03|
|january|  4|         NULL|        NULL|       1|2020-01-04|
|january|  5|         NULL|        NULL|       1|2020-01-05|
|january|  6|         NULL|        NULL|       1|2020-01-06|
|january|  7|         NULL|        NULL|       1|2020-01-07|
|january|  8|         NULL|        NULL|       1|2020-01-08|
|january|  9|         NULL|        NULL|       1|2020-01-09|
|january| 10|    full moon|  

In [26]:
# Convert hemisphere data to numbers by using the mapping process:

# Create map structure for Hemispheres (check List Comprehension on Python)
mapHemispheres_col = create_map([lit(x) for i in trfHemispheres.items() for x in i])

dfmeteor_showers = dfmeteor_showers.withColumn("hemispherenum", mapHemispheres_col[col('hemisphere')])

dfconstellations = dfconstellations.withColumn("hemispherenum", mapHemispheres_col[col('hemisphere')])

dfmeteor_showers.show(5)
dfconstellations.show(5)


""" hemispheres = {'northern':0, 'southern':1, 'northern, southern':3}
meteor_showers.hemisphere = meteor_showers.hemisphere.map(hemispheres)
constellations.hemisphere = constellations.hemisphere.map(hemispheres) """

+------------+--------+---------+----------+--------+--------+------+------------------+-------------------+------------+-------------+-----------+----------+----------+-------------+
|        name| radiant|bestmonth|startmonth|startday|endmonth|endday|        hemisphere|preferredhemisphere|bestmonthnum|startmonthnum|endmonthnum| startdate|   enddate|hemispherenum|
+------------+--------+---------+----------+--------+--------+------+------------------+-------------------+------------+-------------+-----------+----------+----------+-------------+
|      Lyrids|    Lyra|    april|     april|      21|   april|    22|          northern|           northern|           4|            4|          4|2020-04-21|2020-04-22|            0|
|Eta Aquarids|Aquarius|      may|     april|      19|     may|    28|northern, southern|           southern|           5|            4|          5|2020-04-19|2020-05-28|            3|
|    Orionids|   Orion|  october|   october|       2|november|     7|northern, s

" hemispheres = {'northern':0, 'southern':1, 'northern, southern':3}\nmeteor_showers.hemisphere = meteor_showers.hemisphere.map(hemispheres)\nconstellations.hemisphere = constellations.hemisphere.map(hemispheres) "

In [27]:
# Convert Moon phases to numbers that represent the percentage of the Moon that's visible

# Create map structure for Moon phases (check List Comprehension on Python)
mapPhases_col = create_map([lit(x) for i in trfPhases.items() for x in i])

dfmoon_phases = dfmoon_phases.withColumn("percentage", mapPhases_col[col('moonphase')])

dfmoon_phases.show()


+-------+---+-------------+------------+--------+----------+----------+
|  month|day|    moonphase|specialevent|monthnum|      date|percentage|
+-------+---+-------------+------------+--------+----------+----------+
|january|  1|         NULL|        NULL|       1|2020-01-01|      NULL|
|january|  2|first quarter|        NULL|       1|2020-01-02|       0.5|
|january|  3|         NULL|        NULL|       1|2020-01-03|      NULL|
|january|  4|         NULL|        NULL|       1|2020-01-04|      NULL|
|january|  5|         NULL|        NULL|       1|2020-01-05|      NULL|
|january|  6|         NULL|        NULL|       1|2020-01-06|      NULL|
|january|  7|         NULL|        NULL|       1|2020-01-07|      NULL|
|january|  8|         NULL|        NULL|       1|2020-01-08|      NULL|
|january|  9|         NULL|        NULL|       1|2020-01-09|      NULL|
|january| 10|    full moon|        NULL|       1|2020-01-10|       1.0|
|january| 11|         NULL|        NULL|       1|2020-01-11|    

In [28]:
# Remove unnecessary data
dfmeteor_showers = dfmeteor_showers.drop("startmonth", "startday", "endmonth", "endday", "hemisphere")
dfmoon_phases = dfmoon_phases.drop("month","day","moonphase","specialevent")
dfconstellations = dfconstellations.drop("besttime")

dfmeteor_showers.show(5)
dfmoon_phases.show(5)
dfconstellations.show(5)

+------------+--------+---------+-------------------+------------+-------------+-----------+----------+----------+-------------+
|        name| radiant|bestmonth|preferredhemisphere|bestmonthnum|startmonthnum|endmonthnum| startdate|   enddate|hemispherenum|
+------------+--------+---------+-------------------+------------+-------------+-----------+----------+----------+-------------+
|      Lyrids|    Lyra|    april|           northern|           4|            4|          4|2020-04-21|2020-04-22|            0|
|Eta Aquarids|Aquarius|      may|           southern|           5|            4|          5|2020-04-19|2020-05-28|            3|
|    Orionids|   Orion|  october| northern, southern|          10|           10|         11|2020-10-02|2020-11-07|            3|
|    Perseids| Perseus|   august|           northern|           8|            7|          8|2020-07-14|2020-08-24|            0|
|     Leonids|     Leo| november| northern, southern|          11|           11|         11|2020-

In [29]:
# Figuring out a more accurate percentage for moon_phases:

# 1.Create a variable to save the last phase that you saw.
# 2.Loop through each row and column in the moon_phases DataFrame.
# 3.If the value in the percentage column of a row is NaN (null), then replace it with the last phase that you saw.
# 4.If the value isn't NaN, then save the value as the last phase that you saw.


#dfmoon_phases.show(5)

# Generate ID integer key column from date
dfmoon_phases = dfmoon_phases.withColumn("id",2020*10000+month("date")*100+dayofmonth("date")) 

dfmoon_phases.createOrReplaceTempView("stage_moon_phases")
dfmoon_phases = spark.sql( \
    " With univ as ( \
    select \
        id as id_Aux, \
        LAG(id, 1,0) OVER (ORDER BY id) AS id_From, \
        (id) as id_TO, \
        LAG(percentage, 1,0) OVER (ORDER BY id) AS PreviousPct \
    from stage_moon_phases \
    where ifnull(percentage,-1) >= 0 \
    and  1=1)\
    Select \
        id_Aux, \
        CASE \
            WHEN id_From = 0 then (to_date(id_TO,'yyyyMMdd') - INTERVAL 1 day)  \
            ELSE to_date(id_From,'yyyyMMdd') \
        END as id_From, \
        (to_date(id_TO,'yyyyMMdd') - INTERVAL 1 day) as id_TO, \
        PreviousPct as percentage \
    from univ" \
     )

dfmoon_phases.where(col("id_Aux") >= "20200116" ) \
    .show()

dfmoon_phases = dfmoon_phases.withColumn('day_seq', expr("sequence(id_From, id_TO, interval 1 day)")) \
    .withColumn("date",expr("explode(day_seq)")) \
    .withColumn("DateID",date_format(col("date"),"yyyyMMdd").cast(IntegerType())) \
    .withColumn("monthnum",month(col("date"))) \
    .orderBy("DateID") \
    .drop("day_seq","id_Aux","id_From","id_TO")

dfmoon_phases.printSchema()
dfmoon_phases.show()


+--------+----------+----------+----------+
|  id_Aux|   id_From|     id_TO|percentage|
+--------+----------+----------+----------+
|20200117|2020-01-10|2020-01-16|       1.0|
|20200124|2020-01-17|2020-01-23|       0.5|
|20200201|2020-01-24|2020-01-31|       0.0|
|20200208|2020-02-01|2020-02-07|       0.5|
|20200215|2020-02-08|2020-02-14|       1.0|
|20200223|2020-02-15|2020-02-22|       0.5|
|20200302|2020-02-23|2020-03-01|       0.0|
|20200309|2020-03-02|2020-03-08|       0.5|
|20200316|2020-03-09|2020-03-15|       1.0|
|20200324|2020-03-16|2020-03-23|       0.5|
|20200401|2020-03-24|2020-03-31|       0.0|
|20200407|2020-04-01|2020-04-06|       0.5|
|20200414|2020-04-07|2020-04-13|       1.0|
|20200422|2020-04-14|2020-04-21|       0.5|
|20200430|2020-04-22|2020-04-29|       0.0|
|20200507|2020-04-30|2020-05-06|       0.5|
|20200514|2020-05-07|2020-05-13|       1.0|
|20200522|2020-05-14|2020-05-21|       0.5|
|20200529|2020-05-22|2020-05-28|       0.0|
|20200605|2020-05-29|2020-06-04|

 ## Load Tranformed data to Silver Layer

In [30]:
# Write to (managed) tables on dbMeteorShowers database

dfmoon_phases.write.mode("Overwrite")\
    .saveAsTable("dbsMeteorShowers.MoonPhases")


### Explore Silver layer information

In [31]:
%%sparksql

-- Explore Silver layer information

select * 
from dbsMeteorShowers.MoonPhases
where DateID >= 20200116
LIMIT 20
;


0,1,2,3
percentage,date,DateID,monthnum
1.0,2020-01-16,20200116,1
0.5,2020-01-17,20200117,1
0.5,2020-01-18,20200118,1
0.5,2020-01-19,20200119,1
0.5,2020-01-20,20200120,1
0.5,2020-01-21,20200121,1
0.5,2020-01-22,20200122,1
0.5,2020-01-23,20200123,1
0.0,2020-01-24,20200124,1


# Exercise - Write a predictor function - Part 1

### Review our four datasets

In [32]:
dfmeteor_showers.printSchema()
dfmeteor_showers.show(5)

root
 |-- name: string (nullable = true)
 |-- radiant: string (nullable = true)
 |-- bestmonth: string (nullable = true)
 |-- preferredhemisphere: string (nullable = true)
 |-- bestmonthnum: integer (nullable = true)
 |-- startmonthnum: integer (nullable = true)
 |-- endmonthnum: integer (nullable = true)
 |-- startdate: date (nullable = true)
 |-- enddate: date (nullable = true)
 |-- hemispherenum: integer (nullable = true)

+------------+--------+---------+-------------------+------------+-------------+-----------+----------+----------+-------------+
|        name| radiant|bestmonth|preferredhemisphere|bestmonthnum|startmonthnum|endmonthnum| startdate|   enddate|hemispherenum|
+------------+--------+---------+-------------------+------------+-------------+-----------+----------+----------+-------------+
|      Lyrids|    Lyra|    april|           northern|           4|            4|          4|2020-04-21|2020-04-22|            0|
|Eta Aquarids|Aquarius|      may|           southern| 

In [33]:
dfmoon_phases.printSchema()

root
 |-- percentage: double (nullable = true)
 |-- date: date (nullable = false)
 |-- DateID: integer (nullable = true)
 |-- monthnum: integer (nullable = false)



In [34]:
dfcities.printSchema()

root
 |-- city: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- country: string (nullable = true)



In [35]:
dfconstellations.printSchema()

root
 |-- constellation: string (nullable = true)
 |-- bestmonth: string (nullable = true)
 |-- latitudestart: integer (nullable = true)
 |-- latitudeend: integer (nullable = true)
 |-- hemisphere: string (nullable = true)
 |-- bestmonthnum: integer (nullable = true)
 |-- hemispherenum: integer (nullable = true)



## Determine the latitude

In [36]:
# Create a function called predict_best_meteor_shower_viewing that takes in a city as a parameter:

def predict_best_meteor_shower_viewing(city):
    # Get the latitude of the city from the cities DataFrame
    latitude = dfcities.select("latitude").where(col("city") == city).collect()[0][0]

    return latitude


In [37]:
# Call the function

print(predict_best_meteor_shower_viewing('Abu Dhabi'))

24.47


# Exercise - Write a predictor function - Part 2

In [38]:
# Use latitude to determine constellation

def predict_best_meteor_shower_viewing(city):
    # Get the latitude of the city from the cities DataFrame
    latitude = lit(dfcities.select("latitude").where(col("city") == city).collect()[0][0])
    
    # Get the list of constellations that are viewable from that latitude
    constellation_list = dfconstellations.select("constellation") \
        .where((col("latitudestart") >= round(latitude,0)) & (col("latitudeend") < round(latitude,0)))\
        .collect()
    
    return constellation_list


In [39]:
# Print the constellation list
print(predict_best_meteor_shower_viewing('Abu Dhabi'))

[Row(constellation='Lyra'), Row(constellation='Aquarius'), Row(constellation='Orion'), Row(constellation='Perseus'), Row(constellation='Draco')]


In [40]:
# Create an output string

def predict_best_meteor_shower_viewing(city):

    # Create an empty string to return the message back to the user
    meteor_shower_string = ""

    cities = dfcities.where(col("city") == lit(city)).collect()

    if len(cities) == 0:
        meteor_shower_string = "Unfortunately, " + city + " isn't available for a prediction at this time."
        return meteor_shower_string

    # Get the latitude of the city from the cities DataFrame
    latitude = lit(dfcities.select("latitude").where(col("city") == city).collect()[0][0])
    
    # Get the list of constellations that are viewable from that latitude
    constellation_list = dfconstellations.select("constellation") \
        .where((col("latitudestart") >= round(latitude,0)) & (col("latitudeend") < round(latitude,0)))\
        .collect()
    
    # If no constellations are viewable, let the user know
    if not constellation_list:
        meteor_shower_string = "Unfortunately, there are no meteor showers viewable from "+ city + "."

        return meteor_shower_string


In [41]:
print(predict_best_meteor_shower_viewing('San Diego'))

print(predict_best_meteor_shower_viewing('Abu Dhabi'))

Unfortunately, San Diego isn't available for a prediction at this time.
None


In [42]:
tstConstellation = "Aquarius"

dfTstMeteor_Shower = dfmeteor_showers.select("name","startdate","enddate").where(col("radiant") == lit(tstConstellation))

#dfTstMeteor_Shower.show()

tstmeteor_shower = lit(dfTstMeteor_Shower.select("name").collect()[0].name)
tstmeteor_shower_startdate = lit(dfTstMeteor_Shower.select("startdate").collect()[0].startdate)
tstmeteor_shower_enddate = lit(dfTstMeteor_Shower.select("enddate").collect()[0].enddate)

""" print(tstmeteor_shower[0].name)
print(tstmeteor_shower_startdate[0].startdate)
print(tstmeteor_shower_enddate[0].startdate) """


dftest = dfmoon_phases.groupBy("percentage").agg(_min("date")).withColumnRenamed("min(date)", "date")
dftest.show()

dftest2 = dfmoon_phases.select("date", "percentage")  \
            .where((col("date") >= lit(tstmeteor_shower_startdate)) & (col("date") < lit(tstmeteor_shower_enddate)))
dftest2.show()


dftest3 = dftest2.groupBy("percentage").agg(_min("date")).withColumnRenamed("min(date)", "date")  \
            .where((col("date") >= lit(tstmeteor_shower_startdate)) & (col("date") < lit(tstmeteor_shower_enddate)))
dftest3.show()


# Find the first date where the Moon is the least visible
tstbest_moon_date = lit(dftest3.select("date").collect()[0][0])

print(tstbest_moon_date)


+----------+----------+
|percentage|      date|
+----------+----------+
|       0.0|2020-01-01|
|       0.5|2020-01-02|
|       1.0|2020-01-10|
+----------+----------+

+----------+----------+
|      date|percentage|
+----------+----------+
|2020-04-19|       0.5|
|2020-04-20|       0.5|
|2020-04-21|       0.5|
|2020-04-22|       0.0|
|2020-04-23|       0.0|
|2020-04-24|       0.0|
|2020-04-25|       0.0|
|2020-04-26|       0.0|
|2020-04-27|       0.0|
|2020-04-28|       0.0|
|2020-04-29|       0.0|
|2020-04-30|       0.5|
|2020-05-01|       0.5|
|2020-05-02|       0.5|
|2020-05-03|       0.5|
|2020-05-04|       0.5|
|2020-05-05|       0.5|
|2020-05-06|       0.5|
|2020-05-07|       1.0|
|2020-05-08|       1.0|
+----------+----------+
only showing top 20 rows

+----------+----------+
|percentage|      date|
+----------+----------+
|       0.5|2020-04-19|
|       0.0|2020-04-22|
|       1.0|2020-05-07|
+----------+----------+

Column<'DATE '2020-04-19''>


In [43]:
# Final code

def predict_best_meteor_shower_viewing(city):

# Create an empty string to return the message back to the user

    meteor_shower_string = ""

    cities = dfcities.where(col("city") == lit(city)).collect()

    if len(cities) == 0:
        meteor_shower_string = "Unfortunately, " + city + " isn't available for a prediction at this time. \n"
        return meteor_shower_string

    # Get the latitude of the city from the cities DataFrame
    latitude = lit(dfcities.select("latitude").where(col("city") == city).collect()[0][0])

    # Get the list of constellations that are viewable from that latitude
    constellation_list = dfconstellations.select("constellation") \
        .where((col("latitudestart") >= round(latitude,0)) & (col("latitudeend") < round(latitude,0)))\
        .collect()

    # If no constellations are viewable, let the user know
    if not constellation_list:
        meteor_shower_string = "Unfortunately, there are no meteor showers viewable from "+ city + "."

        return meteor_shower_string

    meteor_shower_string = "In " + city + " you can see the following meteor showers:\n"

     # Iterate through each constellation that is viewable from the city
    for constellation in constellation_list:
        
        # Find the meteor shower that is nearest to that constellation and its related start and end dates
        dfAuxMeteor_Shower = dfmeteor_showers.select("name","startdate","enddate").where(col("radiant") == constellation['constellation'])

        meteor_shower = dfAuxMeteor_Shower.select("name").collect()[0].name
        meteor_shower_startdate = dfAuxMeteor_Shower.select("startdate").collect()[0].startdate
        meteor_shower_enddate = dfAuxMeteor_Shower.select("enddate").collect()[0].enddate

        # Find the Moon phases for each date within the viewable time frame of that meteor shower
        dfAuxMoon_phases_1 = dfmoon_phases.select("date", "percentage")  \
            .where((col("date") >= lit(meteor_shower_startdate)) & (col("date") <= lit(meteor_shower_enddate)))
        
        dfAuxMoon_phases_2 = dfAuxMoon_phases_1.select("percentage",expr("date_format(date,'dd MMMM yyyy')")) \
            .orderBy("percentage") \
            .withColumnRenamed("date_format(date, dd MMMM yyyy)","dateStr") \
            .first()

        # Find the first date where the Moon is the least visible (conerts the date output to formatted string)
        best_moon_date = dfAuxMoon_phases_2.dateStr
        
    
        # Add that date to the string to report back to the user
        meteor_shower_string += "\t" + meteor_shower + " is best seen if you look towards the " + \
                                constellation[0] + " constellation on " +  \
                                best_moon_date + "\n"

    return meteor_shower_string



In [44]:
print(predict_best_meteor_shower_viewing('Abu Dhabi'))
print(predict_best_meteor_shower_viewing('San Diego'))
print(predict_best_meteor_shower_viewing('Amsterdam'))

In Abu Dhabi you can see the following meteor showers:
	Lyrids is best seen if you look towards the Lyra constellation on 22 April 2020
	Eta Aquarids is best seen if you look towards the Aquarius constellation on 22 April 2020
	Orionids is best seen if you look towards the Orion constellation on 16 October 2020
	Perseids is best seen if you look towards the Perseus constellation on 20 July 2020
	Chang'e is best seen if you look towards the Draco constellation on 16 October 2020

Unfortunately, San Diego isn't available for a prediction at this time. 

In Amsterdam you can see the following meteor showers:
	Lyrids is best seen if you look towards the Lyra constellation on 22 April 2020
	Eta Aquarids is best seen if you look towards the Aquarius constellation on 22 April 2020
	Orionids is best seen if you look towards the Orion constellation on 16 October 2020
	Perseids is best seen if you look towards the Perseus constellation on 20 July 2020
	Chang'e is best seen if you look towards th

### Test Area

In [45]:
%%sparksql

select * 
from dbbMeteorShowers.MeteorShowers
where radiant = 'Lyra'
LIMIT 20
;


0,1,2,3,4,5,6,7,8
name,radiant,bestmonth,startmonth,startday,endmonth,endday,hemisphere,preferredhemisphere
Lyrids,Lyra,april,april,21,april,22,northern,northern


In [46]:
%%sparksql

select * 
from dbBMeteorShowers.MoonPhases
where month = 'april'
and day >=15
order by day
--LIMIT 20
;

0,1,2,3
month,day,moonphase,specialevent
april,15,,
april,16,,
april,17,,
april,18,,
april,19,,
april,20,,
april,21,,
april,22,new moon,
april,23,,


In [47]:
%%sparksql

select * 
from dbSMeteorShowers.MoonPhases
where dateID >= 20200415 and dateID < 20200430
order by DateID 
--LIMIT 20
;


0,1,2,3
percentage,date,DateID,monthnum
0.5,2020-04-15,20200415,4
0.5,2020-04-16,20200416,4
0.5,2020-04-17,20200417,4
0.5,2020-04-18,20200418,4
0.5,2020-04-19,20200419,4
0.5,2020-04-20,20200420,4
0.5,2020-04-21,20200421,4
0.0,2020-04-22,20200422,4
0.0,2020-04-23,20200423,4


# Exercise - Add data from the Chang'e story

In [48]:
# Final code

def predict_best_meteor_shower_viewing(city):

# Create an empty string to return the message back to the user

    meteor_shower_string = ""

    cities = dfcities.where(col("city") == lit(city)).collect()

    if len(cities) == 0:
        meteor_shower_string = "Unfortunately, " + city + " isn't available for a prediction at this time. \n"
        return meteor_shower_string

    # Get the latitude of the city from the cities DataFrame
    latitude = lit(dfcities.select("latitude").where(col("city") == city).collect()[0][0])

    # Get the list of constellations that are viewable from that latitude
    constellation_list = dfconstellations.select("constellation") \
        .where((col("latitudestart") >= round(latitude,0)) & (col("latitudeend") < round(latitude,0)))\
        .collect()

    # If no constellations are viewable, let the user know
    if not constellation_list:
        meteor_shower_string = "Unfortunately, there are no meteor showers viewable from "+ city + "."

        return meteor_shower_string

    meteor_shower_string = "In " + city + " you can see the following meteor showers:\n"

     # Iterate through each constellation that is viewable from the city
    for constellation in constellation_list:
        
        # Find the meteor shower that is nearest to that constellation and its related start and end dates
        dfAuxMeteor_Shower = dfmeteor_showers.select("name","startdate","enddate").where(col("radiant") == constellation['constellation'])

        meteor_shower = dfAuxMeteor_Shower.select("name").collect()[0].name
        meteor_shower_startdate = dfAuxMeteor_Shower.select("startdate").collect()[0].startdate
        meteor_shower_enddate = dfAuxMeteor_Shower.select("enddate").collect()[0].enddate

        # Find the Moon phases for each date within the viewable time frame of that meteor shower
        dfAuxMoon_phases_1 = dfmoon_phases.select("date", "percentage")  \
            .where((col("date") >= lit(meteor_shower_startdate)) & (col("date") <= lit(meteor_shower_enddate)))
        
        # Added/Changed block specific for this exercise
        if meteor_shower == 'Chang\'e':
            
            # For the film meteor shower, find the date where the Moon is the most visible
            dfAuxMoon_phases_2 = dfAuxMoon_phases_1.select("percentage",expr("date_format(date,'dd MMMM yyyy')")) \
            .orderBy(col("percentage").desc()) \
            .withColumnRenamed("date_format(date, dd MMMM yyyy)","dateStr") \
            .first()

            # Find the first date where the Moon is the most visible (converts the date output to formatted string)
            best_moon_date = dfAuxMoon_phases_2.dateStr

            # Add that date to the string to report back to the user
            meteor_shower_string += "\t" + "Though the Moon will be bright, " + meteor_shower + "'s meteor shower is best seen if you look towards the " \
                                    + constellation[0] + " constellation on " +  \
                                    best_moon_date + "\n"
        
        else:
            dfAuxMoon_phases_2 = dfAuxMoon_phases_1.select("percentage",expr("date_format(date,'dd MMMM yyyy')")) \
            .orderBy(col("percentage")) \
            .withColumnRenamed("date_format(date, dd MMMM yyyy)","dateStr") \
            .first()

            # Find the first date where the Moon is the least visible (conerts the date output to formatted string)
            best_moon_date = dfAuxMoon_phases_2.dateStr

            # Add that date to the string to report back to the user
            meteor_shower_string += "\t" + meteor_shower + " is best seen if you look towards the " + \
                                constellation[0] + " constellation on " +  \
                                best_moon_date + "\n"
        #################################
    
    return meteor_shower_string


In [49]:
 # Change the city to Beijing:

tstCities = ["Abu Dhabi","Beijing"]

for tstCity in tstCities:
    print(f"Predicitons for {tstCity}:\n")
    print(predict_best_meteor_shower_viewing(tstCity))

#print(predict_best_meteor_shower_viewing('Abu Dhabi'))
#print(predict_best_meteor_shower_viewing('Beijing'))



Predicitons for Abu Dhabi:

In Abu Dhabi you can see the following meteor showers:
	Lyrids is best seen if you look towards the Lyra constellation on 22 April 2020
	Eta Aquarids is best seen if you look towards the Aquarius constellation on 22 April 2020
	Orionids is best seen if you look towards the Orion constellation on 16 October 2020
	Perseids is best seen if you look towards the Perseus constellation on 20 July 2020
	Though the Moon will be bright, Chang'e's meteor shower is best seen if you look towards the Draco constellation on 01 October 2020

Predicitons for Beijing:

In Beijing you can see the following meteor showers:
	Lyrids is best seen if you look towards the Lyra constellation on 22 April 2020
	Eta Aquarids is best seen if you look towards the Aquarius constellation on 22 April 2020
	Orionids is best seen if you look towards the Orion constellation on 16 October 2020
	Perseids is best seen if you look towards the Perseus constellation on 20 July 2020
	Though the Moon w