# Clean the Geo Data 
1. Initialize and drop duplicates
2. Create new coordinates column based on latitude and longitude
3. Drop latitude and longitude
4. Convert timestamp to timestamp data type
5. Reorder the dataframe
6. Copy Cleaned Data to Global Temporary View

## Initialize notebook and drop duplicates

In [0]:
%run "./Initialize Cleaning Notebooks"

In [0]:
# Initialize using the shared helper function.
# NOTE(review): init_cleaner is not defined in this notebook — presumably it is
# provided by the "./Initialize Cleaning Notebooks" notebook pulled in via %run,
# and (going by the section heading) also drops duplicates; confirm there.
df_geo_clean = init_cleaner("geo")

## Create array column coordinates using latitude and longitude

In [0]:
from pyspark.sql.functions import array

# Combine the two source columns into one array column.
# Element order is [latitude, longitude].
df_geo_clean = df_geo_clean.withColumn(
    "coordinates",
    array(["latitude", "longitude"]),
)

## Drop columns latitude and longitude

In [0]:
# The individual latitude/longitude columns are no longer needed once their
# values have been packed into the "coordinates" array column.
df_geo_clean = df_geo_clean.drop("latitude", "longitude")

## Convert timestamp to timestamp data type

In [0]:
from pyspark.sql.functions import to_timestamp

# Overwrite the "timestamp" column in place with its timestamp-typed version
# (no explicit format is given, so Spark's default parsing applies).
df_geo_clean = df_geo_clean.withColumn(
    "timestamp",
    to_timestamp(df_geo_clean["timestamp"]),
)

## Reorder columns

In [0]:
# Project the columns in their final order.
df_geo_clean = df_geo_clean.select(
    ["ind", "country", "coordinates", "timestamp"]
)

# Show a small sample to sanity-check the cleaned dataframe.
df_geo_clean.limit(5).display()

## Copy cleaned dataframe to global temporary view

In [0]:
# Publish the cleaned dataframe as a global temp view; the view name depends on
# the "mode" notebook widget ("Batch" or "Stream").
geo_view_names = {
    "Batch": "gtv_129a67850695_geo_clean",
    "Stream": "gtv_129a67850695_stream_geo_clean",
}

# Read the widget once so the branch taken and the message printed always agree.
geo_mode = dbutils.widgets.get("mode")

if geo_mode not in geo_view_names:
    # An unrecognised mode used to create no view at all while still printing a
    # success message; fail loudly instead so downstream steps do not read a
    # missing or stale view.
    raise ValueError(
        f"Unsupported mode {geo_mode!r}: expected one of {sorted(geo_view_names)}."
    )

df_geo_clean.createOrReplaceGlobalTempView(geo_view_names[geo_mode])
print("Global Temp View created for " + geo_mode + " mode.")