# Clean the User Data 
1. Initialize and drop duplicates
2. Combine first and last names
3. Drop first_name and last_name
4. Convert date_joined to timestamp data type
5. Reorder the dataframe columns
6. Copy cleaned data to global temporary view

## Initialize notebook and drop duplicates

In [0]:
%run "./Initialize Cleaning Notebooks"

In [0]:
# Initialize using the shared helper. init_cleaner is not defined in this notebook
# (presumably it comes from the "Initialize Cleaning Notebooks" notebook run above — confirm).
# Per this notebook's outline it is expected to return the "user" data with duplicates
# already dropped — verify against the helper's definition.
df_user_clean = init_cleaner("user")

## Combine first and last names

In [0]:
from pyspark.sql.functions import concat, lit

# Build user_name as "<first_name> <last_name>", joined by a single space.
df_user_clean = df_user_clean.withColumn(
    "user_name",
    concat(
        df_user_clean["first_name"],
        lit(" "),
        df_user_clean["last_name"],
    ),
)

## Drop columns first_name and last_name

In [0]:
# The separate name parts are redundant once user_name has been built from them.
df_user_clean = df_user_clean.drop(*("first_name", "last_name"))

## Convert date_joined to timestamp data type

In [0]:
from pyspark.sql.functions import to_timestamp

# Replace date_joined in place with its timestamp-typed equivalent
# (no explicit format is given, so Spark's default parsing applies).
df_user_clean = df_user_clean.withColumn(
    "date_joined",
    to_timestamp(df_user_clean["date_joined"]),
)

## Reorder columns

In [0]:
# Keep only the final columns, in the order they should appear in the cleaned view.
df_user_clean = df_user_clean.select(["ind", "age", "user_name", "date_joined"])

# Preview a few rows to sanity-check the cleaned result.
df_user_clean.limit(5).display()

## Copy cleaned dataframe to global temporary view

In [0]:
# Publish the cleaned dataframe under the global temp view that matches the run mode.
# The "mode" widget is read once so the branch taken and the message printed always agree.
# (The widget is not created in this notebook — presumably it is set up by the
# initialization notebook or the caller; confirm.)
run_mode = dbutils.widgets.get("mode")

if run_mode == "Batch":
    df_user_clean.createOrReplaceGlobalTempView("gtv_129a67850695_user_clean")
elif run_mode == "Stream":
    df_user_clean.createOrReplaceGlobalTempView("gtv_129a67850695_stream_user_clean")
else:
    # An unrecognised mode used to fall through silently and still report success,
    # leaving no view (or a stale one from an earlier run) for downstream notebooks.
    raise ValueError(
        f"Unsupported mode {run_mode!r}: expected 'Batch' or 'Stream'; "
        "no global temp view was created."
    )

print(f"Global Temp View created for {run_mode} mode.")