# Olist E-Commerce Data Cleaning

- Load data from ADLS\Storage\Container\Bronze

- Load data from MongoDB

- Perform Data Cleaning 

- Write cleaned data into ADLS\Storage\Container\Silver

In [0]:
# Display the SparkSession as the cell output to confirm it is available.
# NOTE(review): `spark` is not created anywhere in this notebook, so it is
# presumably injected by the notebook runtime — confirm for other environments.
spark

## Connecting to Azure Data Lake Storage

In [0]:
import os

# Service-principal identifiers for the storage account. These identify the
# app registration and tenant; the credential itself is the client secret below.
storage_account = "storageaccountolistecom"
application_id = "10a55e8d-b4d6-4dd2-b74f-ee734d146ac0"
directory_id = "0a1ff938-feb1-4e8d-95b9-e2f699407226"

# The client secret must never live in the notebook source: read it from the
# environment (e.g. a cluster environment variable backed by a secret store).
# Any secret value that was ever committed in this cell should be rotated.
client_secret = os.environ.get("OLIST_ADLS_CLIENT_SECRET")
if not client_secret:
    raise RuntimeError(
        "Environment variable OLIST_ADLS_CLIENT_SECRET is not set; "
        "it must hold the service principal's client secret."
    )

# Every ABFS OAuth setting is scoped to this account's DFS endpoint.
account_host = f"{storage_account}.dfs.core.windows.net"

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{account_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", application_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", f"https://login.microsoftonline.com/{directory_id}/oauth2/token")

## Loading datasets from Azure Storage Account

In [0]:
# Maps the notebook-level DataFrame variable name (key) to the stem of the
# CSV file it is loaded from (value). Insertion order is the processing order
# used by every loop that iterates over this mapping.
datasets = dict(
    customer_df='olist_customers_dataset',
    geolocation_df='olist_geolocation_dataset',
    order_item_df='olist_order_items_dataset',
    order_payment_df='olist_order_payments_dataset',
    order_review_df='olist_order_reviews_dataset',
    order_df='olist_orders_dataset',
    product_df='olist_products_dataset',
    seller_df='olist_sellers_dataset',
)

In [0]:
# Read every Bronze CSV listed in `datasets` and bind the resulting Spark
# DataFrame to a notebook-level variable named after the mapping key
# (customer_df, order_df, ...). Column types are inferred by Spark.
bronze_root = f"abfss://container-olist-ecommerce-data@{storage_account}.dfs.core.windows.net/Bronze"

for frame_name, file_stem in datasets.items():
    globals()[frame_name] = (
        spark.read
        .options(header=True, inferSchema=True)
        .csv(f"{bronze_root}/{file_stem}.csv")
    )

## Loading dataset from MongoDB

In [0]:
from pymongo import MongoClient
import pandas as pd

In [0]:
import os
from urllib.parse import quote_plus

# Connection coordinates for the MongoDB instance.
hostname = "zzw9d7.h.filess.io"
database = "OlistEcommerceNoSQL_thanhuntso"
port = "61004"
username = "OlistEcommerceNoSQL_thanhuntso"

# The password must not be stored in the notebook source: read it from the
# environment. Any password that was ever committed here should be rotated.
password = os.environ.get("OLIST_MONGO_PASSWORD")
if not password:
    raise RuntimeError(
        "Environment variable OLIST_MONGO_PASSWORD is not set; "
        "it must hold the MongoDB user's password."
    )

# Percent-escape the credentials so reserved characters (':', '/', '@', ...)
# cannot corrupt the connection URI.
uri = f"mongodb://{quote_plus(username)}:{quote_plus(password)}@{hostname}:{port}/{database}"

# Connect using the host and port encoded in the URI
client = MongoClient(uri)

# Access the database; the bare expression displays the handle as cell output
mydatabase = client[database]
mydatabase

Database(MongoClient(host=['zzw9d7.h.filess.io:61004'], document_class=dict, tz_aware=False, connect=True), 'OlistEcommerceNoSQL_thanhuntso')

In [0]:
# Collection holding the product-category name translations.
collection = mydatabase["product_category_name_translation"]

# Exclude Mongo's `_id` in the query projection instead of fetching it and
# dropping it afterwards; this also avoids a KeyError from `drop('_id')`
# when the result has no `_id` column (e.g. an empty collection).
translation_docs = list(collection.find({}, {"_id": 0}))

# Keep the pandas frame under its own name so `product_name_English` is only
# ever bound to a Spark DataFrame.
translation_pdf = pd.DataFrame(translation_docs)

product_name_English = spark.createDataFrame(translation_pdf)

# Show the resulting column names as the cell output.
product_name_English.columns

['product_category_name', 'product_category_name_english']

## Data Cleaning

- Handle Missing Values

- Remove Duplicate Values

- Fix Data Types

- Standardize Categorical Values

- Normalize Date & Time

### Handling Missing Values

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
def missing_values(df, df_name):
    """Print a one-row table with the number of null entries in each column.

    Args:
        df: Spark DataFrame to inspect.
        df_name: Label printed in the heading above the table.

    Only values for which ``isNull()`` is true are counted. The result is
    shown with ``.show()``; nothing is returned.
    """
    print(f'Missing Values in: {df_name}')
    # One aggregate per column: `when` yields 1 for nulls and null otherwise,
    # and `count` ignores the nulls, so the count equals the number of nulls.
    null_counts = [
        count(when(col(column_name).isNull(), 1)).alias(column_name)
        for column_name in df.columns
    ]
    df.select(null_counts).show()

In [0]:
# Report per-column null counts for every DataFrame registered in `datasets`.
for frame_name in datasets:
    missing_values(globals()[frame_name], frame_name)

Missing Values in: customer_df
+-----------+------------------+------------------------+-------------+--------------+
|customer_id|customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+-----------+------------------+------------------------+-------------+--------------+
|          0|                 0|                       0|            0|             0|
+-----------+------------------+------------------------+-------------+--------------+

Missing Values in: geolocation_df
+---------------------------+---------------+---------------+----------------+-----------------+
|geolocation_zip_code_prefix|geolocation_lat|geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+---------------+---------------+----------------+-----------------+
|                          0|              0|              0|               0|                0|
+---------------------------+---------------+---------------+----------------+-----------------+

Missing Value

In [0]:
# Drop rows in which every column is null, rebinding each notebook-level
# DataFrame variable to its filtered version.
for frame_name in datasets:
    current_df = globals()[frame_name]
    globals()[frame_name] = current_df.na.drop(how='all')

### Removing duplicate values

In [0]:
# Remove fully duplicated rows from each DataFrame registered in `datasets`,
# rebinding the notebook-level variable to the de-duplicated result.
for frame_name in datasets:
    current_df = globals()[frame_name]
    globals()[frame_name] = current_df.dropDuplicates()

### Fixing Data Types

In [0]:
# Print the schema of each DataFrame so the column types can be reviewed.
for frame_name in datasets:
    print(f'Schema: {frame_name}')
    current_df = globals()[frame_name]
    current_df.printSchema()

Schema: customer_df
root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)

Schema: geolocation_df
root
 |-- geolocation_zip_code_prefix: integer (nullable = true)
 |-- geolocation_lat: double (nullable = true)
 |-- geolocation_lng: double (nullable = true)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)

Schema: order_item_df
root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)

Schema: order_payment_df
root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)

### Normalizing Date & Time

In [0]:
# Order Item DF
order_item_df = order_item_df.withColumn("shipping_limit_time", date_format("shipping_limit_date", "HH:mm:ss"))
order_item_df = order_item_df.withColumn('shipping_limit_date', to_date(col('shipping_limit_date')))

# Order Review DF
order_review_df = order_review_df.withColumn('review_answer_time', date_format('review_answer_timestamp', 'HH:mm:ss'))
order_review_df = order_review_df.withColumn('review_answer_date', to_date(col('review_answer_timestamp')))
order_review_df = order_review_df.drop('review_answer_timestamp')

# Order DF
order_df = order_df.withColumn('order_purchase_time', date_format('order_purchase_timestamp', 'HH:mm:ss'))
order_df = order_df.withColumn('order_purchase_date', to_date(col('order_purchase_timestamp')))
order_df = order_df.drop('order_purchase_timestamp')

order_df = order_df.withColumn('order_approved_at_time', date_format('order_approved_at', 'HH:mm:ss'))
order_df = order_df.withColumn('order_approved_at_date', to_date(col('order_approved_at')))
order_df = order_df.drop('order_approved_at')

order_df = order_df.withColumn('order_delivered_carrier_time', date_format('order_delivered_carrier_date', 'HH:mm:ss'))
order_df = order_df.withColumn('order_delivered_carrier_date', to_date(col('order_delivered_carrier_date')))

order_df = order_df.withColumn('order_delivered_customer_time', date_format('order_delivered_customer_date', 'HH:mm:ss'))
order_df = order_df.withColumn('order_delivered_customer_date', to_date(col('order_delivered_customer_date')))

order_df = order_df.withColumn('order_estimated_delivery_time', date_format('order_estimated_delivery_date', 'HH:mm:ss'))
order_df = order_df.withColumn('order_estimated_delivery_date', to_date(col('order_estimated_delivery_date')))

## Writing Cleaned Data Into Azure Storage Account

In [0]:
# Write each cleaned DataFrame to its own folder under Silver as CSV with a
# header row, replacing any previous output. Writes happen in list order.
silver_root = f"abfss://container-olist-ecommerce-data@{storage_account}.dfs.core.windows.net/Silver"

silver_outputs = [
    ("customer", customer_df),
    ("order", order_df),
    ("order_item", order_item_df),
    ("order_payment", order_payment_df),
    ("order_review", order_review_df),
    ("product", product_df),
    ("seller", seller_df),
    ("geolocation", geolocation_df),
    ("product_name_English", product_name_English),
]

for folder_name, cleaned_df in silver_outputs:
    writer = cleaned_df.write.mode("overwrite").option("header", "true")
    writer.csv(f"{silver_root}/{folder_name}")
