In [None]:
%run ../../_pre_run.ipynb

# Data Loading

- Load the data.
- Assign data types to columns where possible. Columns with missing values cannot be immediately converted to integer type. Leave such columns as is for now.
- Display the first few rows of each dataframe.
- Examine column types.
- For categorical variables, perform normalization and convert to Title Case format for consistency and better visual presentation (the state columns are converted to upper case instead).
- For text variables, normalize and convert to lowercase to eliminate implicit duplicates.

In [None]:
# Base URL of the raw GitHub directory hosting the dataset; each table's file name is appended to it below.
base_url = "https://raw.githubusercontent.com/PavelGrigoryevDS/olist-deep-dive/main/data/"


**Table df_orders**

In [None]:
# Load the orders table: `order_status` is read as a category, the five timestamp
# columns are parsed with an explicit format and renamed to a common `*_dt` suffix.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
dtype = {'order_status': 'category'}
df_orders = fo(
    pd.read_csv(
        f'{base_url}olist_orders_dataset.csv.gz',
        dtype=dtype,
        parse_dates=[
            'order_purchase_timestamp',
            'order_approved_at',
            'order_delivered_carrier_date',
            'order_delivered_customer_date',
            'order_estimated_delivery_date',
        ],
        date_format='%Y-%m-%d %H:%M:%S',
    ).rename(columns={
        'order_purchase_timestamp': 'order_purchase_dt',
        'order_approved_at': 'order_approved_dt',
        'order_delivered_carrier_date': 'order_delivered_carrier_dt',
        'order_delivered_customer_date': 'order_delivered_customer_dt',
        'order_estimated_delivery_date': 'order_estimated_delivery_dt',
    })
)
# Fixed seed keeps the preview rows stable between runs.
df_orders.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the orders table.
df_orders.dtypes

In [None]:
# Normalize the order status labels in place. No `case_format` is passed, so the
# helper's default casing applies.
# NOTE(review): the `preproc` accessor is not defined in this notebook — presumably registered by _pre_run.ipynb.
df_orders['order_status'].preproc.normalize_string_series(inplace=True)

**Table df_payments**

In [None]:
# Load the payments table with `payment_type` read as a category.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
dtype = {'payment_type': 'category'}
df_payments = fo(
    pd.read_csv(
        f'{base_url}olist_order_payments_dataset.csv.gz',
        dtype=dtype,
    )
)
# Fixed seed keeps the preview rows stable between runs.
df_payments.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the payments table.
df_payments.dtypes

In [None]:
# Normalize the payment type labels in place. No `case_format` is passed, so the
# helper's default casing applies.
# NOTE(review): the `preproc` accessor is not defined in this notebook — presumably registered by _pre_run.ipynb.
df_payments['payment_type'].preproc.normalize_string_series(inplace=True)

**Table df_items**

In [None]:
# Load the order items table; the shipping limit timestamp is parsed with an
# explicit format and renamed to `shipping_limit_dt`.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
df_items = fo(
    pd.read_csv(
        f'{base_url}olist_order_items_dataset.csv.gz',
        parse_dates=['shipping_limit_date'],
        date_format='%Y-%m-%d %H:%M:%S',
    ).rename(columns={'shipping_limit_date': 'shipping_limit_dt'})
)
# Fixed seed keeps the preview rows stable between runs.
df_items.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the order items table.
df_items.dtypes

**Table df_customers**

In [None]:
# Load the customers table with the city and state columns read as categories.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
dtype = {
    'customer_city': 'category',
    'customer_state': 'category',
}
df_customers = fo(
    pd.read_csv(
        f'{base_url}olist_customers_dataset.csv.gz',
        dtype=dtype,
    )
)
# Fixed seed keeps the preview rows stable between runs.
df_customers.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the customers table.
df_customers.dtypes

In [None]:
# Normalize the customer location labels in place: the city uses the helper's
# default casing, the state is explicitly upper-cased.
# NOTE(review): the `preproc` accessor is not defined in this notebook — presumably registered by _pre_run.ipynb.
df_customers['customer_city'].preproc.normalize_string_series(inplace=True)
df_customers['customer_state'].preproc.normalize_string_series(
    case_format='upper',
    inplace=True,
)

**Table df_reviews**

In [None]:
# Load the translated reviews table. The two date columns use different formats
# (date only vs. full timestamp), so a per-column format mapping is supplied;
# both are then renamed to the `*_dt` suffix.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
df_reviews = fo(
    pd.read_csv(
        f'{base_url}olist_order_reviews_dataset_translated.csv.gz',
        parse_dates=['review_creation_date', 'review_answer_timestamp'],
        date_format={
            'review_creation_date': '%Y-%m-%d',
            'review_answer_timestamp': '%Y-%m-%d %H:%M:%S',
        },
    ).rename(columns={
        'review_creation_date': 'review_creation_dt',
        'review_answer_timestamp': 'review_answer_dt',
    })
)
# Fixed seed keeps the preview rows stable between runs.
df_reviews.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the reviews table.
df_reviews.dtypes

In [None]:
# Normalize the free-text review fields in place, explicitly lower-casing both.
# NOTE(review): the `preproc` accessor is not defined in this notebook — presumably registered by _pre_run.ipynb.
df_reviews['review_comment_title'].preproc.normalize_string_series(
    case_format='lower',
    inplace=True,
)
df_reviews['review_comment_message'].preproc.normalize_string_series(
    case_format='lower',
    inplace=True,
)

**Table df_products**

In [None]:
# Load the products table with `product_category_name` read as a category.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
dtype = {'product_category_name': 'category'}
df_products = fo(
    pd.read_csv(
        f'{base_url}olist_products_dataset.csv.gz',
        dtype=dtype,
    )
)
# Fixed seed keeps the preview rows stable between runs.
df_products.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the products table.
df_products.dtypes

We will not normalize the `product_category_name` column because it will later be replaced with its English version.

**Table df_categories**

In [None]:
# Load the category translation table with both name columns read as categories.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
dtype = {
    'product_category_name': 'category',
    'product_category_name_english': 'category',
}
df_categories = fo(
    pd.read_csv(
        f'{base_url}product_category_name_translation.csv.gz',
        dtype=dtype,
    )
)
# Fixed seed keeps the preview rows stable between runs.
df_categories.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the category translation table.
df_categories.dtypes

We will normalize only the English version of the column (`product_category_name_english`), since it is the only one we will work with.

In [None]:
# Normalize the English category names in place. No `case_format` is passed, so
# the helper's default casing applies.
# NOTE(review): the `preproc` accessor is not defined in this notebook — presumably registered by _pre_run.ipynb.
df_categories['product_category_name_english'].preproc.normalize_string_series(inplace=True)

**Table df_sellers**

In [None]:
# Load the sellers table with the city and state columns read as categories.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
dtype = {
    'seller_city': 'category',
    'seller_state': 'category',
}
df_sellers = fo(
    pd.read_csv(
        f'{base_url}olist_sellers_dataset.csv.gz',
        dtype=dtype,
    )
)
# Fixed seed keeps the preview rows stable between runs.
df_sellers.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the sellers table.
df_sellers.dtypes

In [None]:
# Normalize the seller location labels in place: the city uses the helper's
# default casing, the state is explicitly upper-cased.
# NOTE(review): the `preproc` accessor is not defined in this notebook — presumably registered by _pre_run.ipynb.
df_sellers['seller_city'].preproc.normalize_string_series(inplace=True)
df_sellers['seller_state'].preproc.normalize_string_series(
    case_format='upper',
    inplace=True,
)

**Table df_geolocations**

In [None]:
# Load the geolocation table with the city and state columns read as categories.
# NOTE(review): `fo` is not defined in this notebook — presumably supplied by _pre_run.ipynb.
dtype = {
    'geolocation_city': 'category',
    'geolocation_state': 'category',
}
df_geolocations = fo(
    pd.read_csv(
        f'{base_url}olist_geolocation_dataset.csv.gz',
        dtype=dtype,
    )
)
# Fixed seed keeps the preview rows stable between runs.
df_geolocations.sample(n=5, random_state=7)

In [None]:
# Show the resulting column dtypes of the geolocation table.
df_geolocations.dtypes

In [None]:
# Normalize the geolocation labels in place: the city uses the helper's default
# casing, the state is explicitly upper-cased.
# NOTE(review): the `preproc` accessor is not defined in this notebook — presumably registered by _pre_run.ipynb.
df_geolocations['geolocation_city'].preproc.normalize_string_series(inplace=True)
df_geolocations['geolocation_state'].preproc.normalize_string_series(
    case_format='upper',
    inplace=True,
)

We will collect all dataframes in a single class to simplify further work.

In [None]:
class Dfs:
    """Container exposing the nine loaded dataframes under short attribute names.

    Iterating over an instance yields ``(name, dataframe)`` tuples in a fixed
    order: orders, items, reviews, products, geolocations, sellers, payments,
    customers, categories.
    """

    def __init__(self):
        # Bind the notebook-level dataframes; objects are referenced, not copied.
        self.orders = df_orders
        self.items = df_items
        self.reviews = df_reviews
        self.products = df_products
        self.geolocations = df_geolocations
        self.sellers = df_sellers
        self.payments = df_payments
        self.customers = df_customers
        self.categories = df_categories

    def __iter__(self):
        """Return an iterator over ``(name, dataframe)`` pairs."""
        table_names = (
            'orders',
            'items',
            'reviews',
            'products',
            'geolocations',
            'sellers',
            'payments',
            'customers',
            'categories',
        )
        # Build the list eagerly so the pairs reflect the attributes at call time.
        pairs = [(table, getattr(self, table)) for table in table_names]
        return iter(pairs)
# Create the container instance holding references to all loaded dataframes.
dfs = Dfs()

In [None]:
%run ../../_post_run.ipynb