After cleaning a dataset/DataFrame (do the cleaning on the normalized table), save the result under `data/cleaned_data/` (`../data/cleaned_data/` relative to this notebook).

Example:

### Cleaning city names in the geolocation table

In [1]:
import pandas as pd
import os

In [2]:
# Lookup table: Olist CSV filename -> names of the columns to parse as datetimes.
# Files that do not appear as a key have NO date columns:
#   olist_customers_dataset.csv, olist_geolocation_dataset.csv,
#   olist_order_payments_dataset.csv, olist_products_dataset.csv,
#   olist_sellers_dataset.csv, product_category_name_translation.csv
date_cols = {
    'olist_orders_dataset.csv': [
        'order_purchase_timestamp', 'order_approved_at',
        'order_delivered_carrier_date', 'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
    'olist_order_items_dataset.csv': ['shipping_limit_date'],
    'olist_order_reviews_dataset.csv': ['review_creation_date', 'review_answer_timestamp'],
}

def read_olist_csv(path, **read_csv_kwargs):
    """
    Reads an Olist CSV and parses dates for the correct columns.

    The date columns are looked up in the module-level ``date_cols`` mapping
    using only the file's basename, so the same file is handled identically
    wherever it is stored. Files without an entry are read with no date parsing.

    Args:
        path (str or os.PathLike): Path to the CSV file.
        **read_csv_kwargs: Optional extra keyword arguments forwarded unchanged
            to ``pd.read_csv`` (for example
            ``dtype={'geolocation_zip_code_prefix': str}`` to keep zip prefixes
            as text instead of numbers). If ``parse_dates`` is supplied here it
            takes precedence over the ``date_cols`` lookup.
    Returns:
        pd.DataFrame: Loaded dataframe with date columns parsed as datetime.
    """
    # Extract just the filename, e.g., 'olist_orders_dataset.csv'
    filename = os.path.basename(path)
    # Default to the configured date columns for this file (empty list if none).
    # A copy is passed so the shared lists in date_cols are never handed out directly.
    read_csv_kwargs.setdefault('parse_dates', list(date_cols.get(filename, [])))
    # Read the CSV with the resolved date columns plus any caller-supplied options
    return pd.read_csv(path, **read_csv_kwargs)

In [3]:
# Load the raw geolocation table; date_cols has no entry for this file, so no column is date-parsed
df_geo = read_olist_csv('../data/olist_geolocation_dataset.csv')
# Preview the first rows to check that the columns loaded as expected
df_geo.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [4]:
# Count rows per distinct city string; different spellings of the same city
# (e.g. 'sao paulo' vs 'são paulo') are counted as separate values
df_geo['geolocation_city'].value_counts()

geolocation_city
sao paulo               135800
rio de janeiro           62151
belo horizonte           27805
são paulo                24918
curitiba                 16593
                         ...  
jacuípe                      1
mar vermelho                 1
quebrangulo                  1
poço das trincheiras         1
poxim                        1
Name: count, Length: 8011, dtype: int64

In [9]:
# Look at the first 50 distinct city strings (in order of first appearance)
# to spot spelling variants: accents, hyphens, missing spaces, state suffixes
df_geo['geolocation_city'].unique()[:50]

array(['sao paulo', 'são paulo', 'sao bernardo do campo', 'jundiaí',
       'taboão da serra', 'sãopaulo', 'sp', 'sa£o paulo',
       'sao jose dos campos', 'osasco', 'carapicuíba', 'carapicuiba',
       'barueri', 'santana de parnaiba', 'pirapora do bom jesus',
       'santana de parnaíba', 'jandira', 'itapevi', 'cotia',
       'taboao da serra', 'vargem grande paulista', 'embu das artes',
       'itapecerica da serra', 'embu', 'são lourenço da serra',
       'sao lourenco da serra', 'embu-guacu', 'embu-guaçu', 'embu guaçu',
       'juquitiba', 'embu guacu', 'embuguacu', 'guarulhos', 'adamantina',
       'guarulhos-sp', 'aruja', 'arujá', 'santa isabel', 'mairipora',
       'mairiporã', 'cajamar', 'caieiras', 'jordanesia', 'polvilho',
       'mauá', 'jordanésia', 'franco da rocha', 'francisco morato', 'poa',
       'itaquaquecetuba'], dtype=object)

Before modifying a dataframe, copy it into a new variable and make your changes on the copy. This prevents accidental changes to the original dataframe, which would affect any other code that uses it.

In [12]:
# Work on a copy so the raw df_geo stays unchanged for the other cells
cleaned_df_geo = df_geo.copy()
# Example cleaning: remove leading/trailing spaces and convert to lowercase
cleaned_df_geo['geolocation_city'] = cleaned_df_geo['geolocation_city'].str.strip().str.lower()
# Demonstration only: strip/lower does not merge spelling variants such as
# 'sao paulo' vs 'são paulo'; treat this frame as "cleaned" for the example
display(cleaned_df_geo.head())
# Create the output folder if it is missing (to_csv does not create directories)
os.makedirs('../data/cleaned_data', exist_ok=True)
# Save the cleaned dataframe into a new CSV file (index=False: do not write the row index)
cleaned_df_geo.to_csv('../data/cleaned_data/example_cleaned_olist_geolocation_dataset.csv', index=False)

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP
