In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os # For listing files
import warnings

warnings.filterwarnings('ignore') # Suppress warnings, especially from pandas

# Configure pandas to display more rows/columns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

In [None]:
# Folder holding the raw Olist CSV exports. Set the OLIST_DATA_DIR environment
# variable to point the notebook at another location; when it is not set, the
# original local download folder is used so existing setups keep working.
data_path = os.environ.get("OLIST_DATA_DIR", r"C:\Users\HP\Downloads\archive")

# List the CSV files in the dataset. The names are sorted because os.listdir
# returns directory entries in arbitrary order.
print("CSV files found in the raw data folder:")
for file_name in sorted(os.listdir(data_path)):
    if file_name.endswith('.csv'):
        print(f"- {file_name}")


def _read_raw_csv(file_name):
    """Read one CSV file from ``data_path`` and return it as a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(data_path, file_name))


# Load each CSV file into its own pandas DataFrame.
customers_df = _read_raw_csv('olist_customers_dataset.csv')
orders_df = _read_raw_csv('olist_orders_dataset.csv')
order_items_df = _read_raw_csv('olist_order_items_dataset.csv')
order_payments_df = _read_raw_csv('olist_order_payments_dataset.csv')
products_df = _read_raw_csv('olist_products_dataset.csv')
order_reviews_df = _read_raw_csv('olist_order_reviews_dataset.csv')
product_category_translation_df = _read_raw_csv('product_category_name_translation.csv')

In [None]:
def _print_overview(label, frame):
    """Print a quick structural summary of one DataFrame.

    The summary consists of a section header, the first rows, the
    ``info()`` report, ``describe(include='all')``, the per-column count of
    missing values, and the per-column count of distinct values.

    Parameters
    ----------
    label : str
        Name shown in the section header.
    frame : pandas.DataFrame
        Table to summarise.
    """
    print(f"\n--- {label} ---")
    print(frame.head())
    # info() prints its report itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None" line).
    frame.info()
    print(frame.describe(include='all'))
    print(frame.isnull().sum())
    print(frame.nunique())


# Orders, plus the distribution of order statuses.
_print_overview("orders_df", orders_df)
print(orders_df['order_status'].value_counts())

# Customers, plus the number of rows whose customer_unique_id already appeared
# in an earlier row.
_print_overview("customers_df", customers_df)
print(f"Number of duplicate customer_unique_id: {customers_df['customer_unique_id'].duplicated().sum()}")

# Remaining tables get the same standard overview.
_print_overview("order_items_df", order_items_df)
_print_overview("order_payments_df", order_payments_df)
_print_overview("products_df", products_df)
_print_overview("order_reviews_df", order_reviews_df)

### Initial Observations

Based on this first look at the data:

1. Data types: Many date/time columns (like `order_purchase_timestamp`) are currently stored as `object` (string) dtype. They will need to be converted to `datetime` for time-based analysis.
2. Missing values: Several columns across different tables contain missing values (e.g., `order_delivered_customer_date` in `orders_df`, `product_photos_qty` in `products_df`, `review_comment_message` in `order_reviews_df`). We'll need to decide how to handle each case (e.g., drop the affected rows or fill with a default value).
3. Product categories: `products_df` stores product categories in Portuguese (`product_category_name`). `product_category_translation_df` maps these to English (`product_category_name_english`), which will be essential for clearer analysis.
4. Order statuses: `orders_df` contains several distinct `order_status` values. For customer lifetime value (CLTV) analysis, we'll likely focus primarily on 'delivered' orders, but other statuses might be useful for understanding cancellations or delivery issues.
5. Unique customer ID: `customers_df` has a `customer_unique_id` column, which seems to be the key for identifying a single customer across multiple orders. We need to ensure it is used correctly in our analysis.
6. Multiple tables: The dataset is spread across many tables. The next step is to carefully merge them on their shared ID columns (`order_id`, `customer_id`, `product_id`, etc.) to create a single, comprehensive dataset.
