# Importing data

In [2]:
#!kaggle datasets download -d olistbr/brazilian-ecommerce --unzip

In [3]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text

In [4]:
# Read CSV files
df_olist_customers = pd.read_csv('olist_customers_dataset.csv')
df_olist_sellers = pd.read_csv('olist_sellers_dataset.csv')
df_olist_order_reviews = pd.read_csv('olist_order_reviews_dataset.csv')
df_olist_order_items = pd.read_csv('olist_order_items_dataset.csv')
df_olist_products = pd.read_csv('olist_products_dataset.csv')
df_olist_geolocation = pd.read_csv('olist_geolocation_dataset.csv')
df_product_category_name_translation = pd.read_csv('product_category_name_translation.csv')
df_olist_orders = pd.read_csv('olist_orders_dataset.csv')
df_olist_order_payments = pd.read_csv('olist_order_payments_dataset.csv')

# Convert column names to lowercase to avoid case sensitivity issues
for df in [
    df_olist_customers, df_olist_sellers, df_olist_order_reviews, 
    df_olist_order_items, df_olist_products, df_olist_geolocation, 
    df_product_category_name_translation, df_olist_orders, df_olist_order_payments
]:
    df.columns = df.columns.str.lower()

# Create db connection
db_user = 'postgres'
db_password = '1234'
db_host = 'localhost'  # or your database server's IP address
db_port = '5432'  # Default PostgreSQL port
db_name = 'my_postgres_db'  # Replace with your database's name

# Ensure the database exists
engine = create_engine(
    f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/postgres',
    echo=False
)
with engine.connect() as conn:
    conn.execute(text("commit"))
    result = conn.execute(text(f"SELECT 1 FROM pg_database WHERE datname = '{db_name}'"))
    if not result.scalar():
        conn.execute(text(f"CREATE DATABASE {db_name}"))

# PostgreSQL connection string
engine = create_engine(
    f'postgresql+psycopg2://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}',
    echo=False
)

# Export DataFrames to PostgreSQL
df_olist_customers.to_sql(
    "olist_customers", con=engine, if_exists='replace', index=False
)
df_olist_sellers.to_sql(
    "olist_sellers", con=engine, if_exists='replace', index=False
)
df_olist_order_reviews.to_sql(
    "olist_order_reviews", con=engine, if_exists='replace', index=False
)
df_olist_order_items.to_sql(
    "olist_order_items", con=engine, if_exists='replace', index=False
)
df_olist_products.to_sql(
    "olist_products_dataset", con=engine, if_exists='replace', index=False
)

df_olist_geolocation.to_sql(
    "olist_geolocation", con=engine, if_exists='replace', index=False
    )

df_product_category_name_translation.to_sql(
    "product_category_name_translation", con=engine, if_exists='replace', index=False
    )

df_olist_orders.to_sql(
    "olist_orders", con=engine, if_exists='replace', index=False
    )

df_olist_order_payments.to_sql(
    "olist_order_payments", con=engine, if_exists='replace', index=False
    )

# Verify successful data export
print("Data exported successfully to PostgreSQL.")

Data exported successfully to PostgreSQL.


## Setting up foreign keys

In [5]:
# Function to check if a constraint already exists in the database
def check_constraint_exists(engine, constraint_name):
    # SQL query to check for the existence of a constraint with the specified name
    check_constraint_sql = text("""
        SELECT conname 
        FROM pg_constraint
        WHERE conname = :constraint_name;
    """)
    with engine.connect() as con:
        # Execute the query and check if any results are returned
        result = con.execute(check_constraint_sql, {'constraint_name': constraint_name}).fetchone()
    return result is not None

# Function to create a relationship (foreign key) between two tables
def create_relationship(from_, to_, column_name, relationship_type, engine):
    # Validate the relationship type
    if relationship_type not in ['one_to_one', 'many_to_one']:
        raise ValueError("Invalid relationship type. Use 'one_to_one' or 'many_to_one'.")

    # Define constraint names for uniqueness and foreign key relationships
    unique_constraint_to = f"uq_{to_}_{column_name}"
    unique_constraint_from = f"uq_{from_}_{column_name}" if relationship_type == 'one_to_one' else None

    # Add a unique constraint to the referenced table if it doesn't already exist
    if not check_constraint_exists(engine, unique_constraint_to):
        try:
            with engine.begin() as con:
                con.execute(text(f"""
                    ALTER TABLE {to_}
                    ADD CONSTRAINT {unique_constraint_to} UNIQUE ({column_name});
                """))
                print(f"Unique constraint added to {to_} on {column_name}")
        except Exception as e:
            print(f"Error adding unique constraint to {to_}: {e}")
    else:
        print(f"Unique constraint '{unique_constraint_to}' already exists on {to_}")

    # If the relationship is one-to-one, add a unique constraint to the referencing table
    if relationship_type == 'one_to_one' and unique_constraint_from:
        if not check_constraint_exists(engine, unique_constraint_from):
            try:
                with engine.begin() as con:
                    con.execute(text(f"""
                        ALTER TABLE {from_}
                        ADD CONSTRAINT {unique_constraint_from} UNIQUE ({column_name});
                    """))
                    print(f"Unique constraint added to {from_} on {column_name}")
            except Exception as e:
                print(f"Error adding unique constraint to {from_}: {e}")
        else:
            print(f"Unique constraint '{unique_constraint_from}' already exists on {from_}")

    # Define the foreign key constraint name
    foreign_key_name = f"fk_{from_}_{to_}_{column_name}"

    # Add the foreign key constraint if it doesn't already exist
    if not check_constraint_exists(engine, foreign_key_name):
        try:
            with engine.begin() as con:
                con.execute(text(f"""
                    ALTER TABLE {from_}
                    ADD CONSTRAINT {foreign_key_name} FOREIGN KEY ({column_name})
                    REFERENCES {to_} ({column_name}) ON DELETE CASCADE;
                """))
                print(f"Foreign key '{foreign_key_name}' added to {from_} referencing {to_}")
        except Exception as e:
            print(f"Error adding foreign key to {from_}: {e}")
    else:
        print(f"Foreign key '{foreign_key_name}' already exists on {from_}")

In [6]:
create_relationship(from_='olist_order_reviews', to_='olist_orders', column_name='order_id', relationship_type='many_to_one', engine=engine)
create_relationship('olist_order_payments', 'olist_orders', 'order_id', 'many_to_one', engine)
create_relationship('olist_order_items', 'olist_orders', 'order_id', 'many_to_one', engine)

create_relationship('olist_order_items', 'olist_products_dataset', 'product_id', 'many_to_one', engine)
create_relationship('olist_order_items', 'olist_sellers', 'seller_id', 'many_to_one', engine)

create_relationship('olist_orders', 'olist_customers', 'customer_id', 'one_to_one', engine)

Unique constraint added to olist_orders on order_id
Foreign key 'fk_olist_order_reviews_olist_orders_order_id' added to olist_order_reviews referencing olist_orders
Unique constraint 'uq_olist_orders_order_id' already exists on olist_orders
Foreign key 'fk_olist_order_payments_olist_orders_order_id' added to olist_order_payments referencing olist_orders
Unique constraint 'uq_olist_orders_order_id' already exists on olist_orders
Foreign key 'fk_olist_order_items_olist_orders_order_id' added to olist_order_items referencing olist_orders
Unique constraint added to olist_products_dataset on product_id
Foreign key 'fk_olist_order_items_olist_products_dataset_product_id' added to olist_order_items referencing olist_products_dataset
Unique constraint added to olist_sellers on seller_id
Foreign key 'fk_olist_order_items_olist_sellers_seller_id' added to olist_order_items referencing olist_sellers
Unique constraint added to olist_customers on customer_id
Unique constraint added to olist_orders 

# Change datatypes

In [7]:
def change_columns_to_datetime(engine, table_columns_dict):
    with engine.begin() as con:
        # Loop through the dictionary and alter the data type for each column in each table
        for table, columns in table_columns_dict.items():
            for column in columns:
                alter_query = text(f'ALTER TABLE {table} ALTER COLUMN {column} TYPE TIMESTAMP USING {column}::TIMESTAMP;')
                con.execute(alter_query)
                print(f"Changed {column} in {table} to TIMESTAMP.")


table_columns_dict = {
    "olist_order_items": ["shipping_limit_date"],
    "olist_order_reviews": ["review_creation_date", "review_answer_timestamp"],
    "olist_orders": ["order_purchase_timestamp", "order_approved_at", "order_delivered_carrier_date", "order_delivered_customer_date", "order_estimated_delivery_date"],
}

change_columns_to_datetime(engine, table_columns_dict)


Changed shipping_limit_date in olist_order_items to TIMESTAMP.
Changed review_creation_date in olist_order_reviews to TIMESTAMP.
Changed review_answer_timestamp in olist_order_reviews to TIMESTAMP.
Changed order_purchase_timestamp in olist_orders to TIMESTAMP.
Changed order_approved_at in olist_orders to TIMESTAMP.
Changed order_delivered_carrier_date in olist_orders to TIMESTAMP.
Changed order_delivered_customer_date in olist_orders to TIMESTAMP.
Changed order_estimated_delivery_date in olist_orders to TIMESTAMP.


# Tasks

## Query 1: Count and Percentage of Orders Purchased in Jan 2018 with 5 Review Score

In [8]:
# Write and execute a SQL query to count the number of orders purchased in January 2018 that have a review score of 5 and calculate the percentage of such orders.
df = pd.read_sql('''
    SELECT * 
    FROM olist_orders
    LEFT JOIN olist_order_reviews 
    ON olist_orders.order_id = olist_order_reviews.order_id
    WHERE EXTRACT(YEAR FROM order_purchase_timestamp) = 2018
    AND EXTRACT(MONTH FROM order_purchase_timestamp) = 1
    AND review_score = 5
''', con=engine)



# Drop the duplicate 'order_id' column
df = df.loc[:, ~df.columns.duplicated()]

df.to_sql(
    "query1", con=engine, if_exists='replace', index=False
)

df

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,review_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,cf73e2cb1f4a9480ed70c154da3d954a,ef1ed1347404166e73e8f3977ffcbca0,delivered,2018-01-11 11:23:09,2018-01-12 02:38:33,2018-01-13 01:07:00,2018-01-17 14:36:38,2018-02-01,540e7bbb2d06cfb7f85f3a88ba7ac97f,5,,,2018-01-18,2018-01-18 19:12:30
1,1a36784e5c696071dad8a242f26b230f,600a3fb4f5eab2e7de4eee159be73c81,delivered,2018-01-16 15:53:41,2018-01-17 03:36:48,2018-01-17 17:09:06,2018-01-24 17:42:56,2018-02-06,e363a3c172db72d6a8b82d0726cd2ddb,5,,,2018-01-25,2018-02-18 23:03:20
2,83177602aad708cea549501386b021c6,329e57e436b3d2e6629801cf5724368f,delivered,2018-01-22 21:17:58,2018-01-24 11:12:35,2018-01-26 17:52:42,2018-01-29 12:58:46,2018-02-06,7df70380ca80dea85f0a7f1f57fa2f87,5,,,2018-01-30,2018-01-30 18:19:27
3,77fc7da5ef4df2c48276ae832072dcf1,a260ca264a1787a1842ced2735bff664,delivered,2018-01-07 22:52:49,2018-01-07 23:07:11,2018-01-08 21:10:13,2018-01-09 15:57:34,2018-01-24,766fc608e4f2683020fb0a121d9db23e,5,,A bolsa térmica além de linda é super espaçosa...,2018-01-10,2018-01-11 07:47:10
4,75ad3bc2c20ac416220546b2268066dc,810eaa5351d02199e8f4b53de15008df,delivered,2018-01-23 11:03:58,2018-01-23 11:15:21,2018-01-26 17:09:22,2018-02-27 22:41:44,2018-02-21,2613eaca6e203ad90206bd06e3dc5684,5,,,2018-02-23,2018-03-26 18:13:48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4092,914bd7644db3ffde7e1babea42768e0b,3eb7136aaa3e0bab17a7b7e7280286a5,delivered,2018-01-27 11:08:01,2018-01-27 11:16:21,2018-01-29 18:29:34,2018-02-09 16:38:27,2018-02-23,a3ea2fcbc2ea3b9377770bd87d5ba6db,5,,,2018-02-10,2018-02-13 14:24:52
4093,7209ce21354cff1e1957b42ea0a76cd1,d2000bb65d12d7c93b7e27e6c961757d,delivered,2018-01-11 22:59:49,2018-01-11 23:10:26,2018-01-13 15:33:47,2018-01-18 21:42:54,2018-01-26,2e29a1dc332760b4324f9963d118347f,5,,,2018-01-19,2018-01-20 09:20:28
4094,ef29bdd3a72229e6ea76fb22871eb176,ff83d65f64369c7efaa074eb0a13c433,delivered,2018-01-15 22:45:22,2018-01-15 22:55:25,2018-01-18 15:49:11,2018-01-24 19:49:15,2018-02-09,48c9128a683d613c23c6c2b93b16c574,5,,,2018-01-25,2018-01-28 08:16:43
4095,2f92b812fd7c7df5b629833c484cdf1d,18f6ca10777417c932cf49e7fc9387ff,delivered,2018-01-18 14:40:57,2018-01-18 14:56:18,2018-01-19 18:48:40,2018-02-01 19:20:39,2018-02-15,29f15fe767c6de620f026cf5eb9da330,5,,Bom atendimento diante do que foi solicitado,2018-02-02,2018-02-05 13:40:42


## Query 2: Customer Purchase Trend Year-on-Year

In [9]:
# Write and execute a SQL query to analyze the customer purchase trend year-on-year.

df = pd.read_sql('''
    SELECT EXTRACT(YEAR FROM order_purchase_timestamp) AS year, COUNT(order_id) AS total_orders
    FROM olist_orders
    GROUP BY year
    ORDER BY year;
''', con=engine)

df.to_sql(
    "query2", con=engine, if_exists='replace', index=False
)

df

Unnamed: 0,year,total_orders
0,2016.0,329
1,2017.0,45101
2,2018.0,54011


## Query 3: Average Order Values of Customers

In [10]:
# Write and execute a SQL query to calculate the average order values of customers.

df = pd.read_sql('''
    SELECT customer_id, AVG(payment_value) AS avg_order_value
    FROM olist_orders
    LEFT JOIN olist_order_payments ON olist_orders.order_id = olist_order_payments.order_id
    GROUP BY customer_id
    ORDER BY avg_order_value DESC;
''', con=engine)

df.to_sql(
    "query3", con=engine, if_exists='replace', index=False
) 

df

Unnamed: 0,customer_id,avg_order_value
0,86dc2ffce2dfff336de2f386a786e574,
1,1617b1357756262bfa56ab541c47bc16,13664.080000
2,ec5b2ba62e574342386871631fafd3fc,7274.880000
3,c6e2731c5b391845f6800c97401a43a9,6929.310000
4,f48d464a0baaea338cb25f816991ab1f,6922.210000
...,...,...
99436,b246eeed30b362c09d867b9e598bee51,1.856818
99437,fd123d346a17cdf5e37a2a85501069bf,1.737500
99438,a73c1f73f5772cf801434bf984b0b1a7,0.000000
99439,3532ba38a3fd242259a514ac2b6ae6b6,0.000000


## Query 4: Top 5 Cities with Highest Revenue from 2016 to 2018

In [11]:
# Write and execute a SQL query to find the top 5 cities with the highest revenue from 2016 to 2018

df = pd.read_sql('''
    SELECT customer_city, SUM(payment_value) AS total_revenue
    FROM olist_orders
    LEFT JOIN olist_order_payments ON olist_orders.order_id = olist_order_payments.order_id
    LEFT JOIN olist_customers ON olist_orders.customer_id = olist_customers.customer_id
    GROUP BY customer_city
    ORDER BY total_revenue DESC
    LIMIT 5;
''', con=engine)

df.to_sql(
    "query4", con=engine, if_exists='replace', index=False
)

df

Unnamed: 0,customer_city,total_revenue
0,sao paulo,2203373.09
1,rio de janeiro,1161927.36
2,belo horizonte,421765.12
3,brasilia,354216.78
4,curitiba,247392.48


## Query 5: State Wise Revenue Table Between 2016 to 2018

In [12]:
# Write and execute a SQL query to create a state-wise revenue table between 2016 to 2018

df = pd.read_sql('''
    SELECT customer_state, SUM(payment_value) AS total_revenue
    FROM olist_orders
    LEFT JOIN olist_order_payments ON olist_orders.order_id = olist_order_payments.order_id
    LEFT JOIN olist_customers ON olist_orders.customer_id = olist_customers.customer_id
    GROUP BY customer_state
    ORDER BY total_revenue DESC;
''', con=engine)

df.to_sql(
    "query5", con=engine, if_exists='replace', index=False
)

df

Unnamed: 0,customer_state,total_revenue
0,SP,5998226.96
1,RJ,2144379.69
2,MG,1872257.26
3,RS,890898.54
4,PR,811156.38
5,SC,623086.43
6,BA,616645.82
7,DF,355141.08
8,GO,350092.31
9,ES,325967.55


## Query 6: Top Successful Sellers in Terms of Goods Sold, Revenue, and Customer Count

In [13]:
# Write and execute a SQL query to identify the top successful sellers in terms of the number of goods sold, total revenue, customer count, and sellers with the highest 5-star ratings.

df = pd.read_sql('''
    SELECT olist_order_items.seller_id, COUNT(olist_order_items.product_id) AS total_products_sold, SUM(olist_order_items.price) AS total_revenue, COUNT(DISTINCT olist_orders.customer_id) AS total_customers, AVG(olist_order_reviews.review_score) AS avg_review_score
    FROM olist_order_items
    LEFT JOIN olist_sellers ON olist_order_items.seller_id = olist_sellers.seller_id
    LEFT JOIN olist_order_reviews ON olist_order_items.order_id = olist_order_reviews.order_id
    LEFT JOIN olist_orders ON olist_order_items.order_id = olist_orders.order_id
    GROUP BY olist_order_items.seller_id
    ORDER BY total_products_sold DESC, total_revenue DESC, total_customers DESC, avg_review_score DESC
    LIMIT 5;
''', con=engine)

df.to_sql(
    "query6", con=engine, if_exists='replace', index=False
)

df

Unnamed: 0,seller_id,total_products_sold,total_revenue,total_customers,avg_review_score
0,6560211a19b47992c3666cc44a7e94c0,2039,123585.82,1854,3.909406
1,4a3ca9315b744ce9f8e9374361493884,2009,202999.12,1806,3.803931
2,1f50f920176fa81dab994f9023523100,1940,107431.41,1404,3.982402
3,cc419e0650a3c5ba77189a1882b7556a,1819,106555.98,1706,4.069575
4,da8622b14eb17ae2831f4ac5b9dab84a,1574,162723.37,1314,4.071429


## Query 7: Delivery Success Rate Across States

In [14]:
# Write and execute a SQL query to calculate the delivery success rate across different states.

df = pd.read_sql('''
    SELECT olist_customers.customer_state, 
    COUNT(order_delivered_customer_date) AS total_delivered_orders, 
    COUNT(olist_orders.order_id) AS total_orders,
    (COUNT(order_delivered_customer_date)::NUMERIC / COUNT(olist_orders.order_id) * 100) AS delivery_success_rate
    FROM olist_orders
    LEFT JOIN olist_customers ON olist_orders.customer_id = olist_customers.customer_id
    GROUP BY olist_customers.customer_state
    ORDER BY delivery_success_rate DESC;
''', con=engine)

df.to_sql(
    "query7", con=engine, if_exists='replace', index=False
)

df

Unnamed: 0,customer_state,total_delivered_orders,total_orders,delivery_success_rate
0,AC,80,81,98.765432
1,AP,67,68,98.529412
2,ES,1995,2033,98.130841
3,MS,701,715,98.041958
4,AM,145,148,97.972973
5,TO,274,280,97.857143
6,RS,5344,5466,97.76802
7,RN,474,485,97.731959
8,MT,886,907,97.684675
9,MG,11355,11635,97.593468


## Query 8: Preferred Form of Payment for Different Categories

In [15]:
# Write and execute a SQL query to find the preferred form of payment for different product categories

df = pd.read_sql('''
SELECT DISTINCT ON (product_category_name) 
    product_category_name, 
    payment_type, 
    total_payments
FROM (
    SELECT 
        product_category_name, 
        payment_type, 
        COUNT(payment_type) AS total_payments
    FROM olist_order_items
    LEFT JOIN olist_products_dataset ON olist_order_items.product_id = olist_products_dataset.product_id
    LEFT JOIN olist_order_payments ON olist_order_items.order_id = olist_order_payments.order_id
    GROUP BY product_category_name, payment_type
    ORDER BY product_category_name, total_payments DESC
) AS subquery
ORDER BY product_category_name, total_payments DESC;
''', con=engine)

df.to_sql(
    "query8", con=engine, if_exists='replace', index=False
)

df

Unnamed: 0,product_category_name,payment_type,total_payments
0,agro_industria_e_comercio,credit_card,145
1,alimentos,credit_card,381
2,alimentos_bebidas,credit_card,194
3,artes,credit_card,153
4,artes_e_artesanato,credit_card,14
...,...,...,...
69,tablets_impressao_imagem,credit_card,65
70,telefonia,credit_card,3400
71,telefonia_fixa,credit_card,186
72,utilidades_domesticas,credit_card,5411


## Query 9: Distance Between Cities

# Write and execute a SQL query to calculate the distance between cities.

When launched from the IDE, it crashes after a minute on my computer, but it is executable in pgadmin

```python
df = pd.read_sql('''
WITH CityCoordinates AS (
    SELECT 
        geolocation_city,
        AVG(geolocation_lat) AS latitude,
        AVG(geolocation_lng) AS longitude
    FROM olist_geolocation
    GROUP BY geolocation_city
)


SELECT 
    a.geolocation_city AS city_a,
    b.geolocation_city AS city_b,
    6371 * acos(
        cos(radians(a.latitude)) * cos(radians(b.latitude)) *
        cos(radians(b.longitude) - radians(a.longitude)) +
        sin(radians(a.latitude)) * sin(radians(b.latitude))
    ) AS distance_km
FROM CityCoordinates a
CROSS JOIN CityCoordinates b
WHERE a.geolocation_city < b.geolocation_city;
''', con=engine)

```


# Export database

In [16]:
# Export the entire database with data and relationships to a SQL file
import subprocess

print("Exporting database to SQL file...")
try:
    subprocess.run(
        [
            "pg_dump", "--dbname=postgresql://{}:{}@{}:{}/{}".format(
                db_user, db_password, db_host, db_port, db_name
            ), "-f", "database_export.sql"
        ],
        check=True
    )
    print("Database export completed.")
except subprocess.CalledProcessError as e:
    print(f"Error exporting database: {e}")

Exporting database to SQL file...
Database export completed.
