# Session 3 — Profiling & Cleaning Data

Assess data quality with SQL queries and seaborn plots, then fix inconsistencies with SQL `UPDATE` statements.


## Environment Setup

In [None]:
# --- Imports: standard library first, then third-party, one module per line ---
import sqlite3
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Environment report and plotting defaults ---
print(sys.version)
sns.set_theme()

# --- Database connection shared by every cell below ---
# NOTE: sqlite3.connect() creates an empty database file when the path does not
# exist, so a wrong working directory shows up later as "no such table" errors
# rather than as a connection failure here.
DB_PATH = Path('course.db')
conn = sqlite3.connect(DB_PATH)
# SQLite does not enforce foreign-key constraints unless asked to, and the
# setting is per connection, so it is switched on right after connecting.
conn.execute('PRAGMA foreign_keys=ON;')
print('SQLite ready at', DB_PATH.resolve())

In [None]:
def run_sql(q, params=None):
    """Run a SQL query on the notebook connection, show the result, and return it.

    Parameters
    ----------
    q : str
        SQL text handed to ``pd.read_sql_query``.
    params : optional
        Bind parameters for the query. Any falsy value (``None``, empty
        dict, empty list) is replaced with an empty dict.

    Returns
    -------
    pandas.DataFrame
        The query result; it is also rendered with ``display`` as a side effect.
    """
    # Normalise "no parameters" to an empty mapping before handing it to pandas.
    if not params:
        params = {}
    result = pd.read_sql_query(q, conn, params=params)
    display(result)
    return result

## 1. Profiling Basics

In [None]:
# Each run_sql call displays its result, so this cell renders three tables.
# 1) Number of rows in the customers table.
run_sql("SELECT COUNT(*) AS n_customers FROM customers;")
# 2) Minimum, maximum and mean of products.price.
run_sql("SELECT MIN(price) AS min_price, MAX(price) AS max_price, AVG(price) AS avg_price FROM products;")
# 3) Customers whose address is NULL; the result is kept in `missing`.
#    Only SQL NULLs match here; empty-string addresses would not.
missing = run_sql("SELECT * FROM customers WHERE address IS NULL;")

## 2. Visualize Distributions

In [None]:
# Load the whole products table; only the `price` column is plotted.
dfp = pd.read_sql_query("SELECT * FROM products;", conn)

# histplot returns the Axes it drew on, so labels are set on that object
# directly instead of through the pyplot "current axes" state.
ax = sns.histplot(dfp['price'], bins=10, kde=True)
ax.set(title="Price Distribution", xlabel="Price", ylabel="Count")
plt.show()

## 3. Cleaning Examples

In [None]:
# Replace NULL addresses with the placeholder 'Unknown'.
# Using the connection as a context manager commits when the block succeeds
# and rolls back if the UPDATE raises, so a failure cannot leave a
# half-finished transaction open on the shared connection.
with conn:
    update_cursor = conn.execute("UPDATE customers SET address='Unknown' WHERE address IS NULL;")

# Report how many rows changed; on a re-run this is 0 because no NULLs remain.
print('Rows updated:', update_cursor.rowcount)

# Verify the fix before displaying the table.
n_null_addresses = conn.execute(
    "SELECT COUNT(*) FROM customers WHERE address IS NULL;"
).fetchone()[0]
assert n_null_addresses == 0, f"{n_null_addresses} customers still have a NULL address"

run_sql("SELECT * FROM customers;")