<a href="https://colab.research.google.com/github/SupunGurusinghe/sqlite-plus-colab/blob/main/sg_project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
import pandas as pd

## DB Connection

In [2]:
import sqlite3
# Open a connection to the SQLite database file 'example.db'
# (sqlite3.connect creates the file if it does not already exist).
conn = sqlite3.connect('example.db')
# Single cursor shared by the cells below to execute SQL and fetch results.
c = conn.cursor()

# **Basic NULL values handling**

## Creating a table

In [64]:
# Remove any previous copy of the table so this cell can be re-run.
c.execute("DROP TABLE IF EXISTS employees")

# Demo schema. No column declares NOT NULL, so NULLs are accepted everywhere.
c.execute('''
  CREATE TABLE employees(
    id INT,
    f_name VARCHAR(50),
    l_name VARCHAR(50),
    title VARCHAR(10),
    age INT,
    wage INT,
    hire_date DATE
  )
''')

# Sample rows: `title` is None for ids 3 and 4, and the last row has no id.
employees = [
    (1, 'kavishka', 'tim', 'Mr', 22, 28, '2022-05-01'),
    (2, 'Bill', 'Tibb', 'Mr', 61, 28, '2012-05-02'),
    (3, 'Bill', 'Sadat', None, 18, 12, '2019-11-08'),
    (4, 'Christine', 'Riveles', None, 36, 20, '2018-03-30'),
    (5, 'David', 'Guerin', 'Honorable', 36, 20, '2018-03-30'),
    (None, 'David', 'Guerin', 'Honorable', 36, 20, '2018-03-30'),
]

# Bulk insert with positional placeholders: one `?` per column.
c.executemany("INSERT INTO employees VALUES (?,?,?,?,?,?,?)", employees)

# Read the whole table back and print one row per line.
results = c.execute('SELECT * FROM employees').fetchall()
for result in results:
    print(result)


(1, 'kavishka', 'tim', 'Mr', 22, 28, '2022-05-01')
(2, 'Bill', 'Tibb', 'Mr', 61, 28, '2012-05-02')
(3, 'Bill', 'Sadat', None, 18, 12, '2019-11-08')
(4, 'Christine', 'Riveles', None, 36, 20, '2018-03-30')
(5, 'David', 'Guerin', 'Honorable', 36, 20, '2018-03-30')
(None, 'David', 'Guerin', 'Honorable', 36, 20, '2018-03-30')


## Identify `NULL` values

In [65]:
# Per-column NULL tally: each CASE maps a NULL to 1 and anything else to 0,
# so the SUM over the table is the number of NULLs in that column.
null_count_query = '''
  SELECT 
    SUM(case when id IS NULL then 1 ELSE 0 END) AS id ,
    SUM(case when f_name IS NULL then 1 ELSE 0 END) AS f_name,
    SUM(case when l_name IS NULL then 1 ELSE 0 END) AS l_name,
    SUM(case when title IS NULL then 1 ELSE 0 END) AS title,
    SUM(case when age IS NULL then 1 ELSE 0 END) AS age,
    SUM(case when wage IS NULL then 1 ELSE 0 END) AS wage,
    SUM(case when hire_date IS NULL then 1 ELSE 0 END) AS hire_date
  FROM employees
'''
results = c.execute(null_count_query).fetchall()

# The aggregate returns a single row holding one count per column.
for result in results:
    print(result)

(1, 0, 0, 2, 0, 0, 0)


## Selecting rows having `NULL` value for a particular column

In [66]:
# Rows whose `title` is missing. NULL must be tested with IS NULL,
# because `= NULL` never evaluates to true in SQL.
null_title_query = '''
  SELECT *
  FROM employees
  WHERE title IS NULL
'''
results = c.execute(null_title_query).fetchall()

for result in results:
    print(result)

(3, 'Bill', 'Sadat', None, 18, 12, '2019-11-08')
(4, 'Christine', 'Riveles', None, 36, 20, '2018-03-30')


## Selecting rows having `NULL` value for any of the columns

In [67]:
# Rows with a NULL in at least one column. In SQLite the `||` concatenation
# yields NULL as soon as any operand is NULL, so testing the concatenation
# of all columns for NULL checks every column at once.
any_null_query = '''
  SELECT *
  FROM employees
  WHERE (id || f_name || l_name || title || age || wage || hire_date) IS NULL
'''
results = c.execute(any_null_query).fetchall()

for result in results:
    print(result)

(3, 'Bill', 'Sadat', None, 18, 12, '2019-11-08')
(4, 'Christine', 'Riveles', None, 36, 20, '2018-03-30')
(None, 'David', 'Guerin', 'Honorable', 36, 20, '2018-03-30')


## Deleting rows where column value is null

In [68]:
# Remove every row that has no id.
delete_null_id_sql = '''
  DELETE FROM employees WHERE id IS NULL
'''
c.execute(delete_null_id_sql)

<sqlite3.Cursor at 0x7fda7310a340>

## Dropping a column when all rows have null for that column

In [72]:
# Check 1 for "is the column entirely NULL?": list the groups of `id`.
# NULLs fall into a single group, so an all-NULL column would come back
# as exactly one row, (None,).
results = c.execute('''SELECT id FROM employees GROUP BY id''').fetchall()

for result in results:
    print(result)

(1,)
(2,)
(3,)
(4,)
(5,)


In [73]:
# Check 2 for "is the column entirely NULL?": list the distinct `id` values.
# An all-NULL column would produce only the single row (None,).
results = c.execute('''SELECT DISTINCT id FROM employees''').fetchall()

for result in results:
    print(result)

(1,)
(2,)
(3,)
(4,)
(5,)


In [74]:
# Check 3 for "is the column entirely NULL?": count the rows whose `id`
# is not NULL. A result of 0 means no row holds a value.
non_null_count_query = '''SELECT count(id) FROM employees WHERE id IS NOT NULL'''
results = c.execute(non_null_count_query).fetchall()

for result in results:
    print(result)

(5,)


In [None]:
# Drop `id` only if the column is entirely NULL.
#
# The original cell ran the ALTER TABLE unconditionally, so a full
# "Run All" would have destroyed a column that still holds real ids.
# COUNT(id) skips NULLs, so 0 means no row holds a value. An empty
# table also yields 0 and is treated as "all NULL".
#
# NOTE: ALTER TABLE ... DROP COLUMN needs SQLite 3.35.0 or newer; on older
# builds the statement raises sqlite3.OperationalError.
c.execute('''SELECT count(id) FROM employees''')
non_null_ids = c.fetchone()[0]

if non_null_ids == 0:
    c.execute('''alter table employees drop column id''')
    print("Dropped column 'id': every value was NULL")
else:
    print("Kept column 'id':", non_null_ids, "row(s) still hold a value")

## Replace `NULL` values with a sentinel (standard value)

In [75]:
# Show each employee with a substituted title: where `title` is NULL the
# CASE expression yields the sentinel 'Honorable', otherwise the stored
# title. This is a read-only projection; the table itself is not updated.
sentinel_title_query = '''
  SELECT 
    f_name,
    l_name,
    CASE WHEN title IS NULL THEN 'Honorable' ELSE title END AS NewTitle
  FROM employees
'''
results = c.execute(sentinel_title_query).fetchall()

for result in results:
    print(result)

('kavishka', 'tim', 'Mr')
('Bill', 'Tibb', 'Mr')
('Bill', 'Sadat', 'Honorable')
('Christine', 'Riveles', 'Honorable')
('David', 'Guerin', 'Honorable')


# **Advanced NULL values handling**

## Replace with a statistical measure such as the mean, median, or mode

In [54]:
# Remove any previous copy of the table so this cell can be re-run.
c.execute("DROP TABLE IF EXISTS house_price")

# Demo schema: a price plus three numeric feature columns (a, b, c).
c.execute('''
  CREATE TABLE house_price(
    id INT,
    country VARCHAR(50),
    city VARCHAR(50),
    price DOUBLE,
    a DOUBLE,
    b DOUBLE,
    c DOUBLE
  )
''')

# Sample rows with None scattered across `price`, `a`, `b` and `c`.
house_price = [
    (1, 'USA', 'LA', 1000000.00, 1, 3, 5),
    (2, 'UK', 'London', 400000.00, None, 5, 7),
    (3, 'USA', 'LA', 850000.00, 9, None, None),
    (4, 'USA', 'LA', None, 12, 4, 9),
    (5, 'USA', 'LA', 900000.00, 2, 6, 1),
    (6, 'UK', 'London', 550000.00, None, 4, 8),
    (7, 'USA', 'LA', 1000000.00, 8, 8, 8),
    (8, 'UK', 'London', 400000.00, 1, 4, 9),
    (9, 'USA', 'LA', 850000.00, 4, 4, 5),
    (10, 'USA', 'LA', 1050000.00, None, None, None),
    (11, 'USA', 'LA', 900000.00, 3, 8.5, 9),
    (12, 'UK', 'London', None, 10, 7, None),
]

# Bulk insert with positional placeholders: one `?` per column.
c.executemany("INSERT INTO house_price VALUES (?,?,?,?,?,?,?)", house_price)

# Read the whole table back and print one row per line.
results = c.execute('SELECT * FROM house_price').fetchall()
for result in results:
    print(result)


(1, 'USA', 'LA', 1000000.0, 1.0, 3.0, 5.0)
(2, 'UK', 'London', 400000.0, None, 5.0, 7.0)
(3, 'USA', 'LA', 850000.0, 9.0, None, None)
(4, 'USA', 'LA', None, 12.0, 4.0, 9.0)
(5, 'USA', 'LA', 900000.0, 2.0, 6.0, 1.0)
(6, 'UK', 'London', 550000.0, None, 4.0, 8.0)
(7, 'USA', 'LA', 1000000.0, 8.0, 8.0, 8.0)
(8, 'UK', 'London', 400000.0, 1.0, 4.0, 9.0)
(9, 'USA', 'LA', 850000.0, 4.0, 4.0, 5.0)
(10, 'USA', 'LA', 1050000.0, None, None, None)
(11, 'USA', 'LA', 900000.0, 3.0, 8.5, 9.0)
(12, 'UK', 'London', None, 10.0, 7.0, None)


In [55]:
# Fill missing prices with the mean price of the same city.
# The subquery `n` computes AVG(price) per city (AVG skips NULLs), each
# house row is matched to its city's average, and COALESCE uses that
# average only when the row's own price is NULL.
mean_impute_query = '''
  SELECT 
    h.country, 
    h.city,
    COALESCE(h.price, n.newprice) AS price_new
  FROM house_price h, (SELECT s.city, AVG(s.price) AS newprice
        FROM house_price s
        GROUP BY s.city) n
  WHERE h.city = n.city
'''
results = c.execute(mean_impute_query).fetchall()

for result in results:
    print(result)

('USA', 'LA', 1000000.0)
('UK', 'London', 400000.0)
('USA', 'LA', 850000.0)
('USA', 'LA', 935714.2857142857)
('USA', 'LA', 900000.0)
('UK', 'London', 550000.0)
('USA', 'LA', 1000000.0)
('UK', 'London', 400000.0)
('USA', 'LA', 850000.0)
('USA', 'LA', 1050000.0)
('USA', 'LA', 900000.0)
('UK', 'London', 450000.0)


`Note: Instead of AVG(), use another aggregate function such as MAX() or MIN() as necessary`