<a href="https://colab.research.google.com/github/SupunGurusinghe/sqlite-plus-colab/blob/main/sg_project1_dimensionally_structured.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dimensionally Structured Columns**

### **SQLite table creation**

In [80]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('test_database')
c = conn.cursor()

# dropping an existing table
c.execute("DROP TABLE IF EXISTS personal")

c.execute('''CREATE TABLE personal (
  id INT, 
  name VARCHAR(30),
  birthdate DATE,
  age INT)
''')

data = [[1, 'John', '03/01/1999', 23], 
        [2, 'Peter', '02/22/2000', 20], 
        [3, 'Emmly', '12/05/1995', 23],
        [4, 'Json', '05/17/1998', 24],
        [5, 'Rupa', '01/01/1999', 21],
        [6, 'Mill', '06/19/1980', 42]]

personal = pd.DataFrame(data, columns=['id', 'name', 'birthdate', 'age'])


conn.commit()

personal.to_sql('personal', conn, if_exists='replace', index = False)

c.execute('''
  SELECT *
  FROM personal
''')

for row in c.fetchall():
  print(row)

(1, 'John', '03/01/1999', 23)
(2, 'Peter', '02/22/2000', 20)
(3, 'Emmly', '12/05/1995', 23)
(4, 'Json', '05/17/1998', 24)
(5, 'Rupa', '01/01/1999', 21)
(6, 'Mill', '06/19/1980', 42)


In [81]:
c.close()

#### **Convert table to dataframe**

In [82]:
def create_df(table_name):
  c = conn.cursor()
  # create dataframe from a table
  c.execute("SELECT name FROM pragma_table_info(?) ORDER BY cid", [table_name])
  names = c.fetchall()

  result_list = []
  for name in names:
    result_list.append(name[0])

  c.execute(f'SELECT * FROM {table_name}')
  results = c.fetchall()

  df = pd.DataFrame(results, columns= result_list)
  c.close()
  return df



### **Calculate time difference with respect to current date**

In [59]:
from datetime import datetime
import numpy as np
from dateutil.relativedelta import relativedelta

def datediff(table_name, date_col):
  c = conn.cursor()
  df = create_df(table_name)

  df[date_col] = pd.to_datetime(df[date_col], infer_datetime_format=True)
  curr_time = pd.to_datetime("now")
  df['date_diff_yrs'] =  np.floor((curr_time - df[date_col]) / np.timedelta64(1, 'Y'))
  
  c.close()
  return df

In [61]:
# Function calling
c = conn.cursor()

table_name = 'personal'
date_col = 'birthdate'
df = datediff(table_name, date_col)
df


Unnamed: 0,id,name,birthdate,age,date_diff_yrs
0,1,John,1999-03-01,23,23.0
1,2,Peter,2000-02-22,20,22.0
2,3,Emmly,1995-12-05,23,26.0
3,4,Json,1998-05-17,24,24.0
4,5,Rupa,1999-01-01,21,23.0
5,6,Mill,1980-06-19,42,42.0


In [62]:
c.close()

## **Creating table person_n**

In [111]:
c = conn.cursor()

# dropping an existing table
c.execute("DROP TABLE IF EXISTS personal_n")

c.execute('''CREATE TABLE personal_n (
  id INT, 
  name VARCHAR(30),
  birthdate DATE,
  age INT)
''')

data = [[1, 'John', '20/01/1999', 23], 
        [2, 'Peter', '02/22/2000', 20], 
        [3, 'Emmly', '12/05/1995', 23],
        [4, 'Json', '30/17/1998', 24],
        [5, 'Rupa', '01/01/1999', 21],
        [6, 'Mill', '06/19/1980', 42]]

personal_n = pd.DataFrame(data, columns=['id', 'name', 'birthdate', 'age'])


conn.commit()

personal_n.to_sql('personal_n', conn, if_exists='replace', index = False)


### **Check date format**

In [112]:
from dateutil.parser import parse

def is_date(date_str, fuzzy = False):
    try:
        parse(date_str, fuzzy = fuzzy)
        return date_str

    except ValueError:
        pass

In [147]:
import datetime

def date_struct(tab_name, col_name, format):
  table_name = tab_name
  df = create_df(table_name)
  i = 0
  while i < len(df[col_name]):
      date_group = is_date(str(df.loc[i, col_name]))
      if date_group == None:
        i = len(df[col_name])
        continue
      print(date_group)
      try:
        date_change = datetime.datetime.strptime(date_group, format).date()
        print(date_change)
      except ValueError:
        print(f'Incorrect {col_name} on column {i}')
        pass
      i = i + 1

In [148]:
tab_name = 'personal_n'
col_name = 'birthdate'
format = '%m/%d/%y'

date_struct(tab_name, col_name, format)

20/01/1999
Incorrect birthdate on column 0
02/22/2000
Incorrect birthdate on column 1
12/05/1995
Incorrect birthdate on column 2
