<a href="https://colab.research.google.com/github/SupunGurusinghe/sqlite-plus-colab/blob/main/sg_project1_dimensionally_structured.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dimensionally Structured Columns**

### **SQLite table creation**

### **Background**

> **Scenario:** There is database table `personal` to store personal data. Column names and their data types are mentioned below.

**Special words**
* `personal`Table name
* `id` Personal id (primary key)
*  `name` Name of person
* `birthdate` Birthday of person
* `age` Age of person

---
**Data types**
* `id` Integer
*  `name` Varchar(30)
* `birthdate` Date
* `age` Integer

---
**Output:** All the set of rows of table



In [1]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('test_database')
c = conn.cursor()

# dropping an existing table
c.execute("DROP TABLE IF EXISTS personal")

c.execute('''CREATE TABLE personal (
  id INT, 
  name VARCHAR(30),
  birthdate DATE,
  age INT)
''')

data = [[1, 'John', '03/01/1999', 23], 
        [2, 'Peter', '02/22/2000', 20], 
        [3, 'Emmly', '12/05/1995', 23],
        [4, 'Json', '05/17/1998', 24],
        [5, 'Rupa', '01/01/1999', 21],
        [6, 'Mill', '06/19/1980', 42]]

personal = pd.DataFrame(data, columns=['id', 'name', 'birthdate', 'age'])


conn.commit()

personal.to_sql('personal', conn, if_exists='replace', index = False)

c.execute('''
  SELECT *
  FROM personal
''')

for row in c.fetchall():
  print(row)

(1, 'John', '03/01/1999', 23)
(2, 'Peter', '02/22/2000', 20)
(3, 'Emmly', '12/05/1995', 23)
(4, 'Json', '05/17/1998', 24)
(5, 'Rupa', '01/01/1999', 21)
(6, 'Mill', '06/19/1980', 42)


In [2]:
c.close()

#### **Convert table to dataframe**

In [3]:
def create_df(table_name):
  """
    Convert all values of a table in to a dataframe

    Parameters
    ----------
    table_name : str
        Table name to be considered

    Variables
    ----------
    names: list of tuples
        All the column names of {table_name}
    result_list: list
        List of column names of {table_name}
    df: dataframe
        All the rows and columns of {table_name}

    Returns
    -------
    df: dataframe
        All the rows and columns of {table_n}
  """
  c = conn.cursor()
  # create dataframe from a table
  c.execute("SELECT name FROM pragma_table_info(?) ORDER BY cid", [table_name])
  names = c.fetchall()

  result_list = []
  for name in names:
    result_list.append(name[0])

  c.execute(f'SELECT * FROM {table_name}')
  results = c.fetchall()

  df = pd.DataFrame(results, columns= result_list)
  c.close()
  return df



### **Calculate time difference with respect to current date**

In [4]:
from datetime import datetime
import numpy as np
from dateutil.relativedelta import relativedelta

def datediff(table_name, date_col):
  """
    Take date difference with the current date

    Parameters
    ----------
    table_name : str
        Table name to be considered
    date_col : str
        Column name to be considered

    Variables
    ----------
    curr_time: datetime
        Current date and time
    df: dataframe
        All the rows and columns of {table_name}

    Returns
    -------
    df: dataframe
        All the rows and columns of {table_n} and additional date difference 
        column in years
  """
  c = conn.cursor()
  df = create_df(table_name)

  df[date_col] = pd.to_datetime(df[date_col], infer_datetime_format=True)
  curr_time = pd.to_datetime("now")
  df['date_diff_yrs'] =  np.floor((curr_time - df[date_col]) / np.timedelta64(1, 'Y'))
  
  c.close()
  return df

In [5]:
# Function calling
c = conn.cursor()

table_name = 'personal'
date_col = 'birthdate'
df = datediff(table_name, date_col)
df


Unnamed: 0,id,name,birthdate,age,date_diff_yrs
0,1,John,1999-03-01,23,23.0
1,2,Peter,2000-02-22,20,22.0
2,3,Emmly,1995-12-05,23,26.0
3,4,Json,1998-05-17,24,24.0
4,5,Rupa,1999-01-01,21,23.0
5,6,Mill,1980-06-19,42,42.0


In [6]:
c.close()

## **Creating table personal_n**

### **Background**

> **Scenario:** There is database table `personal_n` to store personal data. Column names and their data types are mentioned below.

**Special words**
* `personal_n`Table name
* `id` Personal id (primary key)
*  `name` Name of person
* `birthdate` Birthday of person
* `age` Age of person

---
**Data types**
* `id` Integer
*  `name` Varchar(30)
* `birthdate` Date
* `age` Integer

---
**Output:** All the set of rows of table



In [40]:
c = conn.cursor()

# dropping an existing table
c.execute("DROP TABLE IF EXISTS personal_n")

c.execute('''CREATE TABLE personal_n (
  id INT, 
  name VARCHAR(30),
  birthdate DATE,
  age INT)
''')

data = [[1, 'John', '20/01/1999', 23], 
        [2, 'Peter', '02/22/2000', 20], 
        [3, 'Emmly', '12/05/1995', 23],
        [4, 'Json', '30/17/1998', 24],
        [5, 'Rupa', '01/01/1999', 21],
        [6, 'Mill', '06/19/1980', 42]]

personal_n = pd.DataFrame(data, columns=['id', 'name', 'birthdate', 'age'])


conn.commit()

personal_n.to_sql('personal_n', conn, if_exists='replace', index = False)


### **Check date format**

In [41]:
from dateutil.parser import parse

def is_date(date_str, fuzzy = False):
  """
    Checking string is string

    Parameters
    ----------
    date_str : str
        Date value
    fuzzy : boolean
        Always False

    Variables
    ----------
    None

    Returns
    -------
    date_str: str
        Date value
  """
  try:
      parse(date_str, fuzzy = fuzzy)
      return date_str

  except ValueError:
      pass

In [42]:
import datetime
from dateutil import parser

def date_struct(tab_name, col_name, format):
  """
    Take date difference with the current date

    Parameters
    ----------
    table_name: str
        Table name to be considered
    col_name: str
        Column name to be considered
    format: str
        Date format needed to check

    Variables
    ----------
    table_name: str
        Table name to be considered
    df: dataframe
        All the rows and columns of {table_name}
    date_group: date
        Dates by is_date function
    date_change: datetime
        Incorrectly formatted dates

    Returns
    -------
    None
  """
  table_name = tab_name
  df = create_df(table_name)
  i = 0
  while i < len(df[col_name]):
      date_group = is_date(str(df.loc[i, col_name]))
      if date_group == None:
        i = len(df[col_name])
        continue
      print(date_group)
      try:
        date_change = datetime.datetime.strptime(date_group, format)
        # print(date_change)
      except ValueError:
        print(f'Incorrectly formatted {col_name} on record {i}')
        pass
      i = i + 1

In [43]:
tab_name = 'personal_n'
col_name = 'birthdate'
format = '%m/%d/%y'

date_struct(tab_name, col_name, format)

20/01/1999
Incorrectly formatted birthdate on record 0
02/22/2000
Incorrectly formatted birthdate on record 1
12/05/1995
Incorrectly formatted birthdate on record 2
