#### ETL Processes

> I would be using this notebook to develop the ETL process for each of the database tables before completing the etl.py file to load the whole datasets.

import libraries

In [1]:
# !pip install psycopg2-binary

In [2]:
import os
import glob
import psycopg2
import pandas as pd 
import sys
import numpy as np
from dotenv import load_dotenv

In [3]:
load_dotenv('../.env')

True

create a database connection

In [4]:
def get_connection(user:str, host:str,database:str, port:str=None, password:str= None) -> psycopg2.extensions.cursor:
    try:
        
            if password is None:
        
                password = os.getenv('PASSWORD')
            
            connection = psycopg2.connect(user = user,
                                          password = password,
                                          host = host,
                                          port = port,
                                          database = database)
            # perform autocommit on queries
            connection.set_session(autocommit=True)
            cursor = connection.cursor()
            # Print PostgreSQL Connection properties
            print ( connection.get_dsn_parameters(),"\n")

            # Print PostgreSQL version
            cursor.execute("SELECT version();")
            record = cursor.fetchone()
            print("You are connected to - ", record,"\n") 
    except (Exception, psycopg2.Error) as error :
            print ("Error while connecting to PostgreSQL", error)
            sys.exit(1)
    return connection

In [5]:
connection = get_connection(user='postgres', host='127.0.0.1',
              database='moodle')

{'user': 'postgres', 'dbname': 'moodle', 'host': '127.0.0.1', 'port': '5432', 'tty': '', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 

You are connected to -  ('PostgreSQL 12.4 (Ubuntu 12.4-1.pgdg18.04+1) on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0, 64-bit',) 



> define function to execute database queries

In [6]:
def execute_query(query, con, fetch_type:int):
    
    """
    fetch_type:
        1 -> fetchone
        2 -> fetchmany
        3 -> fetchall
    """
    
    try:
        cur = con.cursor() 
        cur.execute(query)
        
        if fetch_type == 1:
            result = cur.fetchone()
        if fetch_type == 2:
            result == cur.fetchmany()
        if fetch_type == 3:
            result = cur.fetchall()
            for row in result:
                print(row)
            
            
    except psycopg2.DatabaseError as e:
        print(f'Error {e}')
        sys.exit(1)
    
    return result

In [7]:
# fetch_tables = ("select * from mdl_course;")
# execute_query(query=fetch_tables, con=connection, fetch_type=3)[0]

#### Tables to be queried in the mooodle database

● mdl_logstore_standard_log -*

● mdl_context *

● mdl_user *

● mdl_course *

● mdl_modules *

● mdl_course_modules *

● mdl_course_modules_completion *

● mdl_grade_items

● mdl_grade_grades *

● mdl_grade_categories

● mdl_grade_items_history

● mdl_grade_grades_history

● mdl_grade_categories_history

● mdl_forum

● mdl_forum_discussions

● mdl_forum_posts

> create a list to store tables names

In [8]:
moodle_tables = ['mdl_logstore_standard_log',
     
'mdl_context',
     
'mdl_user',
     
'mdl_course ',
     
'mdl_modules',
     
'mdl_course_modules ',
     
'mdl_course_modules_completion',
     
'mdl_grade_items',
     
'mdl_grade_grades',

'mdl_grade_categories',

'mdl_grade_items_history',

'mdl_grade_grades_history',

'mdl_grade_categories_history',
     
'mdl_forum',
     
'mdl_forum_discussions',
     
'mdl_forum_posts'
    ]

> write function to parse sql tables to pandas

In [9]:
def get_tables(table_names:list, con) -> dict:
    
    tables = dict()
    for table in table_names:
        
        table_key = table.split('_')[-1]
        
        print("getting table for "+table_key)
        
        tables[table_key] = pd.read_sql_query(sql =f'SELECT * FROM {table}', con=con)
        
    return tables

In [10]:
tables = get_tables(table_names=moodle_tables, con=connection)

getting table for log
getting table for context
getting table for user
getting table for course 
getting table for modules
getting table for modules 
getting table for completion
getting table for items
getting table for grades
getting table for categories
getting table for history
getting table for history
getting table for history
getting table for forum
getting table for discussions
getting table for posts


In [11]:
type(tables)

dict

In [12]:
tables.keys()

dict_keys(['log', 'context', 'user', 'course ', 'modules', 'modules ', 'completion', 'items', 'grades', 'categories', 'history', 'forum', 'discussions', 'posts'])

> write function to save table objects to csv

In [21]:
saved_tables_path = '../data/retrieved_tables'

In [29]:
def save_tables(path:str, tables:dict, index:bool=False, date_format:str='%Y%m%d'):
    for table_name, table in tables.items(): 
        file_name= f"/{table_name}.csv"
        print(f"Saving sql table {table_name} to -> "+path+file_name)
        table.to_csv(path_or_buf=path+file_name, index=index, date_format=date_format)

In [30]:
save_tables(path=saved_tables_path, tables=tables)

Saving sql table log to -> ../data/retrieved_tables/log.csv
Saving sql table context to -> ../data/retrieved_tables/context.csv
Saving sql table user to -> ../data/retrieved_tables/user.csv
Saving sql table course  to -> ../data/retrieved_tables/course .csv
Saving sql table modules to -> ../data/retrieved_tables/modules.csv
Saving sql table modules  to -> ../data/retrieved_tables/modules .csv
Saving sql table completion to -> ../data/retrieved_tables/completion.csv
Saving sql table items to -> ../data/retrieved_tables/items.csv
Saving sql table grades to -> ../data/retrieved_tables/grades.csv
Saving sql table categories to -> ../data/retrieved_tables/categories.csv
Saving sql table history to -> ../data/retrieved_tables/history.csv
Saving sql table forum to -> ../data/retrieved_tables/forum.csv
Saving sql table discussions to -> ../data/retrieved_tables/discussions.csv
Saving sql table posts to -> ../data/retrieved_tables/posts.csv
