# LAB VISUAL ETL TRANSFORMATION SCRIPT

## Import Required Libraries

In [1]:
import getpass
import os
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import pyodbc
from sqlalchemy import create_engine

## MySQL Server Configuration

In [None]:
# Default connection settings for the source MySQL database. The user and
# password values are placeholders; real values are supplied at the prompts below.
default_mysql_config = {
    "MYSQL_USER": "{mysql_user}",
    "MYSQL_PASSWORD": "{mysql_pasword}",
    "MYSQL_HOST": "localhost",
    "MYSQL_DATABASE_NAME": "labdashdb",
    "MYSQL_PORT": "3306",
}

# Prompt for every setting; an empty answer keeps the default.
# The password is read with getpass so that it is neither echoed while typing
# nor printed as part of the prompt (and therefore not saved in the cell output).
mysql_config = {}
for key, default_value in default_mysql_config.items():
    if key == "MYSQL_PASSWORD":
        user_input = getpass.getpass(f"Enter {key} (leave blank to use the default): ").strip()
    else:
        user_input = input(f"Enter {key} (default is {default_value}): ").strip()
    mysql_config[key] = user_input if user_input else default_value

# URL-encode the credentials: characters such as '@', ':' or '/' in a user name
# or password would otherwise be parsed as URL delimiters.
mysql_url = (
    f"mysql+pymysql://{quote_plus(mysql_config['MYSQL_USER'])}:"
    f"{quote_plus(mysql_config['MYSQL_PASSWORD'])}@"
    f"{mysql_config['MYSQL_HOST']}:{mysql_config['MYSQL_PORT']}/"
    f"{mysql_config['MYSQL_DATABASE_NAME']}"
)

try:
    # NOTE: despite its name, mysql_conn is a SQLAlchemy Engine; the name is
    # kept because the loader functions below pass it to pd.read_sql.
    mysql_conn = create_engine(mysql_url)
    # create_engine() does not open a connection by itself, so open (and
    # immediately release) one to verify the settings before reporting success.
    with mysql_conn.connect():
        pass
    print(f"Connected to MySQL database: {mysql_config['MYSQL_DATABASE_NAME']}")
except Exception as e:
    print(f"Error connecting to MySQL: {e}")
    # Re-raise so that "Run All" stops here instead of continuing without a
    # working source connection.
    raise

Connected to MySQL database: labdashdb


## MSSQL Server Configuration

In [None]:
# Connection settings for the target SQL Server instance. The username and
# password values are placeholders to be substituted before running.
mssql_config = dict(
    server="localhost",
    database="master",
    username="{mssql_user}",
    password="{mssql_pasword}",
)

# Assemble the ODBC connection string one keyword per entry.
# NOTE(review): Trusted_Connection=yes is sent together with UID/PWD. With the
# Microsoft ODBC driver this normally selects integrated authentication and the
# UID/PWD pair is not used - confirm which authentication mode is intended.
mssql_connection_string = "".join(
    [
        "DRIVER={ODBC Driver 17 for SQL Server};",
        f"SERVER={mssql_config['server']};",
        f"DATABASE={mssql_config['database']};",
        f"UID={mssql_config['username']};",
        f"PWD={mssql_config['password']};",
        "Trusted_Connection=yes;",
    ]
)

# autocommit=True: every statement is committed as soon as it runs, which the
# database-level statements issued later in this notebook (DROP/CREATE DATABASE)
# rely on.
mssql_conn = pyodbc.connect(mssql_connection_string, autocommit=True)

# One shared cursor is used by all later cells.
mssql_cursor = mssql_conn.cursor()

## Create lab_visual_analysis database and its schemas

In [4]:
# Start from a clean slate: drop lab_visual_analysis when it already exists,
# then create it again. All previously loaded data in that database is lost.
recreate_database_sql = """
        IF EXISTS (SELECT * FROM sys.databases WHERE name = 'lab_visual_analysis') 
        BEGIN
            DROP DATABASE lab_visual_analysis;
        END
        CREATE DATABASE lab_visual_analysis;
    """
mssql_cursor.execute(recreate_database_sql)
mssql_conn.commit()
print("MSSQL database lab_visual_analysis dropped and recreated successfully.")

# Switch the session to the new database so the schema statements target it.
mssql_cursor.execute("USE lab_visual_analysis;")

# Schemas used by the pipeline; each one is created only when it is missing.
# CREATE SCHEMA is wrapped in EXEC(...) so it can sit inside the IF block.
schema_sql_template = """
            IF NOT EXISTS (SELECT * FROM sys.schemas WHERE name = '{schema}')
            BEGIN
                EXEC('CREATE SCHEMA {schema}');
            END
        """
schemas = ['source', 'derived', 'final', 'z', 'dbo']
for schema in schemas:
    mssql_cursor.execute(schema_sql_template.format(schema=schema))
mssql_conn.commit()
print("Schemas created successfully.")

MSSQL database lab_visual_analysis dropped and recreated successfully.
Schemas created successfully.


## Create Tables in the lab_visual_analysis database

In [5]:
def create_tables():
    """Create the four ``source`` staging tables if they do not exist yet.

    Uses the notebook-level ``mssql_cursor`` / ``mssql_conn``. Every CREATE
    statement is attempted even when an earlier one fails (best effort); each
    failure is reported with the table it belongs to, and the success message
    is printed only when all tables were created without error.

    Returns:
        None. Progress and errors are reported through ``print``.
    """
    # Keyed by table name so that a failure can be attributed to its table.
    table_creation_queries = {
        "source.tbl_Facilities": """
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'tbl_Facilities' AND schema_id = SCHEMA_ID('source'))
        CREATE TABLE source.tbl_Facilities (
            Id INT IDENTITY(1,1),
            HfrCode NVARCHAR(50) PRIMARY KEY,
            Name NVARCHAR(255),
            Region NVARCHAR(255),
            District NVARCHAR(255),
            Council NVARCHAR(255)
        );
        """,
        "source.tbl_Device_Logs": """
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'tbl_Device_Logs' AND schema_id = SCHEMA_ID('source'))
        CREATE TABLE source.tbl_Device_Logs (
            Id INT IDENTITY(1,1) PRIMARY KEY,
            DeviceName NVARCHAR(255),
            DeviceCode NVARCHAR(50),
            DateBrokenDown DATETIME2,
            DateReported DATETIME2,
            DateFixed DATETIME2,
            BreakDownReason NVARCHAR(255)
        );
        """,
        "source.tbl_Commodity_Transactions": """
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'tbl_Commodity_Transactions' AND schema_id = SCHEMA_ID('source'))
        CREATE TABLE source.tbl_Commodity_Transactions (
            Id INT IDENTITY(1,1) PRIMARY KEY,
            CommodityName NVARCHAR(255),
            CommodityCode NVARCHAR(50),
            BatchNumber NVARCHAR(50),
            TransactionDate DATETIME2,
            ExpireDate DATETIME2,
            TransactionType NVARCHAR(50),
            TransactionQuantity INT
        );
        """,
        "source.tbl_Sample": """
        IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'tbl_Sample' AND schema_id = SCHEMA_ID('source'))
        CREATE TABLE source.tbl_Sample (
            Id INT IDENTITY(1,1) PRIMARY KEY,
            sampletrackingid NVARCHAR(50),
            LabHfrCode NVARCHAR(50),
            HubHfrCode NVARCHAR(50),
            EntryModality NVARCHAR(50),
            SampleType NVARCHAR(255),
            TestName NVARCHAR(255),
            SampleQualityStatus NVARCHAR(50),
            Results NVARCHAR(255),
            SampleRejectionReason NVARCHAR(255),
            DeviceName NVARCHAR(255),
            DeviceCode NVARCHAR(50),
            CollectionDate DATETIME2,
            ReceivedDate DATETIME2,
            TestDate DATETIME2,
            AuthorisedDate DATETIME2,
            DispatchDate DATETIME2
        );
        """,
    }

    failed_tables = []
    for table_name, query in table_creation_queries.items():
        try:
            mssql_cursor.execute(query)
        except Exception as e:
            # Keep going so the remaining tables are still attempted, but
            # remember the failure so it is not reported as success below.
            failed_tables.append(table_name)
            print(f"Error creating table {table_name}: {e}")

    mssql_conn.commit()

    if failed_tables:
        print(f"Table creation finished with errors; failed: {', '.join(failed_tables)}")
    else:
        print("Tables created successfully.")

## Load Facility Data

In [6]:
def load_facility_data():
    """Load the health-facility register from Excel into ``source.tbl_Facilities``.

    Reads the workbook from the current working directory, keeps the
    HfrCode / Name / Region / District / Council columns, removes the word
    "Region" from the region name, and inserts one row per distinct HfrCode
    through the notebook-level ``mssql_cursor``.

    Rows without an HfrCode are skipped because HfrCode is the table's primary
    key, and missing cells (NaN) are sent to SQL Server as NULL.

    Returns:
        None. Prints the inserted row count, or an error message on failure.
        If the workbook is missing, a message is printed and nothing is loaded.
    """
    excel_file = "./All_Operating_Health_Facilities_in_Tanzania-Lab-Visual-2021oct22.xlsx"

    if not os.path.exists(excel_file):
        print(f"Excel file '{excel_file}' not found.")
        return

    target_columns = ['HfrCode', 'Name', 'Region', 'District', 'Council']
    facilities = (
        pd.read_excel(excel_file)
        .rename(columns={"Facility Number": "HfrCode", "Facility Name": "Name"})
        .loc[:, target_columns]
        .copy()  # independent frame, so the column assignment below is safe
    )
    facilities['Region'] = facilities['Region'].str.replace("Region", "", regex=False).str.strip()

    # HfrCode is the primary key of the target table: a missing or repeated
    # code would make the insert fail, so drop both before loading.
    facilities = (
        facilities
        .dropna(subset=['HfrCode'])
        .drop_duplicates(subset=['HfrCode'], keep='first')
    )

    # pyodbc has no notion of pandas' NaN/NaT markers; convert them to None so
    # they are stored as NULL.
    rows = [
        tuple(None if pd.isna(value) else value for value in record)
        for record in facilities.itertuples(index=False, name=None)
    ]

    insert_query = """
    INSERT INTO source.tbl_Facilities (HfrCode, Name, Region, District, Council)
    VALUES (?, ?, ?, ?, ?)
    """

    try:
        for params in rows:
            mssql_cursor.execute(insert_query, *params)
        mssql_conn.commit()
        print(f"{len(rows)} rows inserted into tbl_Facilities.")
    except Exception as e:
        print(f"Error inserting data: {e}")
        # NOTE(review): if the connection runs with autocommit enabled (as in
        # the MSSQL configuration cell of this notebook), rows inserted before
        # the failure are already committed and this rollback cannot undo them.
        mssql_conn.rollback()

## Load Device Logs Data

In [7]:
def load_device_log_data():
    """Copy device breakdown logs from MySQL into ``source.tbl_Device_Logs``.

    Reads the ``instrumentlogs2`` table through the notebook-level
    ``mysql_conn`` and inserts every row through ``mssql_cursor``. NULL values
    coming from MySQL (which pandas represents as NaN/NaT) are written as NULL.

    Returns:
        None. Prints the inserted row count, or an error message on failure.
    """
    query = """
        select
            deviceName as DeviceName,
            deviceCode as DeviceCode,
            dateBreakDown as DateBrokenDown,
            dateReported as DateReported,
            dateFixed as DateFixed,
            breakDownReason as BreakDownReason
        from
            instrumentlogs2
    """

    insert_query = """
        INSERT INTO source.tbl_Device_Logs (DeviceName, DeviceCode, DateBrokenDown, DateReported, DateFixed, BreakDownReason)
        VALUES (?, ?, ?, ?, ?, ?)
        """

    # Order must match the column list of the INSERT statement above.
    insert_columns = ['DeviceName', 'DeviceCode', 'DateBrokenDown', 'DateReported', 'DateFixed', 'BreakDownReason']

    try:
        device_logs = pd.read_sql(query, mysql_conn)

        # pyodbc has no notion of pandas' NaN/NaT markers (e.g. a device that
        # has no DateFixed yet); convert them to None so they become NULL.
        rows = [
            tuple(None if pd.isna(value) else value for value in record)
            for record in device_logs[insert_columns].itertuples(index=False, name=None)
        ]

        for params in rows:
            mssql_cursor.execute(insert_query, *params)

        mssql_conn.commit()
        print(f"{len(rows)} rows inserted into tbl_Device_Logs.")
    except Exception as e:
        print(f"Error fetching or inserting device logs: {e}")
        # NOTE(review): if the connection runs with autocommit enabled (as in
        # the MSSQL configuration cell of this notebook), rows inserted before
        # the failure are already committed and this rollback cannot undo them.
        mssql_conn.rollback()

## Load Commodity Transaction Data

In [8]:
def load_commodity_transaction_data():
    """Copy commodity transactions from MySQL into ``source.tbl_Commodity_Transactions``.

    Reads the ``commoditytransactions`` table through the notebook-level
    ``mysql_conn`` and inserts every row through ``mssql_cursor``. NULL values
    coming from MySQL (which pandas represents as NaN/NaT) are written as NULL.

    Returns:
        None. Prints the inserted row count, or an error message on failure.
    """
    query = """SELECT commodityName AS CommodityName, commodityCode AS CommodityCode, batchNo AS BatchNumber, transactionDate AS TransactionDate, 
                    expireDate AS ExpireDate, transactionType AS TransactionType, quantity AS TransactionQuantity FROM commoditytransactions"""

    insert_query = """
        INSERT INTO source.tbl_Commodity_Transactions (CommodityName, CommodityCode, BatchNumber, TransactionDate, ExpireDate, TransactionType, TransactionQuantity)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        """

    # Order must match the column list of the INSERT statement above.
    insert_columns = [
        'CommodityName', 'CommodityCode', 'BatchNumber', 'TransactionDate',
        'ExpireDate', 'TransactionType', 'TransactionQuantity',
    ]

    try:
        commodity_transactions = pd.read_sql(query, mysql_conn)

        # pyodbc has no notion of pandas' NaN/NaT markers; convert them to
        # None so missing dates and quantities are stored as NULL.
        rows = [
            tuple(None if pd.isna(value) else value for value in record)
            for record in commodity_transactions[insert_columns].itertuples(index=False, name=None)
        ]

        for params in rows:
            mssql_cursor.execute(insert_query, *params)

        mssql_conn.commit()
        print(f"{len(rows)} rows inserted into tbl_Commodity_Transactions.")

    except Exception as e:
        print(f"Error fetching or inserting commodity transactions: {e}")
        # NOTE(review): if the connection runs with autocommit enabled (as in
        # the MSSQL configuration cell of this notebook), rows inserted before
        # the failure are already committed and this rollback cannot undo them.
        mssql_conn.rollback()

## Load Sample Data

In [9]:
def load_sample_data():
    """Copy recent lab-test samples from MySQL into ``source.tbl_Sample``.

    Selects the rows of ``tbl_labtests`` where at least one of the tracked
    dates falls within the last two months, derives ``LabHfrCode`` /
    ``HubHfrCode`` from ``facilityHfrID`` according to ``EntryModality``
    ('lab' rows fill LabHfrCode, every other row fills HubHfrCode), and
    inserts the result through the notebook-level ``mssql_cursor``. NULL
    values coming from MySQL (NaN/NaT in pandas) are written as NULL.

    Returns:
        None. Prints the processed row count, "No sample data found." when the
        query returns nothing, or an error message on failure.
    """
    # NOTE(review): DispatchDate is populated from resultAuthorisedDate, the
    # same source column as AuthorisedDate - confirm this is intended (the
    # WHERE clause also references a dateResultSentHub column).
    query = """
    SELECT DISTINCT trackingID as sampletrackingid, 
                    facilityHfrID, 
                    sampleType as SampleType, 
                    testName as TestName, 
                    sampleQuality as SampleQualityStatus, 
                    rejectionReason as SampleRejectionReason, 
                    sampleCollectionDate as CollectionDate, 
                    dateReceivedLab as ReceivedDate, 
                    results as Results, 
                    testedDate as TestDate, 
                    resultAuthorisedDate as AuthorisedDate, 
                    resultAuthorisedDate as DispatchDate,
                    testInstrument as DeviceName,
                    NULL as DeviceCode,
                    IF(SUBSTR(trackingID, 1, 4) = 'BC03', 'lab', 'hub') as EntryModality
    FROM tbl_labtests
    WHERE sampleCollectionDate >= DATE_SUB(CURDATE(), INTERVAL 2 MONTH)
    OR dateSentLab >= DATE_SUB(CURDATE(), INTERVAL 2 MONTH)
    OR dateReceivedLab >= DATE_SUB(CURDATE(), INTERVAL 2 MONTH)
    OR registeredDate >= DATE_SUB(CURDATE(), INTERVAL 2 MONTH)
    OR testedDate >= DATE_SUB(CURDATE(), INTERVAL 2 MONTH)
    OR resultAuthorisedDate >= DATE_SUB(CURDATE(), INTERVAL 2 MONTH)
    OR dateResultSentHub >= DATE_SUB(CURDATE(), INTERVAL 2 MONTH)
    """

    # Built once, outside the row loop.
    insert_query = """
            INSERT INTO source.tbl_Sample (sampletrackingid, LabHfrCode, HubHfrCode, EntryModality, SampleType, 
                                        TestName, SampleQualityStatus, Results, SampleRejectionReason, DeviceName, 
                                        DeviceCode, CollectionDate, ReceivedDate, TestDate, AuthorisedDate, DispatchDate)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """

    # Order must match the column list of the INSERT statement above.
    insert_columns = [
        'sampletrackingid', 'LabHfrCode', 'HubHfrCode', 'EntryModality', 'SampleType',
        'TestName', 'SampleQualityStatus', 'Results', 'SampleRejectionReason', 'DeviceName',
        'DeviceCode', 'CollectionDate', 'ReceivedDate', 'TestDate', 'AuthorisedDate', 'DispatchDate',
    ]

    try:
        sample_data = pd.read_sql(query, mysql_conn)
        if sample_data.empty:
            print("No sample data found.")
            return

        # Route the facility code to exactly one of the two columns. Working on
        # an object-typed copy keeps the codes' original Python type; the side
        # that does not apply is left missing and becomes NULL below.
        is_lab = sample_data['EntryModality'] == 'lab'
        hfr_codes = sample_data['facilityHfrID'].astype(object)
        sample_data['LabHfrCode'] = hfr_codes.where(is_lab)
        sample_data['HubHfrCode'] = hfr_codes.where(~is_lab)

        # pyodbc has no notion of pandas' NaN/NaT markers (e.g. samples that
        # have not been tested yet); convert them to None so they become NULL.
        rows = [
            tuple(None if pd.isna(value) else value for value in record)
            for record in sample_data[insert_columns].itertuples(index=False, name=None)
        ]

        for params in rows:
            mssql_cursor.execute(insert_query, *params)

        mssql_conn.commit()
        print(f"{len(rows)} rows processed for tbl_Sample.")

    except Exception as e:
        # NOTE(review): if the connection runs with autocommit enabled (as in
        # the MSSQL configuration cell of this notebook), rows inserted before
        # the failure are already committed and this rollback cannot undo them.
        mssql_conn.rollback()
        print(f"Error inserting data: {e}")
## Read the stored procedures file

In [10]:
def read_stored_procedures_file():
    """Run ``./create-stored-procedures.sql`` batch by batch against SQL Server.

    The script is split into batches at lines that contain only the ``GO``
    separator (any letter case, optional trailing semicolon). ``GO`` appearing
    inside other text - identifiers such as ``CATEGORY``, keywords such as
    ``GOTO``, string literals - no longer cuts a batch in half.

    Batches that are empty or contain nothing but comments are skipped; a batch
    that merely *starts* with a comment is executed in full. Each batch is
    executed and committed on its own, failures are reported and counted, and
    the success message is printed only when every batch ran without error.

    Returns:
        None. Progress and errors are reported through ``print``.
    """
    sql_file = "./create-stored-procedures.sql"
    if not os.path.exists(sql_file):
        print(f"SQL file '{sql_file}' not found.")
        return

    with open(sql_file, "r") as file:
        sql_commands = file.read()

    # GO is a client-side batch separator, not T-SQL, and is only meaningful
    # when it stands alone on a line.
    sql_batches = re.split(
        r"^[ \t]*GO[ \t]*;?[ \t]*$",
        sql_commands,
        flags=re.IGNORECASE | re.MULTILINE,
    )

    failed_batches = 0
    for batch in sql_batches:
        batch = batch.strip()

        # Only used to decide whether there is anything to run: remove block
        # and line comments and see if any statement text is left.
        statement_text = re.sub(r"/\*.*?\*/|--[^\n]*", "", batch, flags=re.DOTALL)
        if not statement_text.strip():
            continue

        try:
            mssql_cursor.execute(batch)
            mssql_conn.commit()
        except Exception as e:
            failed_batches += 1
            print(f"Error loading batch: {e}")
            mssql_conn.rollback()

    if failed_batches:
        print(f"Stored procedures loaded with {failed_batches} failed batch(es).")
    else:
        print("Stored procedures loaded successfully.")

## Execute Data Transformation

In [11]:
def execute_stored_procedures():
    """Run the four ``sp_data_processing`` procedures, then close the connection.

    The procedures are executed in the order dbo, z, derived, final through the
    notebook-level ``mssql_cursor``. Execution stops at the first procedure
    that raises; the error is printed and a rollback is requested.

    The SQL Server connection is always closed at the end, whether or not the
    procedures succeeded, so this is meant to be the last step of the
    pipeline. (The previous check ``'mssql_conn' in locals()`` could never be
    true for a notebook-level variable, so the connection was never closed.)

    Returns:
        None. Progress and errors are reported through ``print``.
    """
    sp_main = ["dbo.sp_data_processing", "z.sp_data_processing", "derived.sp_data_processing", "final.sp_data_processing"]

    try:
        for sp in sp_main:
            statement = f"EXEC {sp}"
            print(f"Executing: {statement}")
            mssql_cursor.execute(statement)

            # Step through any remaining result sets so the whole procedure
            # has been consumed before the next one starts.
            while mssql_cursor.nextset():
                pass

            print(f"Stored procedure {statement} executed successfully.")
        mssql_conn.commit()

    except Exception as e:
        print(f"Error executing stored procedure: {e}")
        # NOTE(review): if the connection runs with autocommit enabled (as in
        # the MSSQL configuration cell of this notebook), work already done by
        # the procedures is committed and this rollback cannot undo it.
        mssql_conn.rollback()
    finally:
        mssql_conn.close()
        print("Database connection closed.")


In [None]:
# Run the whole ETL in order: create the staging tables, load each source,
# install the stored procedures and finally execute them.
etl_steps = (
    create_tables,
    load_facility_data,
    load_sample_data,
    load_device_log_data,
    load_commodity_transaction_data,
    read_stored_procedures_file,
    execute_stored_procedures,
)
for run_step in etl_steps:
    run_step()

Tables created successfully.
9884 rows inserted into tbl_Facilities.
8981 rows processed for tbl_Sample.
0 rows inserted into tbl_Device_Logs.
0 rows inserted into tbl_Commodity_Transactions.
Stored procedures loaded successfully.
Executing: EXEC dbo.sp_data_processing
Stored procedure EXEC dbo.sp_data_processing executed successfully.
Executing: EXEC z.sp_data_processing
Stored procedure EXEC z.sp_data_processing executed successfully.
Executing: EXEC derived.sp_data_processing
Stored procedure EXEC derived.sp_data_processing executed successfully.
Executing: EXEC final.sp_data_processing
Stored procedure EXEC final.sp_data_processing executed successfully.
