In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import configparser
import pathlib
import logging
from datetime import datetime
import sys

In [2]:
# List the parent (project root) directory to confirm the expected folder layout.
ls ./..

Archive/   Data/      Logs/      README.md
Config/    DataModel/ Notebook/  SOW/


In [3]:
# List the source data files available in the Data folder.
ls ./../Data

AUS.xlsx  IND.csv   USA.csv


In [4]:
def getSource_File_Path(logger, conffilepath=None):
    """Return the source-data directory configured in configfile.ini.

    Parameters
    ----------
    logger : logging.Logger
        Logger used for progress and error messages.
    conffilepath : str or path-like, optional
        Location of the configuration file. When omitted, the original
        hard-coded project location is used, so existing callers are
        unaffected.

    Returns
    -------
    str or int
        The value of ``[parameters] filepath`` on success, or the integer
        ``1`` when the configuration file or the parameter is missing
        (callers compare the result against ``1``).
    """
    if conffilepath is None:
        # Default kept for backward compatibility with existing callers.
        conffilepath = '/Volumes/E-Study/Github/Data-Engineering-Vaccination-Metrics/Config/configfile.ini'

    logger.info("[INFO]: Reading configuration file.")
    configfile = pathlib.Path(conffilepath)

    if not configfile.exists():
        logger.error("[ERROR]: configfile.ini is missing.")
        logger.error("[ERROR]: configfile.ini expected location: " + str(conffilepath))
        return 1

    logger.info("[INFO]: Configuration file: " + str(configfile))
    config_obj = configparser.ConfigParser()
    config_obj.read(configfile)

    try:
        # KeyError covers both a missing [parameters] section and a missing key.
        srcfilepath = config_obj['parameters']['filepath']
    except KeyError:
        logger.error("[ERROR]: filepath parameter not found in the configfile.ini.")
        return 1

    logger.info("[INFO]: Source file location: " + srcfilepath)
    return srcfilepath

In [5]:
def getLog_File_Path(logger, conffilepath=None):
    """Return the log location configured in configfile.ini.

    Parameters
    ----------
    logger : logging.Logger
        Logger used for progress and error messages.
    conffilepath : str or path-like, optional
        Location of the configuration file. When omitted, the original
        hard-coded project location is used, so existing callers are
        unaffected.

    Returns
    -------
    str or int
        The value of ``[parameters] logpath`` on success, or the integer
        ``1`` when the configuration file or the parameter is missing.
    """
    if conffilepath is None:
        # Default kept for backward compatibility with existing callers.
        conffilepath = '/Volumes/E-Study/Github/Data-Engineering-Vaccination-Metrics/Config/configfile.ini'

    configfile = pathlib.Path(conffilepath)

    if not configfile.exists():
        logger.error("[ERROR]: configfile.ini is missing.")
        # Report where the file was looked for, so the failure is actionable.
        logger.error("[ERROR]: configfile.ini expected location: " + str(conffilepath))
        return 1

    logger.info("[INFO]: Configuration file: " + str(configfile))
    config_obj = configparser.ConfigParser()
    config_obj.read(configfile)

    try:
        # KeyError covers both a missing [parameters] section and a missing key.
        logpath = config_obj['parameters']['logpath']
    except KeyError:
        logger.error("[ERROR]: logpath parameter not found in the configfile.ini.")
        return 1

    logger.info("[INFO]: Log file location: " + logpath)
    return logpath

In [6]:
def isValid_File_Type(f):
    """Tell whether a file name has an accepted source-file type.

    Accepted names are Excel (``.xlsx``) and CSV (``.csv``) files, matched
    case-insensitively so mixed-case extensions such as ``.Csv`` are not
    rejected. Names starting with a dot (hidden files) are also reported
    as valid, as in the original implementation.

    Parameters
    ----------
    f : str
        File name (not a full path is required; only the prefix and suffix
        of the string are inspected).

    Returns
    -------
    bool
        ``True`` for an accepted name, ``False`` otherwise.
    """
    if f.startswith('.'):
        return True
    # str.endswith accepts a tuple of suffixes; lower() makes the match case-insensitive.
    return f.lower().endswith(('.xlsx', '.csv'))

In [7]:
def init_log():
    """Configure root logging and return the root logger.

    The log file name is the configured log path followed by the current
    date (``YYYY_MM_DD``) and ``.log``; the path is concatenated as-is, so
    it is used as a prefix. The file is opened in write mode, so a re-run
    on the same day overwrites that day's file.

    If the log path cannot be read from the configuration
    (``getLog_File_Path`` returns its ``1`` error code) or the log file
    cannot be opened, logging falls back to the console instead of raising.

    Returns
    -------
    logging.Logger
        The root logger.
    """
    log_format = '%(asctime)s %(message)s'
    logger = logging.getLogger()
    logpath = getLog_File_Path(logger)

    if isinstance(logpath, str):
        logfile = logpath + datetime.now().strftime('%Y_%m_%d') + '.log'
        try:
            logging.basicConfig(filename=logfile,
                                format=log_format,
                                filemode='w',
                                force=True)
            return logger
        except OSError as err:
            failure = "[ERROR]: Unable to open log file " + logfile + ": " + str(err)
    else:
        # getLog_File_Path signals failure with the integer 1, which cannot
        # be concatenated into a file name.
        failure = "[ERROR]: Log file location is not available from configfile.ini."

    # Fallback: console logging so the job can still report what went wrong.
    logging.basicConfig(format=log_format, force=True)
    logger.error(failure)
    logger.error("[ERROR]: Logging to the console instead of a log file.")
    return logger

In [8]:
def abort(code=1):
    """Stop execution by raising ``SystemExit``.

    Parameters
    ----------
    code : int, optional
        Exit status passed to ``sys.exit``. Defaults to ``1`` so that an
        abort is reported as a failure rather than as a successful exit.
    """
    sys.exit(code)
    

In [20]:
def job_Housekeeping(logger):
    """Validate the types of the files present in the configured source folder.

    Parameters
    ----------
    logger : logging.Logger
        Logger used for progress and error messages.

    Returns
    -------
    int
        ``0`` when every non-hidden entry in the source folder has an
        accepted file type; ``1`` when the source location cannot be
        determined or listed, or when the first invalid entry is found.
    """
    logger.info("[INFO]: Starting the full feed process.")

    filepath = getSource_File_Path(logger)

    if filepath == 1:
        return 1

    logger.info("[INFO]: Validating the file types received from source.")
    try:
        files = os.listdir(filepath)
    except OSError as err:
        # A missing or unreadable folder is reported through the same
        # return-code convention as the other failures instead of raising.
        logger.error("[ERROR]: Unable to list source file location: " + str(err))
        return 1

    # Hidden entries (names starting with '.') are not validated.
    files = [f for f in files if not f.startswith('.')]
    for f in files:
        if isValid_File_Type(f):
            logger.info("[INFO]: Valid file type. Filename is: " + f)
        else:
            logger.error("[ERROR]: Invalid file type received from source. Filename is: " + f)
            return 1
    return 0

In [21]:
# Set up logging; init_log returns the root logger. DEBUG level lets the
# [INFO] messages emitted by the job through.
logger = init_log()
logger.setLevel(logging.DEBUG)

# Run the housekeeping checks; a non-zero return code means a check failed,
# in which case the failure is logged and execution is stopped.
jb_hkeep_rc = job_Housekeeping(logger)
if (jb_hkeep_rc):
    logger.error("[ERROR]: Aborting the housekeeping job.")
    abort()

In [11]:
# Load the Australia feed from the Excel workbook and preview the first rows.
# NOTE(review): the displayed preview shows 'Date of Vaccination' holding both
# timestamps and the text '2021-13-13' (not a valid calendar date), so the
# column presumably needs cleaning before it can be parsed as dates — confirm.
df_aus = pd.read_excel('./../Data/AUS.xlsx')
df_aus.head()

Unnamed: 0,Unique ID,Patient Name,Vaccine Type,Date of Birth,Date of Vaccination
0,1,Mike,LMN,NaT,2022-05-11 00:00:00
1,2,Jonnathan,XYZ,1997-12-13,2021-13-13
2,3,Cristina,ABC,1998-03-12,2022-03-12 00:00:00


In [24]:
# Count missing values per column in the Australia feed.
df_aus.isna().sum()

Unique ID              0
Patient Name           0
Vaccine Type           0
Date of Birth          1
Date of Vaccination    0
dtype: int64

In [25]:
# Load the India feed from CSV and preview the first rows.
# NOTE(review): the path is relative to the notebook's working directory rather
# than the source location read from configfile.ini — confirm this is intended.
df_ind = pd.read_csv('./../Data/IND.csv')
df_ind.head()

Unnamed: 0,ID,Name,DOB,VaccinationType,VaccinationDate,Free or Paid
0,1,Vikas,1998-12-01,XYZ,2022-01-01,F
1,2,Rahul,1982-08-13,ABC,2022-03-05,P
2,3,Sameer,1952-08-13,ABC,2022-02-20,F


In [26]:
# Count missing values per column in the India feed.
df_ind.isna().sum()

ID                 0
Name               0
DOB                0
VaccinationType    0
VaccinationDate    0
Free or Paid       0
dtype: int64

In [27]:
# Load the USA feed from CSV and preview the first rows.
# NOTE(review): the displayed preview shows VaccinationDate as integers such as
# 6152022 and 12282021 — presumably MMDDYYYY with any leading zero dropped by
# numeric parsing. Consider reading the column as text (dtype=str) — confirm
# against the raw file.
df_usa = pd.read_csv('./../Data/USA.csv')
df_usa.head()

Unnamed: 0,ID,Name,VaccinationType,VaccinationDate
0,1,Sam,EFG,6152022
1,2,John,XYZ,1052022
2,3,Mike,ABC,12282021


In [28]:
# Count missing values per column in the USA feed.
df_usa.isna().sum()

ID                 0
Name               0
VaccinationType    0
VaccinationDate    0
dtype: int64