### Data Quality

In [None]:
import pandas as pd 

In [None]:
# Load the sales records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/sales_data.csv')

In [None]:
# Calculate total sales. Nothing is validated here: the 'sales' column is
# summed exactly as it was loaded.
total_sales = df.loc[:, 'sales'].sum()

Improve the Code to Avoid the Pitfalls of Data Quality Assumptions

In [None]:
# Convert 'sales' to a numeric dtype. errors='coerce' turns values that cannot
# be parsed into NaN instead of raising.
sales_numeric = pd.to_numeric(df['sales'], errors='coerce')

# A value that was present before the conversion but is NaN afterwards was
# invalid. Report how many there are so that bad rows do not disappear from
# later totals unnoticed.
invalid_sales_count = int((sales_numeric.isna() & df['sales'].notna()).sum())
if invalid_sales_count:
    print(f"Warning: {invalid_sales_count} invalid 'sales' value(s) were converted to NaN")

df['sales'] = sales_numeric

In [None]:
# Calculate total sales; NaN entries are skipped by the sum
total_sales = df['sales'].sum(skipna=True)

### Implementing Logging in Python

In [1]:
# Import Built-In Logging Module in Python
import logging

In [None]:
# Create a logger. getLogger returns the same object for the same name, so
# re-running this cell reuses the logger created by the previous run.
logger = logging.getLogger('etl_logger')

# Set the level of the logger (DEBUG, INFO, WARNING, ERROR, CRITICAL)
logger.setLevel(logging.INFO)

# Create a formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a file handler to log messages to a file
file_handler = logging.FileHandler('etl_log.log')

# Add the formatter to the handler
file_handler.setFormatter(formatter)

# Because the logger is shared across runs of this cell, handlers attached by
# an earlier run are still on it. Detach and close any that write to the same
# file; otherwise every message would be written once per run of this cell.
stale_handlers = [
    handler for handler in logger.handlers
    if isinstance(handler, logging.FileHandler)
    and handler.baseFilename == file_handler.baseFilename
]
for stale_handler in stale_handlers:
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Add the handler to the logger
logger.addHandler(file_handler)

In [None]:
# An example of using the logger
def etl_process():
    """Run the ETL steps, logging the start and either the success or the failure.

    Any exception raised inside the process is logged at ERROR level together
    with its traceback and is NOT re-raised, so this function returns None
    whether the process succeeded or failed.
    """
    try:
        logger.info('Starting ETL process')

        # Your ETL code here...

        # for example:

        # extract()
        # transform()
        # load()

        logger.info('ETL process completed successfully')

    except Exception:
        # logger.exception logs at ERROR level and attaches the traceback of
        # the exception currently being handled.
        logger.exception('ETL process failed')

### Checkpoint for Recovery

In [None]:
# Sample Extract Logging
def extract():
    """Log the start and the end of the extraction step (the step itself is a placeholder)."""
    start_message, done_message = 'Starting extraction', 'Extraction completed'

    logger.info(start_message)

    # extraction code here...

    logger.info(done_message)

In [None]:
# Sample Transform Logging
def transform():
    """Log the start and the end of the transformation step (the step itself is a placeholder)."""
    start_message, done_message = 'Starting transformation', 'Transformation completed'

    logger.info(start_message)

    # transformation code here...

    logger.info(done_message)

In [None]:
# Sample Load Logging
def load():
    """Log the start and the end of the load step (the step itself is a placeholder)."""
    start_message, done_message = 'Starting load', 'Load completed'

    logger.info(start_message)

    # loading code here...

    logger.info(done_message)

In [None]:
# Sample ETL with Logging
def etl_process():
    """Run extract(), transform() and load() in order, logging the outcome.

    The steps run one after another; the first one that raises stops the run.
    The exception is logged at ERROR level together with its traceback and is
    NOT re-raised, so this function returns None whether the run succeeded or
    failed.
    """
    try:
        extract()
        transform()
        load()
        logger.info('ETL process completed successfully')
    except Exception:
        # logger.exception logs at ERROR level and attaches the traceback of
        # the exception currently being handled.
        logger.exception('ETL process failed')

In [None]:
# Run the ETL pipeline. etl_process() catches and logs any failure instead of
# raising, so check the logger's output to see whether the run succeeded.
etl_process() 

### Avoiding Single Points of Failure

In [None]:
# Source 1 is the primary source
def extract_from_source1():
    """Placeholder for extraction from the primary source; does nothing yet."""
    # Your extraction code here...
    return None

# Source 2 is the redundant source
def extract_from_source2():
    """Placeholder for extraction from the redundant source; does nothing yet."""
    # Your extraction code here...
    return None

In [None]:
def extract():
    """Extract from Source 1, falling back to Source 2 if Source 1 fails.

    Source 1 is tried first. If it raises, the failure is logged with its
    traceback and Source 2 is tried instead.

    Returns:
        Whatever the source function that succeeded returned.

    Raises:
        Exception: the error raised by Source 2 when both sources fail. It is
            logged and then re-raised unchanged.
    """
    try:
        logger.info('Starting extraction from Source 1')
        extracted = extract_from_source1()
        logger.info('Extraction from Source 1 completed')
        # Hand the result back so the caller actually receives the data.
        return extracted

    except Exception:
        logger.exception('Failed to extract from Source 1')

        # The fallback stays inside this handler on purpose: if Source 2 fails
        # as well, its exception keeps the Source 1 error as __context__, so
        # the traceback shows both failures.
        try:
            logger.info('Starting extraction from Source 2')
            extracted = extract_from_source2()
            logger.info('Extraction from Source 2 completed')
            return extracted

        except Exception:
            logger.exception('Failed to extract from Source 2')
            raise