In [1]:
%run /home/jovyan/work/database_operations/spark_db_connection.ipynb import SparkPostgresConnection
%run /home/jovyan/work/database_operations/db_operations.ipynb import DataOperations

In [2]:
import logging
def set_logging():
    """
    Reset the root logger and configure it for bare-message INFO output.

    Any handlers already attached to the root logger are dropped first, then
    ``logging.basicConfig`` installs the default handler using the
    ``'%(message)s'`` format at INFO level.

    Raises:
        Exception: Any error raised during set up is logged and re-raised.
    """
    try:
        root_logger = logging.getLogger()
        # basicConfig does nothing while the root logger still has handlers,
        # so existing ones are removed to make re-running this cell effective.
        if root_logger.hasHandlers():
            root_logger.handlers.clear()
        logging.basicConfig(format='%(message)s', level=logging.INFO)
    except Exception as err:
        logging.error(f"Error in logging set up: {err}")
        raise err
# Apply the logging configuration as soon as this cell is executed.
set_logging()

In [3]:
from typing import List
from pyspark.sql import SparkSession

class DataExtraction:
    """
    Loads one data source and stores it in the database through ``DataOperations``.
    """

    def __init__(self, spark: SparkSession, path: str, table_name: str) -> None:
        """
        Args:
            spark: Active SparkSession
            path: Data path handed to ``DataOperations.ingest_data``
            table_name: Table name handed to ``DataOperations.save_data``
        """
        self.spark = spark
        self.path = path
        self.table_name = table_name
        self.database_operations = DataOperations(self.spark)

    def save_to_database(self) -> None:
        """
        Ingests the data found at ``self.path`` and saves it as ``self.table_name``.

        Raises:
            Exception: Any failure while ingesting or saving is logged and re-raised.
        """
        try:
            operations = self.database_operations
            operations.save_data(operations.ingest_data(self.path), self.table_name)
        except Exception as err:
            logging.error(f"Error in data extraction: {err}")
            raise err
    

In [5]:
class ExtractionParamaters:
    """
    Configuration holder: source file paths and the table names used when saving.
    """
    # CSV files to extract.
    # NOTE(review): entries look positionally paired with list_of_names - confirm.
    list_of_paths: List[str] = [
        "/home/jovyan/work/dataset/goalscorers.csv",
        "/home/jovyan/work/dataset/results.csv",
        "/home/jovyan/work/dataset/shootouts.csv",
    ]
    # Table names for saving into the database.
    list_of_names: List[str] = [
        "scorers",
        "results",
        "shootouts",
    ]


In [6]:
from typing import Type
def extract_data(spark: SparkSession, config: Type[ExtractionParamaters] = ExtractionParamaters) -> None:
    """
    Extracts every configured data source and saves each one into the database.

    ``DataExtraction`` is declared to take a single path and a single table
    name, so ``config.list_of_paths`` and ``config.list_of_names`` are paired
    by position and processed one pair at a time.

    Args:
        spark: Active SparkSession
        config: Class for defining parameters; must expose ``list_of_paths``
            and ``list_of_names`` with the same number of entries

    Raises:
        ValueError: If the two configured lists differ in length.
        Exception: Any error raised during extraction is logged and re-raised.
    """
    try:
        paths = config.list_of_paths
        table_names = config.list_of_names
        # zip() would silently drop the unmatched tail, so reject mismatched lists up front.
        if len(paths) != len(table_names):
            raise ValueError(
                f"list_of_paths has {len(paths)} entries but list_of_names has {len(table_names)}"
            )
        for path, table_name in zip(paths, table_names):
            data_extraction = DataExtraction(spark, path, table_name)
            data_extraction.save_to_database()
        logging.info("Successfully saved the data into database, extracting process completed.")
    except Exception as e:
        logging.error(f"Error in data extraction: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise