In [2]:
# json_to_psql.ipynb: check datatype of TIMEID
# implement appropriate input validation while updating dataframe in 2.2 (2)
# implement decorators? for get_zipcode(), get_month() and get_year()

In [3]:
import os
from datetime import date
from typing import Union

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import count, sum, col, when

In [4]:
# Create (or reuse) the SparkSession that every query in this notebook runs on.
spark = SparkSession.builder \
    .appName('Query MySQL database: creditcard_capstone') \
    .getOrCreate()

# Eager evaluation lets a DataFrame render its rows when it is displayed in the
# notebook, without an explicit .show() call.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # show DataFrame rows on display (row count is still capped by Spark's eagerEval settings)

In [5]:
# JDBC connection settings for the local MySQL `creditcard_capstone` database.
mysql_database_name = 'creditcard_capstone'
# NOTE(review): mysql_driver is defined but not passed to spark.read.jdbc in the
# cells visible here; confirm whether it is still needed.
mysql_driver = 'com.mysql.jdbc.Driver'
mysql_url = f'jdbc:mysql://localhost:3306/{mysql_database_name}'
# Credentials are read from the environment when MYSQL_USER / MYSQL_PASSWORD are
# set, so real secrets need not be saved in the notebook; the fallbacks keep
# the original local-development defaults.
mysql_properties = {
    'user': os.environ.get('MYSQL_USER', 'root'),
    'password': os.environ.get('MYSQL_PASSWORD', 'password'),
}

In [8]:
# Load the three source tables from MySQL over JDBC, one DataFrame per table
# (customer, credit card transactions, branch - in that order).
customer_df, credit_df, branch_df = (
    spark.read.jdbc(url=mysql_url, table=table_name, properties=mysql_properties)
    for table_name in ('cdw_sapp_customer', 'cdw_sapp_credit_card', 'cdw_sapp_branch')
)

---

## 2.1 Transaction Details Module

1)    Used to display the transactions made by customers living in a given zip code for a given month and year. Order by day in descending order.

In [9]:
def get_zipcode() -> str:
    """
    Keep asking for a zipcode until the user supplies exactly five digits.

    Returns:
        str: The accepted zipcode, exactly as typed by the user.
    """
    def is_five_digits(candidate: str) -> bool:
        """Report whether ``candidate`` consists of exactly five digit characters."""
        return len(candidate) == 5 and candidate.isdigit()

    entered = input("Zipcode: ")
    while not is_five_digits(entered):
        # Explain the expected format, then ask again.
        print("Invalid zipcode. Please enter a 5-digit numeric value.")
        entered = input("Zipcode: ")

    return entered


In [10]:
def get_month() -> int:
    """
    Prompts the user until a valid month is entered.

    Both non-numeric text and numbers outside 1-12 are rejected with the same
    guidance message, after which the prompt is shown again.

    Returns:
        int: The valid month entered by the user (numeric value between 1 and 12).
    """
    error_message = "Invalid month. Please enter a numeric value between 1 and 12."

    while True:
        raw_month = input("Month: ")
        try:
            month = int(raw_month)
        except ValueError:
            # int() could not parse the text: show the month guidance rather
            # than Python's raw "invalid literal for int()" message.
            print(error_message)
            continue

        if 1 <= month <= 12:
            return month
        print(error_message)


In [11]:
def get_year() -> int:
    """
    Prompts the user until a valid 4-digit year is entered.

    Both non-numeric text and numbers that are not exactly four digits are
    rejected with the same guidance message, after which the prompt is shown again.

    Returns:
        int: The valid year entered by the user (4-digit numeric value).
    """
    error_message = "Invalid year. Please enter a 4-digit numeric value."

    while True:
        raw_year = input("Year: ")
        try:
            year = int(raw_year)
        except ValueError:
            # int() could not parse the text: show the year guidance rather
            # than Python's raw "invalid literal for int()" message.
            print(error_message)
            continue

        # A non-negative integer has exactly four digits iff it lies in 1000..9999.
        if 1000 <= year <= 9999:
            return year
        print(error_message)


In [12]:
def transactions_by_zip_month_year() -> None:
    """
    Displays transactions made by customers living in a given zip code for a given month and year,
    ordered by day in descending order.

    The user is prompted for the zipcode, the month and the year, in that order.
    The result is printed with ``show()``; nothing is returned.
    """
    zipcode = get_zipcode()
    month = get_month()
    year = get_year()

    filters = (customer_df.CUST_ZIP == zipcode) & \
              (credit_df.MONTH == month) & \
              (credit_df.YEAR == year)

    # The CUST_ZIP filter discards transactions that found no customer, so the
    # left join effectively behaves like an inner join here.
    # NOTE(review): the descending sort assumes DAY is stored as a number; cast
    # it first if the column turns out to be a string.
    result_df = credit_df.join(customer_df, credit_df.CUST_SSN == customer_df.SSN, 'left') \
                         .where(filters) \
                         .orderBy(credit_df.DAY.desc())

    result_df.show()

---

2)    Used to display the number and total values of transactions for a given type.

In [13]:
def distinct_options(df: DataFrame, column_name: str) -> list[str]:
    """
    Collect the distinct values found in one column of a DataFrame.

    Args:
        df (DataFrame): The DataFrame to read from.
        column_name (str): The column whose distinct values are wanted.

    Returns:
        list[str]: The distinct values of the column, gathered on the driver.

    """
    unique_rows = df.select(column_name).distinct().collect()
    # Each collected row wraps the single selected column; unwrap it into a flat list.
    return [value for row in unique_rows for value in row]


In [14]:
def get_valid_input(df: DataFrame, column_name: str) -> str:
    """
    Prompts the user to enter a valid input from the distinct options available in the specified column of a DataFrame.

    The column's distinct values are converted to text so that non-string
    columns can be listed in the prompt and matched against what the user
    types. NULL values are not offered as options.

    Args:
        df (DataFrame): The DataFrame containing the data.
        column_name (str): The name of the column to get valid input from.

    Returns:
        str: The valid input entered by the user.

    Raises:
        ValueError: If the column has no non-null values to choose from.
    """
    # input() always yields text, so compare against the text form of each option.
    valid_input_list = [
        str(option)
        for option in distinct_options(df=df, column_name=column_name)
        if option is not None
    ]
    if not valid_input_list:
        # Without any option the loop below could never terminate.
        raise ValueError(f"No values available to choose from in column {column_name}.")

    valid_options = ", ".join(valid_input_list)

    while True:
        input_value = input(f"{column_name} ({valid_options}): ")
        if input_value in valid_input_list:
            break
        print(f"Invalid {column_name}. Please enter a valid input from the given options: {valid_options}.")

    return input_value


In [15]:
def transaction_total_and_no_by_type() -> None:
    """
    Show how many transactions of a user-chosen type exist and what they add up to.

    The user picks one of the transaction types present in ``credit_df``; the
    count and the summed value of the matching transactions are then printed.

    Returns:
        None

    """
    chosen_type = get_valid_input(credit_df, column_name='TRANSACTION_TYPE')

    matching_rows = credit_df.filter(credit_df.TRANSACTION_TYPE == chosen_type)

    # `count` and `sum` are the pyspark.sql.functions aggregates imported by this notebook.
    transaction_count = count('TRANSACTION_ID').alias('Total no of transactions')
    transaction_total = sum('TRANSACTION_VALUE').alias('Sum of Transaction Values')

    matching_rows.select(transaction_count, transaction_total).show()

---

3)    Used to display the total number and total values of transactions for branches in a given state.

In [16]:
def transaction_total_and_no_by_branch_on_state ()-> None:
    """
    Displays the number and total value of transactions per branch for a state chosen by the user.

    The user picks one of the states present in ``branch_df``. Transactions are
    joined to their branch, restricted to branches in that state, and then
    counted and summed per branch code. The result is printed with ``show()``.

    Returns:
        None

    """

    state = get_valid_input(branch_df, column_name='BRANCH_STATE')

    # Filter on the state the user selected (previously hard-coded to 'MN').
    result_df = credit_df.join(branch_df, credit_df.BRANCH_CODE == branch_df.BRANCH_CODE, 'left') \
                         .where(branch_df.BRANCH_STATE == state) \
                         .groupBy(branch_df.BRANCH_CODE) \
                         .agg(count(credit_df.TRANSACTION_ID).alias('Transaction_Count'), \
                              sum(credit_df.TRANSACTION_VALUE).alias('Transaction_Sum'))

    result_df.show()

---

## 2.2 Customer Details Module

1) Used to check the existing account details of a customer.


In [17]:
def choose_one_from(available_options: list, msg: str = "Available Options") -> str:
    """
    Ask the user to pick one entry from a list, repeating until the answer is in the list.

    Args:
        available_options (list): The entries the user may choose from.
        msg (str, optional): Text shown in front of the option list in the prompt. Defaults to "Available Options".

    Returns:
        str: The entry the user selected.

    Raises:
        ValueError: If no options were supplied.
    """
    if not available_options:
        raise ValueError("No options provided.")

    # Build the texts once; they do not change between attempts.
    listing = ', '.join(available_options)
    prompt = f"{msg} ({listing}): "
    rejection = f"Invalid option. Please enter a valid option from the given list: {listing}"

    answer = input(prompt)
    while answer not in available_options:
        print(rejection)
        answer = input(prompt)

    return answer


In [18]:
def show_account_details() -> None:
    """
    Look up customers by one column/value pair chosen interactively and print them.

    The user first selects a column of ``customer_df``, then one of the values
    present in that column; the matching rows are printed with ``show()``.

    Returns:
        None
    """
    column_choice = choose_one_from(customer_df.columns)
    wanted_value = get_valid_input(df=customer_df, column_name=column_choice)

    matches_choice = customer_df[column_choice] == wanted_value
    customer_df.filter(matches_choice).show()


2) Used to modify the existing account details of a customer.

In [19]:
def update_dataframe() -> None:
    """
    Update values in the module-level ``customer_df`` based on a filter condition.

    The user chooses a filter column and value, the column to update, and the
    new value. Rows where the filter column equals the filter value get the new
    value in the updated column; all other rows keep their current value.
    ``customer_df`` is then rebound to the resulting DataFrame. Nothing in this
    function writes the change back to the database.

    Returns:
        None.

    """
    # Without this declaration the assignment at the end makes customer_df a
    # local name, and the first read below raises UnboundLocalError.
    global customer_df

    valid_column_names = customer_df.columns

    filter_column = choose_one_from(valid_column_names, msg = 'Select Column to filter with')
    filter_value = get_valid_input(df = customer_df, column_name = filter_column)

    column_to_update = choose_one_from(valid_column_names, msg = 'Column to update')
    # NOTE(review): new_value is always a string; confirm the resulting column
    # type is acceptable when a non-text column is updated.
    new_value = input("Enter new value:") # can implement appropriate input validation here

    customer_df = customer_df.withColumn(
                                        column_to_update,
                                        when(col(filter_column) == filter_value, new_value).otherwise(col(column_to_update)))
    

3) Used to generate a monthly bill for a credit card number for a given month and year.

In [20]:
def get_valid_16_digit_input() -> str:
    """
    Keep asking until the user types a value made of exactly sixteen digits.

    Returns:
        str: The accepted 16-digit value, as typed by the user.
    """
    prompt = "Enter a 16-digit number: "

    candidate = input(prompt)
    while not (candidate.isdigit() and len(candidate) == 16):
        print("Invalid input. Please enter a 16-digit number.")
        candidate = input(prompt)

    return candidate


In [21]:
def generate_monthly_bill() -> DataFrame:
    """
    Build the bill for one credit card and one month from interactive input.

    The user is asked for the card number, then the year, then the month.

    Returns:
        DataFrame: The transactions of that card in that month and year, limited to the bill columns
        (transaction id, day, month, year, type and value).
    """
    card_number = get_valid_16_digit_input()
    bill_year = get_year()
    bill_month = get_month()

    bill_columns = ['TRANSACTION_ID', 'DAY', 'MONTH', 'YEAR', 'TRANSACTION_TYPE', 'TRANSACTION_VALUE']

    is_requested_card = credit_df['CREDIT_CARD_NO'] == card_number
    is_requested_month = credit_df['MONTH'] == bill_month
    is_requested_year = credit_df['YEAR'] == bill_year

    billed_transactions = credit_df.filter(is_requested_card & is_requested_month & is_requested_year)
    return billed_transactions.select(bill_columns)


In [22]:
def print_transaction_summary(df: DataFrame) -> None:
    """
    Prints a summary of the transactions in the provided DataFrame.

    The summary states how many transactions there are (count of
    TRANSACTION_ID) and their total value (sum of TRANSACTION_VALUE). When
    there are no transactions the total is reported as 0.

    Args:
        df (DataFrame): The DataFrame containing the transactions.

    Returns:
        None
    """
    summary_row = df.agg({'TRANSACTION_VALUE': 'SUM', 'TRANSACTION_ID': 'COUNT'}).first()
    transaction_count = summary_row['count(TRANSACTION_ID)']
    total = summary_row['sum(TRANSACTION_VALUE)']
    if total is None:
        # SUM over zero rows yields NULL; report 0 instead of printing "None".
        total = 0

    print(f'Summary: There were {transaction_count} transactions totaling $ {total}')

In [23]:
def display_monthly_bill() -> None:
    """
    Displays the monthly bill and transaction summary.

    The bill is built by ``generate_monthly_bill`` (which prompts the user),
    its rows are printed, and a one-line summary follows.

    Returns:
        None
    """
    result_df = generate_monthly_bill()
    # show() prints the rows whether or not eager evaluation is enabled, and
    # matches how the other query functions in this notebook display results.
    result_df.show()
    print_transaction_summary(result_df)

4) Used to display the transactions made by a customer between two dates. Order by year, month, and day in descending order.

In [24]:
def get_date_input() -> tuple:
    """
    Prompts the user until a valid calendar date in the format YYYY-MM-DD is entered.

    Input that is not three dash-separated integers, and input that does not
    name a real calendar date (for example 2018-02-30), is rejected with a
    message and the prompt is shown again.

    Returns:
        tuple: A tuple containing the year, month, and day extracted from the input date.
    """
    while True:
        date_str = input("Enter a date (YYYY-MM-DD): ")
        try:
            year, month, day = map(int, date_str.split('-'))
        except ValueError:
            # Wrong number of parts or a non-numeric part.
            print("Invalid date format. Please enter a date in the format YYYY-MM-DD.")
            continue

        try:
            # Constructing a date raises ValueError for impossible dates.
            date(year, month, day)
        except ValueError:
            print("Invalid date. Please enter an existing calendar date in the format YYYY-MM-DD.")
            continue

        return year, month, day


In [25]:
def transaction_between_dates() -> None:
    """
    Displays one card's transactions between two specified dates and prints a transaction summary.

    The user is prompted for the start date, the end date and the 16-digit
    card number, in that order. Both dates are inclusive; if they are entered
    in reverse order they are swapped. Results are ordered by year, month and
    day in descending order.

    Returns:
        None
    """
    def as_date_key(year, month, day):
        """Fold year/month/day into one YYYYMMDD number (works for ints and Columns)."""
        return year * 10000 + month * 100 + day

    # The two date prompts are identical, so label which one is being asked for.
    print("Start date")
    start_date = get_date_input()
    print("End date")
    end_date = get_date_input()
    card_number = get_valid_16_digit_input()

    if start_date > end_date:
        start_date, end_date = end_date, start_date

    # Comparing day, month and year separately breaks for ranges that cross a
    # month or year boundary (e.g. Jan 25 - Mar 5 would need DAY between 25 and 5),
    # so compare a single YYYYMMDD key instead.
    transaction_key = as_date_key(credit_df['YEAR'].cast('int'),
                                  credit_df['MONTH'].cast('int'),
                                  credit_df['DAY'].cast('int'))

    in_range = transaction_key.between(as_date_key(*start_date), as_date_key(*end_date))
    is_requested_card = credit_df['CREDIT_CARD_NO'] == card_number

    # Sorting by the key descending is the same as year, month, day descending.
    result_df = credit_df.filter(in_range & is_requested_card) \
                         .orderBy(transaction_key.desc())
    result_df.show()

    print_transaction_summary(result_df)

In [26]:
# Shut down the SparkSession created at the top of the notebook. Run this cell
# last: the query functions above need a live session to execute.
spark.stop()

---