<a href="https://colab.research.google.com/github/PRISM-SING-HEALTH/2025-NCP-Projects/blob/main/NCP_Variant_DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

This section contains the Python imports required by the main program (the third-party packages pandas, NumPy and PyYAML).



In [None]:
import pandas as pd
import numpy as np
import yaml

This section contains Python imports that are used only for unit testing and are not required by the main program.

In [None]:
import unittest
from unittest.mock import patch, MagicMock
import tempfile
import os

# functions.py

This section contains the Python functions used in the main program, along with their documentation.

In [None]:
def load_config(config_file):
    """
    Loads configuration from a YAML file and validates required keys.

    Args:
        config_file (str): Path to the YAML configuration file.

    Returns:
        dict: Configuration settings.

    Raises:
        FileNotFoundError: If the YAML configuration file is not found.
        yaml.YAMLError: If the YAML file cannot be parsed.
        ValueError: If the file does not hold a top-level mapping (for
            example, it is empty) or required keys are missing.
    """
    required_keys = ['base_path', 'local_path', 'files']

    try:
        # Explicit encoding so the file is read the same way on every platform
        with open(config_file, 'r', encoding='utf-8') as file:
            config = yaml.safe_load(file)

        # safe_load yields None for an empty document and may yield a list or
        # scalar; only a mapping can be checked for the required keys below.
        if not isinstance(config, dict):
            raise ValueError(
                "Configuration file must contain a YAML mapping at the top level, "
                f"but found {type(config).__name__}."
            )

        # Validate required keys
        missing_keys = [key for key in required_keys if key not in config]
        if missing_keys:
            raise ValueError(f"Missing required configuration keys: {', '.join(missing_keys)}")

        print("Configuration loaded successfully.")
        return config

    except FileNotFoundError:
        print(f"Error: The configuration file '{config_file}' was not found.")
        raise
    except yaml.YAMLError as e:
        print(f"Error: Failed to parse the YAML configuration file. Details: {e}")
        raise
    except ValueError as e:
        print(f"Error: {e}")
        raise

In [None]:
def standardise_dataframe(df, columns):
    """
    Reshapes a dataframe so that it carries exactly the requested columns.

    Requested columns that the input lacks are created and filled with NaN;
    input columns that are not requested are left out of the result. The
    result follows the order given in `columns`.

    Args:
      df (pd.DataFrame): Input dataframe.
      columns (list): Standardised column names, in the desired order.

    Returns:
      pd.DataFrame: Dataframe whose columns are exactly `columns`.
    """
    reindex_options = {'columns': columns, 'fill_value': np.nan}
    return df.reindex(**reindex_options)

In [None]:
def query(df):
    """
    Queries a dataframe interactively using a case-insensitive substring search.

    The user is prompted first for a category (column name, matched without
    regard to case) and then for an item. The item is matched as literal text,
    so characters such as '.', '+', '*', '(' and ')' carry no special meaning.
    Cells with missing values never match.

    Args:
      df (pd.DataFrame): Input dataframe.

    Returns:
      pd.DataFrame or None: Rows whose value in the chosen category contains
      the item. None is returned when the category is invalid, the item is
      empty, or no row matches.
    """
    print('\n### Query Dataframe ###')
    # str() keeps the listing working when a column label is not a string
    print(f"Available categories: {', '.join(str(col) for col in df.columns)}\n")

    # User input category
    category = input('Enter category: ').strip()

    # Map lower-cased labels to the real labels; the first column wins when
    # two labels differ only by case.
    columns_by_lower_name = {}
    for col in df.columns:
        columns_by_lower_name.setdefault(str(col).lower(), col)

    category_key = category.lower()
    if category_key not in columns_by_lower_name:
        print(f"Error: Invalid category '{category}'. Please choose from available categories.")
        return None
    category_actual = columns_by_lower_name[category_key]

    # User input item (case insensitive and substring search)
    item = input(f"Enter item in category '{category_actual}': ").strip()

    # Handle empty item input
    if not item:
        print("Error: item cannot be empty.")
        return None

    column = df[category_actual]
    # Casting to str turns missing values into text such as 'nan', so they are
    # masked out explicitly to stop them matching searches like 'na'.
    has_value = column.notna()
    # regex=False: the user's text is searched for literally, not as a pattern
    contains_item = column.astype(str).str.contains(item, case=False, na=False, regex=False)
    query_result = df[has_value & contains_item]

    if query_result.empty:
        print(f"No results found for item containing '{item}' in category '{category_actual}'.")
        return None

    print(f"Query Results for item ({len(query_result)} record(s) found):")
    return query_result

In [None]:
def export_to_excel(df, output_path):
    """
    Writes a DataFrame to an Excel file, with missing values shown as 'NA'.

    The row index is not written. Any failure is reported on the console and
    then re-raised to the caller.

    Args:
        df (pd.DataFrame): The DataFrame to export.
        output_path (str): Path where the Excel file will be saved.

    Returns:
        None
    """
    try:
        # Replace missing values first so blank cells read as 'NA' in the sheet
        df.fillna('NA').to_excel(output_path, index=False)
    except Exception as exc:
        print(f"An error occurred while exporting to Excel: {exc}")
        raise
    else:
        print(f"Data successfully exported to {output_path}")

# test_cases.py

This section contains test cases for each function, using Python's built-in 'unittest' framework. The test cases ensure that each function works as intended for the majority of use cases.

In [None]:
class TestLoadConfig(unittest.TestCase):
    """Checks load_config against valid, incomplete, malformed and missing files."""

    @staticmethod
    def _write_temp_yaml(payload):
        """
        Writes raw bytes to a new temporary '.yaml' file that outlives closing.

        Returns the closed file object so that its `.name` path can be used.
        """
        handle = tempfile.NamedTemporaryFile(delete=False, suffix='.yaml')
        handle.write(payload)
        handle.close()
        return handle

    def setUp(self):
        """
        Build the sample configurations and write the temporary YAML fixtures.
        """
        self.valid_config = {
            'base_path': '/data/',
            'local_path': '/local/',
            'files': {'file1': 'file1.xlsx', 'file2': 'file2.xlsx'},
        }

        # Deliberately lacks 'local_path'
        self.missing_keys_config = {
            'base_path': '/data/',
            'files': {'file1': 'file1.xlsx'},
        }

        self.valid_config_file = self._write_temp_yaml(
            yaml.dump(self.valid_config).encode()
        )
        self.missing_keys_file = self._write_temp_yaml(
            yaml.dump(self.missing_keys_config).encode()
        )
        # Unterminated flow sequence/mapping, so the parser must reject it
        self.invalid_yaml_file = self._write_temp_yaml(
            b"{invalid_yaml: [missing, closing, brace"
        )

        self.nonexistent_file = '/nonexistent/config.yaml'

    def tearDown(self):
        """
        Delete the temporary fixture files created in setUp.
        """
        fixtures = (
            self.valid_config_file,
            self.missing_keys_file,
            self.invalid_yaml_file,
        )
        for handle in fixtures:
            os.remove(handle.name)

    def test_valid_config(self):
        """
        A well-formed configuration file is returned as the original dict.
        """
        print("\nTest Case: Valid config file loaded.")
        loaded = load_config(self.valid_config_file.name)
        self.assertEqual(loaded, self.valid_config)

    def test_missing_keys(self):
        """
        A configuration lacking a required key raises a ValueError.
        """
        print("\nTest Case: Missing keys.")
        with self.assertRaisesRegex(ValueError, "Missing required configuration keys"):
            load_config(self.missing_keys_file.name)

    def test_invalid_yaml(self):
        """
        Malformed YAML raises a yaml.YAMLError.
        """
        print("\nTest Case: Invalid YAML file.")
        malformed_path = self.invalid_yaml_file.name
        with self.assertRaises(yaml.YAMLError):
            load_config(malformed_path)

    def test_file_not_found(self):
        """
        A path that does not exist raises a FileNotFoundError.
        """
        print("\nTest Case: File not found.")
        with self.assertRaises(FileNotFoundError):
            load_config(self.nonexistent_file)

In [None]:
class TestStandardiseDataFrame(unittest.TestCase):
    """Checks how standardise_dataframe adds, drops and preserves columns."""

    def setUp(self):
        # Three-column sample input plus a wider target layout
        self.input_df = pd.DataFrame({
            "A": [1, 2, 3],
            "B": [4, 5, 6],
            "C": [7, 8, 9],
        })

        # "D" and "E" are absent from the sample input
        self.standard_columns = ["A", "B", "C", "D", "E"]

    def test_standardise_dataframe_adds_missing_columns(self):
        """
        Columns absent from the input appear in the output, filled with NaN.
        """
        print("\nTest Case: Missing columns added with NaN values.")
        standardised = standardise_dataframe(self.input_df, self.standard_columns)

        # Output layout must equal the requested layout
        self.assertListEqual(list(standardised.columns), self.standard_columns)

        # Each newly created column holds only missing values
        for added_column in ("D", "E"):
            self.assertTrue(standardised[added_column].isnull().all())

    def test_standardise_dataframe_removes_extra_columns(self):
        """
        Input columns that are not requested are dropped from the output.
        """
        print("\nTest Case: Extra columns in input dataframe are removed.")
        wanted = ["A", "B"]
        standardised = standardise_dataframe(self.input_df, ["A", "B"])

        # "C" was not requested, so only "A" and "B" may remain
        self.assertListEqual(list(standardised.columns), wanted)

    def test_standardise_dataframe_keeps_original_data(self):
        """
        Values in columns carried over from the input are unchanged.
        """
        print("\nTest Case: Original data in columns is preserved.")
        standardised = standardise_dataframe(self.input_df, self.standard_columns)

        # Compare every carried-over column against its source
        for kept_column in ("A", "B", "C"):
            pd.testing.assert_series_equal(
                standardised[kept_column], self.input_df[kept_column]
            )

    def test_standardise_dataframe_empty_dataframe(self):
        """
        An empty input still yields every requested column.
        """
        print("\nTest Case: Empty input dataframe.")
        standardised = standardise_dataframe(pd.DataFrame(), self.standard_columns)

        # All requested columns exist and contain nothing but missing values
        self.assertListEqual(list(standardised.columns), self.standard_columns)
        self.assertTrue(standardised.isnull().all().all())

    def test_standardise_dataframe_no_columns(self):
        """
        Requesting no columns yields a dataframe without columns.
        """
        print("\nTest Case: No column specified.")
        standardised = standardise_dataframe(self.input_df, [])

        # Nothing was requested, so nothing may remain
        self.assertTrue(standardised.empty)
        self.assertListEqual(list(standardised.columns), [])

In [None]:
class TestQueryFunction(unittest.TestCase):
    """Drives query() with scripted answers in place of keyboard input."""

    def setUp(self):
        # Two text columns, with values chosen so substring matches overlap
        self.df = pd.DataFrame({
            'CategoryA': ['Apple', 'Banana', 'Cherry', 'Apple Pie'],
            'CategoryB': ['Dog', 'Cat', 'Horse', 'Fish'],
        })

    def _run_query(self, category_answer, item_answer):
        """Runs query() while input() replies with the two given answers in order."""
        scripted_answers = [category_answer, item_answer]
        with patch('builtins.input', side_effect=scripted_answers):
            return query(self.df)

    def test_valid_query_case_insensitive(self):
        """
        A valid category and item return every row containing the item.
        """
        print("\nTest Case: Valid query with case-insensitive comparison")
        outcome = self._run_query('CategoryA', 'Apple')
        matching_rows = self.df['CategoryA'].str.contains('Apple', case=False, na=False)
        pd.testing.assert_frame_equal(outcome, self.df[matching_rows])

    def test_valid_query_substring_search(self):
        """
        A lower-case item still matches a capitalised cell value.
        """
        print("\nTest Case: Valid query with substring matching")
        outcome = self._run_query('CategoryB', 'cat')
        matching_rows = self.df['CategoryB'].str.contains('cat', case=False, na=False)
        pd.testing.assert_frame_equal(outcome, self.df[matching_rows])

    def test_invalid_category(self):
        """
        An unknown category yields None.
        """
        print("\nTest Case: Invalid category")
        self.assertIsNone(self._run_query('InvalidCategory', 'Apple'))

    def test_no_results_found(self):
        """
        An item that appears in no row yields None.
        """
        print("\nTest Case: No results found.")
        self.assertIsNone(self._run_query('CategoryA', 'NonexistentItem'))

    def test_empty_item(self):
        """
        An empty item yields None.
        """
        print("\nTest Case: Empty item.")
        self.assertIsNone(self._run_query('CategoryA', ''))

In [None]:
class TestExportToExcel(unittest.TestCase):
    """Checks export_to_excel with DataFrame.to_excel replaced by a mock."""

    def setUp(self):
        # Sample data with a missing value in each column
        self.sample_df = pd.DataFrame({
            "A": [1, 2, None],
            "B": ["Test", None, "Data"],
        })
        self.output_path = "test_output.xlsx"

    def tearDown(self):
        # The writer is mocked, but remove the file in case one was created
        if os.path.exists(self.output_path):
            os.remove(self.output_path)

    def test_export_to_excel_success(self):
        """
        A successful export calls to_excel once with the path and index=False.
        """
        print("\nTest Case: Export dataframe to excel file.")
        with patch("pandas.DataFrame.to_excel") as mock_to_excel:
            export_to_excel(self.sample_df, self.output_path)

        # Exactly one write, using the expected arguments
        mock_to_excel.assert_called_once_with(self.output_path, index=False)

    def test_export_to_excel_permission_error(self):
        """
        A PermissionError raised by the writer propagates to the caller.
        """
        print("\nTest Case: Export with PermissionError.")
        failing_writer = patch(
            "pandas.DataFrame.to_excel",
            side_effect=PermissionError("Permission denied"),
        )
        with failing_writer, self.assertRaises(PermissionError):
            export_to_excel(self.sample_df, self.output_path)

In [None]:
if __name__ == '__main__':
    # argv is passed explicitly so unittest does not read sys.argv, and
    # exit=False stops unittest.main from calling sys.exit() after the run.
    unittest.main(
        argv=['first-arg-is-ignored'],
        exit=False,
    )

# main.py

This section contains the main program prototype, which will later be transferred over to VSCode once everything is working as intended. The first part loads the configuration from the YAML file and defines the standard columns for the program.

In [None]:
# Read and validate the settings stored in the YAML configuration file
config = load_config('/content/drive/MyDrive/Variant_DB/Mock_Local_Drive/config.yaml')

# Pull out the path prefixes and the mapping of excel file names
base_path, local_path, files = (
    config[setting] for setting in ('base_path', 'local_path', 'files')
)

In [None]:
# Column layout, in output order, that every source dataframe is reshaped to
standard_columns = [
    'MRN', 'Patient Name', 'Phenotype', 'Solved Status',
    'Gene', 'Transcript', 'Variant',
    'HGVSg', 'HGVSc', 'HGVSp',
]

This part imports the data into 5 separate, unsorted dataframes.

In [None]:
# Importing data into dataframes
# Each workbook is read from base_path using the file names in the config.
# A failed read is reported and then re-raised, because every later step needs
# all five dataframes; carrying on would only fail later with a NameError.
print('Importing data...')

try:
  # Lab Cases
  lab_cases_df = pd.read_excel(
      f"{base_path}{files['lab_cases']}",
      sheet_name = 'Sheet1',
      header = 0,
      usecols = 'F, AF:AI, AL'
  )

  # ATM Summary (read without a header row; the sheet is not a plain table)
  atm_summary_df = pd.read_excel(
      f"{base_path}{files['atm_summary']}",
      sheet_name='SUMMARY',
      header = None,
      usecols = 'A:B'
  )

  # Additional filtering due to unconventional formatting
  print('Performing additional filtering...')
  # Keep rows 9-15 only; reset_index returns a new frame, which avoids
  # modifying a slice of atm_summary_df in place.
  atm_summary_df_filtered = atm_summary_df.iloc[9:16].reset_index(drop=True)

  # Convert to a structured DataFrame
  print('Transforming into structured dataframe...')
  # Column 0 holds the field names: make it the index, then transpose so the
  # field names become the column headers of a one-row dataframe.
  atm_structured_df = atm_summary_df_filtered.set_index(0).T.reset_index(drop=True)

  # Invitae Summary
  invitae_summary_df = pd.read_excel(
      f"{base_path}{files['invitae_summary']}",
      sheet_name = 'Invitae list header',
      header = 0,
      usecols = 'E:F, M:T'
  )

  # Clinical Summary
  clinical_summary_df = pd.read_excel(
      f"{base_path}{files['clinical_summary']}",
      sheet_name = '11 Dec',
      header = 0,
      usecols = 'D:E, K')

  # Research Summary (column headers are on the third row of the sheet)
  research_summary_df = pd.read_excel(
      f"{base_path}{files['research_summary']}",
      sheet_name = 'Overall List',
      header = 2,
      usecols = 'B, W:X, AF, AS:AT, AV:AW')

except FileNotFoundError as e:
  print(f"File not found: {e}")
  raise
except ValueError as e:
  print(f"Error reading sheet or invalid data format: {e}")
  raise
else:
  # Only report success when every workbook was actually read
  print('Data import complete.')

This part verifies the import with print statements. Comment out as needed.

In [None]:
# Print dataframes
#print('Printing dataframes...')

#print('\nLab Cases')
#print(lab_cases_df)

#print('\nATM Summary')
#print(atm_structured_df)

#print('\nInvitae Summary')
#print(invitae_summary_df)

#print('\nClinical Summary')
#print(clinical_summary_df)

#print('\nResearch Summary')
#print(research_summary_df)

This part uses key-value pairs to rename the columns from the imported dataframes to the standardised columns.

In [None]:
# Standardise columns across dataframes
# Each mapping below goes from a source workbook's column header to the
# project-wide column name. Headers not listed in a mapping keep their name.
print('Standardising columns across dataframes...')

# Lab Cases
lab_cases_cols = {
    'Number variants detected'    :'Var Count',
    'Variant_1_gene'              :'Gene',
    'Variant_1_HGVSg'             :'HGVSg',
    'Variant_1_HGVSc'             :'HGVSc',
    'Variant_1_HGVSp'             :'HGVSp',
    'Variant_1_zygosity'          :'Zygosity',
    'Variant_1_inheritance'       :'Inheritance',
    'Variant_1_Validation_Status' :'Solved Status'
}

lab_cases_df.rename(columns = lab_cases_cols, inplace = True)

# ATM Summary
# The genomic HGVS target must be spelled 'HGVSg' like the other sources;
# a misspelled target would not line up with the shared column name.
atm_structured_df_cols = {
    'HGVS_Genomic_GRCh38/hg38'            :'HGVSg',
    'HGVS_MANE Select_Transcript_RefSeq'  :'Transcript',
    'HGVS_MANE Select_cDNA'               :'HGVSc',
    'HGVS_MANE Select_protein'            :'HGVSp',
    'HUGO gene symbol'                    :'Gene',
}

atm_structured_df.rename(columns = atm_structured_df_cols, inplace = True)

# Invitae Summary
invitae_summary_df_cols = {
    'Patient ID (MRN)'  :'MRN',
    'Patient Name'      :'Patient Name',
    'Result'            :'Solved Status',
    'Gene'              :'Gene',
    'Transcript'        :'Transcript',
    'Variant'           :'Variant',
    'HGVSc'             :'HGVSc',
    'Protein Change'    :'HGVSp',
}

invitae_summary_df.rename(columns = invitae_summary_df_cols, inplace = True)

# Clinical Summary
clinical_summary_df_cols = {
    'Identification No.'        :'MRN',
    'Medical Prob description'  :'Phenotype',
    'Patient Name'              :'Patient Name'
}

clinical_summary_df.rename(columns = clinical_summary_df_cols, inplace = True)

# Research Summary
research_summary_df_cols = {
    'IC No (MRN)'         :'MRN',
    'Name'                :'Patient Name',
    'Candidate gene (1)'  :'Gene',
    'Transcript (1)'      :'Transcript',
    'cDNA (1)'            :'HGVSc',
    'Protein (1)'         :'HGVSp',
    'AUTO STATUS'         :'Solved Status'
}

research_summary_df.rename(columns = research_summary_df_cols, inplace = True)

This part verifies the renamed columns with print statements. Comment out as needed.

In [None]:
# Print dataframes after renaming
#print('Printing dataframes...')

#print('\nLab Cases')
#print(lab_cases_df)

#print('\nATM Summary')
#print(atm_structured_df)

#print('\nInvitae Summary')
#print(invitae_summary_df)

#print('\nClinical Summary')
#print(clinical_summary_df)

#print('\nResearch Summary')
#print(research_summary_df)

This part uses the ***standardise_dataframe*** function to convert all of the imported, renamed dataframes into the standardised format for easier integration into a single dataframe later on.

In [None]:
print('Standardising all dataframes...')

# Reshape each renamed source dataframe to the shared column layout; the
# results are unpacked in the same order as the sources are listed.
(
    lab_cases_df_standard,
    atm_structured_df_standard,
    invitae_summary_df_standard,
    clinical_summary_df_standard,
    research_summary_df_standard,
) = (
    standardise_dataframe(source_df, standard_columns)
    for source_df in (
        lab_cases_df,
        atm_structured_df,
        invitae_summary_df,
        clinical_summary_df,
        research_summary_df,
    )
)

print('Standardisation complete.')

This part verifies the standardisation with print statements. Comment out as needed.

In [None]:
# Print dataframes to verify
#print('Individual dataframes:')
#print('\nLab Cases:')
#print(lab_cases_df_standard.head())

#print('\nATM:')
#print(atm_structured_df_standard.head())

#print('\nInvitae:')
#print(invitae_summary_df_standard.head())

#print('\nClinical:')
#print(clinical_summary_df_standard.head())

#print('\nResearch:')
#print(research_summary_df_standard.head())

This part combines all of the standardised dataframes into one singular dataframe.

In [None]:
# Stack the five standardised dataframes on top of each other
print('\nCombining dataframes...')
combined_df = pd.concat(
    objs = [
        lab_cases_df_standard,
        atm_structured_df_standard,
        invitae_summary_df_standard,
        clinical_summary_df_standard,
        research_summary_df_standard,
    ],
    ignore_index = True,  # renumber rows 0..n-1 instead of keeping source labels
)

# Report success, then show the combined table
print('All dataframes combined successfully.', 'Combined dataframe:', sep = '\n')
print(combined_df)

This part uses the ***query*** function for the user to initiate a query using the new singular dataframe.

In [None]:
# Query
# Prompts the user for a category and an item. query() returns the matching
# rows, or None when the input is invalid or nothing matches.
query_result = query(combined_df)

This part uses the ***export_to_excel*** function to export the query result into a file kept on the mock local drive.

In [None]:
# Export
output_file = f"{local_path}Results/combined_data.xlsx" # Fix to be dynamic.

# query() returns None when the input was invalid or nothing matched; in that
# case there is nothing to write, so the export is skipped instead of failing.
if query_result is None:
    print('No query results to export.')
else:
    export_to_excel(query_result, output_file)