In [1]:
# Import the pandas library for data and analysis manipulation.
# Assign pandas library to a shorter alias 'pd'.

import pandas as pd

In [3]:
# Define an empty dictionary named 'dtype_dict'
# This dictionary will map certain data attributes to their respective data types.

dtype_dict = {
    "vcf_filter": str, # Data in "vcf_filter" column should be treated as type 'string' (text).
    "vcf_mutect": float, # Data in "vcf_count_mutect" should be treated as floating-point numbers.
    "cosmic_haem": str,  # Data in "cosmic_haem" column should be treated as type 'string' (text).
    "vcf_pindel": float, # Data in "vcf_count_pindel" should be treated as floating-point numbers.
    "vaf": float, # Data in "vaf" should be treated as floating-point numbers.
    "depth": float, # Data in "depth" should be treated as floating-point numbers.
    "gnomAD_AF": float, # Data in "gnomad_AF" should be treated as floating-point numbers. 
    "genes": str, # Data in "gene_name" column should be treated as type 'string' (text).
    "variants": str, # Data in "variant" column should be treated as type 'string' (text).
    "var_type": str, # Data in "var_type" column should be treated as type 'string' (text).
    "virtual_panel": str, # Data in "virtual_panel" column should be treated as type 'string' (text).
}

In [9]:
# Load a CSV file into a pandas DataFrame.
# Specify additional parameters and store the resulting DataFrame in the variable 'data'.

# The 'read_csv' function is used to load the file 'mock_validation_rules.csv'.
data = pd.read_csv(r'C:\Users\PC\Desktop\NGS-Mock-Variant-Analysis\4.data\mock_validation_rules.csv', 
                   # This disables low memory mode and allows pandas to read the entire file into memory, even if it's very large.
                   low_memory=False,  
                   # 'dtype_dict' specifies the data types for specific columns in the CSV file.
                   dtype=dtype_dict) 

In [9]:
# Display the first few rows of the DataFrame using the 'head()' method without argument to show first 5 rows by default.
# This is neccessary to quickly check if the data was loaded correctly.

data.head()

Unnamed: 0,sample_id,variants,genes,cNomen,pNomen,run,vaf,depth,vcf_mutect,vcf_pindel,...,gnomAD_AF,gnomAD_AF_Pct,cosmic_id,cosmic_haem,run_freq,total_freq,length,var_type,technical_status,retain_discard
0,Mock001,KRAS_2_160883814_A/G,KRAS,c.1077A>G,p.Ala1367Val,Run1,0.377,354.0,22.0,26.0,...,0.019014,1.9014,COSM444461,0.0,2,86,0,deletion,Discard,Discard
1,Mock002,KRAS_20_183670202_A/G,KRAS,c.2772A>G,p.Ala228Val,Run1,0.727,254.0,29.0,27.0,...,0.011973,1.1973,COSM989868,6.0,3,84,0,indel,Valid,Retain
2,Mock003,KRAS_22_5738866_A/G,KRAS,c.2073A>G,p.Ala218Val,Run3,0.163,291.0,41.0,35.0,...,0.00312,0.312,COSM924050,2.0,0,80,0,deletion,Discard,Discard
3,Mock004,BRCA2_18_377694477_A/G,BRCA2,c.2030A>G,p.Ala947Val,Run3,0.067,619.0,39.0,50.0,...,0.017324,1.7324,COSM606687,2.0,3,68,0,substitution,Valid,Discard
4,Mock005,ARID1A_15_547373809_A/G,ARID1A,c.382A>G,p.Ala782Val,Run2,0.599,718.0,33.0,14.0,...,0.014161,1.4161,COSM938076,9.0,0,11,0,indel,Discard,Retain


In [11]:
# Check the number of rows in the DataFrame.

print(len(data))

100


In [13]:
# Find rows where technical_status does not match the 'retain_discard' column.
# 1. Filter the rows where technical_status is 'valid' and retain_discard is **not** 'Retain'.
# Combine the conditions and store the filtered rows in a new DataFrame named `valid_mismatch`.

valid_mismatch = data[(data['technical_status'] == 'valid') & (data['retain_discard'] != 'Retain')]

# Calculate the number of rows in `valid_mismatch`.

print("Lenght of rows with 'valid' technical_status but incorrect retain_discard:")
print(len(valid_mismatch))

Lenght of rows with 'valid' technical_status but incorrect retain_discard:
0


In [15]:
# 2. Filter the rows where technical_status is 'discarded' and retain_discard is **not** 'Retain' or 'Unknown'.
# The `.isin([...])` check `retain_discard` is in the list and the `~` select rows where `retain_discard` is **not** in `['Discard', 'Unknown']`.
# Combine the conditions and store the filtered rows in a new DataFrame named `discarded_mismatch`.

discarded_mismatch = data[(data['technical_status'] == 'discarded') & (~data['retain_discard'].isin(['Discard', 'Unknown']))]

# Calculate the number of rows in `discarded_mismatch`.

print("Lenght of rows with 'discarded' technical_status but incorrect retain_discard:")
print(len(discarded_mismatch))

Lenght of rows with 'discarded' technical_status but incorrect retain_discard:
0


In [17]:
# 1. Define a function named `status_match` that takes a `row` as input.

   #   ii. If `technical_status` is `'discarded'`, then `retain_discard` must be in the list `['Discard', 'Unknown']`.

# Define equivalence rules for comparison
def status_match(row):
    # Check the conditions for equivalence:
    # i. If `technical_status` is `'valid'`, then `retain_discard` must be `'Retain'`.
    if row['technical_status'] == 'valid' and row['retain_discard'] == 'Retain':
        # Return `True` if the equivalence rule is satisfied.
        return True
    # ii. Also check if `technical_status` is `'discarded'`, then `retain_discard` must be in the list `['Discard', 'Unknown']`.
    elif row['technical_status'] == 'discarded' and row['retain_discard'] in ['Discard', 'Unknown']:
        # Return `True` if the equivalence rule is satisfied.
        return True
    # Return `False` if the equivalence rule is not met.
    return False

In [19]:
# Store the result in a new column 'status_match' in the DataFrame.
# Use the 'apply' method to apply the matching rule 'status_match' to each row.
# The 'axis=1' argument specifies that the function should be applied row-wise, as (axis=0 would apply it to columns).

data['status_match'] = data.apply(status_match, axis=1)

In [21]:
# Filter all the rows where the statuses match.
# Select rows where `status_match` is `True` and store these rows in a new DataFrame named `mismatch_rows`.

match_rows = data[data['status_match'] == True]

In [23]:
# Check the number of all the rows where the statuses match in the DataFrame.

print("Lenght of all the rows where the statuses match:")
print(len(match_rows))

Lenght of all the rows where the statuses match:
0


In [25]:
# Display all the match rows to show where the equivalence rule is satisfied.

match_rows

Unnamed: 0,sample_id,variants,genes,cNomen,pNomen,run,vaf,depth,vcf_mutect,vcf_pindel,...,gnomAD_AF_Pct,cosmic_id,cosmic_haem,run_freq,total_freq,length,var_type,technical_status,retain_discard,status_match


In [46]:
# Write the DataFrame to a CSV file named 'match_rows.csv'.
# Use the 'to_csv()' method on the DataFrame 'data' to save it to a CSV file.
# Set the 'index' parameter to False to prevent writing row indices to the CSV file.

match_rows.to_csv('match_rows.csv', index=False)

In [27]:
# Print the new match rows to the console using the 'print()' function.
# Call the 'print()' function with the message: "all the match_rows saved to 'mismatch_rows.csv'.

print("all the match_rows saved to 'match_rows.csv'")

all the match_rows saved to 'match_rows.csv'


In [29]:
# Filter all the rows where the statuses do not match.
# Select rows where `status_match` is `False` and store these rows in a new DataFrame named `mismatch_rows`.

mismatch_rows = data[data['status_match'] == False]

In [31]:
# Check the number of all the rows where the statuses do not match in the DataFrame.

print("Lenght of all the rows where the statuses do not match:")
print(len(mismatch_rows))

Lenght of all the rows where the statuses do not match:
100


In [33]:
# Display all the mismatch rows to show where the equivalence rule is violated.

mismatch_rows

Unnamed: 0,sample_id,variants,genes,cNomen,pNomen,run,vaf,depth,vcf_mutect,vcf_pindel,...,gnomAD_AF_Pct,cosmic_id,cosmic_haem,run_freq,total_freq,length,var_type,technical_status,retain_discard,status_match
0,Mock001,KRAS_2_160883814_A/G,KRAS,c.1077A>G,p.Ala1367Val,Run1,0.377,354.0,22.0,26.0,...,1.9014,COSM444461,0.0,2,86,0,deletion,Discard,Discard,False
1,Mock002,KRAS_20_183670202_A/G,KRAS,c.2772A>G,p.Ala228Val,Run1,0.727,254.0,29.0,27.0,...,1.1973,COSM989868,6.0,3,84,0,indel,Valid,Retain,False
2,Mock003,KRAS_22_5738866_A/G,KRAS,c.2073A>G,p.Ala218Val,Run3,0.163,291.0,41.0,35.0,...,0.3120,COSM924050,2.0,0,80,0,deletion,Discard,Discard,False
3,Mock004,BRCA2_18_377694477_A/G,BRCA2,c.2030A>G,p.Ala947Val,Run3,0.067,619.0,39.0,50.0,...,1.7324,COSM606687,2.0,3,68,0,substitution,Valid,Discard,False
4,Mock005,ARID1A_15_547373809_A/G,ARID1A,c.382A>G,p.Ala782Val,Run2,0.599,718.0,33.0,14.0,...,1.4161,COSM938076,9.0,0,11,0,indel,Discard,Retain,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Mock096,ARID1A_15_103119975_A/G,ARID1A,c.2147A>G,p.Ala148Val,Run3,0.101,600.0,19.0,1.0,...,1.7944,COSM719669,0.0,3,64,0,indel,Valid,Retain,False
96,Mock097,KMT2C_21_183025592_A/G,KMT2C,c.1946A>G,p.Ala238Val,Run3,0.892,477.0,21.0,36.0,...,1.2662,COSM868871,6.0,3,18,0,substitution,Discard,Retain,False
97,Mock098,DNMT3A_9_398039145_A/G,DNMT3A,c.185A>G,p.Ala170Val,Run3,0.342,257.0,34.0,1.0,...,0.6984,COSM677026,3.0,0,86,0,deletion,Valid,Discard,False
98,Mock099,ANA10_19_419898203_A/G,ANA10,c.2545A>G,p.Ala1129Val,Run2,0.721,159.0,1.0,37.0,...,1.7942,COSM541803,5.0,3,88,0,indel,Valid,Retain,False


In [56]:
# Write the DataFrame to a CSV file named 'mismatch_rows.csv'.
# Use the 'to_csv()' method on the DataFrame 'data' to save it to a CSV file.
# Set the 'index' parameter to False to prevent writing row indices to the CSV file.

mismatch_rows.to_csv('mismatch_rows.csv', index=False)

In [35]:
# Print the new mismatch rows to the console using the 'print()' function.
# Call the 'print()' function with the message: "all the mismatch_rows saved to 'mismatch_rows.csv'.

print("all the mismatch_rows saved to 'mismatch_rows.csv'")

all the mismatch_rows saved to 'mismatch_rows.csv'
