In [None]:
# Work 6: Patient_ID Overlap Analysis and Visualization Using Venn Diagrams in Python [W6.Vennd_Overlap.ipynb]:

# This notebook loads three pipe-delimited CSV files, extracts the unique Patient_IDs (column `Potilas_ID`)
# from each, computes their overlaps, plots a three-set Venn diagram annotated with counts, percentages and
# a summary total, and finally saves the figure to disk.

########################################################################################################
#  Sequence list
########################################################################################################

# 1: Define file paths
# 2: Select all columns, except certain excluded columns
# 3: Load data without excluded columns
# 4: Calculate unique Patient_ID identifiers in each dataset
# 5: Calculate the total number of unique Patient_ID identifiers combined
# 6: Draw the Venn diagram
# 7: Calculate percentages
# 8: Update Venn diagram labels with percentages
# 9: Add a summary of unique Patient_ID identifiers
# 10: Color-coded text
# 11: Add legend
# 12: Save the Venn diagram

########################################################################################################
########################################################################################################

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib_venn import venn3

# 1: Define file paths
# Inputs: the three pipe-delimited extracts whose Potilas_ID columns are compared.
dgn_path = '/home/work/datafile1.csv'       # DGN_KAIKKI extract
kertomus_path = '/home/work/datafile2.csv'  # KERTOMUS_DGN extract
til_tap_path = '/home/work/datafile3.csv'   # TIL_TAP extract
# Output: where the finished Venn diagram image is written.
plot_path = '/home/work/plots/vennd.png'

# 2: Select all columns, except certain excluded columns
# One independent, initially empty exclusion list per dataset; put column names
# into the matching list to leave those columns out when the file is loaded.
(
    dgn_exclude_columns,
    kertomus_exclude_columns,
    til_tap_exclude_columns,
) = ([], [], [])

# 3: Load data without excluded columns
def _read_without_columns(path, excluded_columns):
    """Read a pipe-delimited CSV, leaving out the excluded columns.

    Args:
        path: Location of the CSV file to read.
        excluded_columns: Iterable of column names to omit from the result.

    Returns:
        A DataFrame with every column of the file whose name is not listed in
        ``excluded_columns``.
    """
    excluded = frozenset(excluded_columns)
    # A callable ``usecols`` is evaluated against each column name, so excluded
    # columns are skipped while parsing instead of being loaded and then dropped.
    return pd.read_csv(path, sep='|', usecols=lambda column: column not in excluded)


print("3.1: Loading DGN_KAIKKI data...")
dgn_df = _read_without_columns(dgn_path, dgn_exclude_columns)

print("3.2: Loading KERTOMUS_DGN data...")
kertomus_df = _read_without_columns(kertomus_path, kertomus_exclude_columns)

print("3.3: Loading TIL_TAP data...")
til_tap_df = _read_without_columns(til_tap_path, til_tap_exclude_columns)

# 4: Calculate unique Patient_ID identifiers in each dataset
def _unique_patient_ids(df, column='Potilas_ID'):
    """Return the set of non-missing identifiers found in ``df[column]``.

    Args:
        df: DataFrame holding the identifier column.
        column: Name of the identifier column.

    Returns:
        A set with one entry per distinct, non-missing identifier.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    # Missing values are dropped on purpose: NaN never compares equal to
    # itself, so rows without an ID would otherwise be counted as one (or even
    # several) extra "patients" and distort every overlap and percentage.
    return set(df[column].dropna())


unique_dgn = _unique_patient_ids(dgn_df)
unique_kertomus = _unique_patient_ids(kertomus_df)
unique_til_tap = _unique_patient_ids(til_tap_df)

# 5: Calculate the total number of unique Patient_ID identifiers combined
# Size of the union of the three sets, i.e. each patient is counted once no
# matter how many datasets it appears in.
combined_unique_ids = len(unique_dgn | unique_kertomus | unique_til_tap)
print(f"Total unique Patient_ID identifiers before combining: {combined_unique_ids}")

# 6: Draw the Venn diagram
def _share_of_total(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count / total * 100 if total else 0.0


# Single source of truth for each dataset's display name, label/legend colour
# and ID set, so the diagram labels and the legend cannot drift apart.
# NOTE(review): the colours are assumed to match the circle colours venn3 uses
# by default -- confirm against the installed matplotlib_venn version.
_venn_sets = (
    ('DGN_KAIKKI', 'red', unique_dgn),
    ('KERTOMUS_DGN', 'green', unique_kertomus),
    ('TIL_TAP', 'blue', unique_til_tap),
)

plt.figure(figsize=(12, 10))
venn_diagram = venn3([ids for _, _, ids in _venn_sets],
                     tuple(name for name, _, _ in _venn_sets))
plt.title("Overlap of Patient_IDs across datasets")

# 7: Calculate percentages
# The union size was already computed in step 5; reuse it rather than
# rebuilding the union of the three sets.
total_ids = combined_unique_ids
subsets = venn_diagram.subset_labels
# Each region label initially holds the region's count as text; a region that
# has no label is None and contributes 0. _share_of_total guards against an
# empty union, which would otherwise raise ZeroDivisionError.
percentages = [
    _share_of_total(int(label.get_text()), total_ids) if label is not None else 0
    for label in subsets
]

# 8: Update Venn diagram labels with percentages
for label, percentage in zip(subsets, percentages):
    if label is not None:
        label.set_text(f'{label.get_text()}\n({percentage:.1f}%)')

# 9: Add a summary of unique Patient_ID identifiers
# Placed just below the figure area; bbox_inches='tight' at save time keeps it
# inside the exported image.
plt.figtext(0.5, -0.05, f'Total unique Patient_ID identifiers: {combined_unique_ids}', ha='center', fontsize=12)

# 10: Color-coded text
# Font size and colour are applied under the same None guard, so a missing set
# label is skipped instead of raising AttributeError.
for text, (_, color, _) in zip(venn_diagram.set_labels, _venn_sets):
    if text is not None:
        text.set_fontsize(12)
        text.set_color(color)

# 11: Add legend
# One proxy line per dataset, labelled with the dataset's unique-ID count and
# anchored outside the axes at the upper right.
plt.legend(handles=[plt.Line2D([0], [0], color=color, lw=4, label=f'{name} ({len(ids)})')
                    for name, color, ids in _venn_sets],
           loc='upper left', bbox_to_anchor=(1, 1))

# 12: Save the Venn diagram
# Create the target directory first; savefig fails if it does not exist.
Path(plot_path).parent.mkdir(parents=True, exist_ok=True)
plt.savefig(plot_path, bbox_inches='tight')
print(f"Venn diagram saved to path: {plot_path}")






