In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the processed tumor volume of every lesion.
df = pd.read_csv('processed.volumes.lesions.csv')

# 'Case' is an underscore-separated name: the first field becomes the patient,
# the second a YYYYMMDD date, and everything after that the scan name.
case_parts = df['Case'].map(lambda case_name: case_name.split('_'))
df['Case.Patient'] = case_parts.map(lambda parts: parts[0])
df['Case.Date'] = pd.to_datetime(case_parts.map(lambda parts: parts[1]), format='%Y%m%d')
df['Case.Scan'] = case_parts.map(lambda parts: '_'.join(parts[2:]))

# Express the volume in mL instead of mm^3.
df['Volume'] = df['Volume'].div(1000)

# The raw case name and the lesion coordinates are not needed downstream.
df = df.drop(columns=[
    'Case',
    'PhysicalCoordinateSagittal', 'PhysicalCoordinateCoronal', 'PhysicalCoordinateTransverse',
    'VoxelCoordinateSagittal', 'VoxelCoordinateCoronal', 'VoxelCoordinateTransverse',
])

# Both names refer to the same frame as df at this point (no copy is made).
start_df = volume_df = df
start_df

In [None]:
# Latest scan date available for each patient.
last_check_up_dates = volume_df.groupby('Case.Patient', as_index=False)['Case.Date'].max()
last_check_up_dates

In [None]:
# Keep only rows that carry a lesion identifier.
df = df[df['Lesion.ID'].notna()]
df

## Total Volume Analysis


In [None]:
import numpy as np

# Total volume per lesion, date and scan. Only 'Volume' is summed: summing the
# whole frame would also "add up" unrelated columns such as the Case.Patient
# strings, which is wasteful and can fail for non-summable dtypes.
scan_totals = df.groupby(['Lesion.ID', 'Case.Date', 'Case.Scan'])['Volume'].sum().reset_index()

# Collapse the scans of one date into a mean volume and a max-min range per
# lesion and date. The range is NaN when only a single scan exists for a date.
df = scan_totals.groupby(['Lesion.ID', 'Case.Date']).agg(
    Volume_Mean=('Volume', 'mean'),
    Volume_Range=('Volume', lambda x: np.nan if x.size == 1 else x.max() - x.min())
).reset_index()

# Named aggregation needs identifier-style names; restore the dotted names.
df = df.rename(columns={'Volume_Mean': 'Volume.Mean', 'Volume_Range': 'Volume.Range'})

df

## Build an interval dataset


In [None]:
# Make sure the scan date is a datetime and order every lesion chronologically.
df['Case.Date'] = pd.to_datetime(df['Case.Date'])
df = df.sort_values(by=['Lesion.ID', 'Case.Date'])

# Pair each scan with the next scan of the same lesion: the next date and the
# next mean volume become the end of the interval.
next_scan = df.groupby('Lesion.ID')[['Case.Date', 'Volume.Mean']].shift(-1)
df['Case.Date.End'] = next_scan['Case.Date']
df['Volume.Mean.End'] = next_scan['Volume.Mean']

# The original scan date is the start of the interval.
df = df.rename(columns={'Case.Date': 'Case.Date.Start'})

# The last scan of each lesion has no successor and therefore no interval.
df = df[df['Case.Date.End'].notna()]

df

In [None]:
# Attach the patient to every lesion interval.
# start_df holds one row per lesion per scan, so merging against it directly
# multiplies every interval by the number of scans of that lesion. Reduce it to
# one row per lesion first (first occurrence wins) so the merge keeps exactly
# one row per interval and no temporary row ID is needed.
lesion_patient = (
    start_df[['Case.Patient', 'Lesion.ID']]
    .dropna(subset=['Lesion.ID'])
    .drop_duplicates(subset=['Lesion.ID'])
)
df = df.merge(lesion_patient, on='Lesion.ID', how='left', validate='many_to_one')
df

## Load Treatment data

In [None]:
# Load the drug prescriptions.
treatment = pd.read_excel('W_23_5814_2024-03-06.xlsx', sheet_name='Oncolytica')

# Keep ATC codes starting with L01 or L02, rows flagged by ISLAATST, and
# prescriptions whose dosing text does not contain 'per dag 0'.
atc_code = treatment['ATC_CODE'].astype(str)
treatment = treatment.loc[atc_code.str.startswith('L01') | atc_code.str.startswith('L02')]
treatment = treatment.loc[treatment['ISLAATST']]
treatment = treatment.loc[~treatment['DOSERING'].astype(str).str.contains('per dag 0', regex=False)]

# Select and rename the columns of interest (new frame, no in-place edit of a slice).
treatment = treatment[['PATIENTNR', 'VOORSCHRIJFDATUM', 'VOORSCHRIJFEINDDATUM']].rename(
    columns={'PATIENTNR': 'Case.Patient', 'VOORSCHRIJFDATUM': 'Treatment.Start', 'VOORSCHRIJFEINDDATUM': 'Treatment.End'}
)

# Load the surgery data.
surgery = pd.read_excel('W_23_5814_2024-03-06.xlsx', sheet_name='Surgery')

# Keep procedures whose description mentions 'lever', 'embolisatie' or 'rfa'
# (case-insensitive substring match).
# NOTE(review): 'rfa' is a bare substring and may match unrelated words - confirm.
procedure = surgery['OKVR_Verrichting_omschrijving'].astype(str).str.lower()
is_liver_procedure = (
    procedure.str.contains('lever', regex=False)
    | procedure.str.contains('embolisatie', regex=False)
    | procedure.str.contains('rfa', regex=False)
)

# A surgery is a single-day intervention: its date is both start and end.
surgery = (
    surgery.loc[is_liver_procedure, ['PATIENTNR', 'OK_Operatiedatum']]
    .rename(columns={'PATIENTNR': 'Case.Patient', 'OK_Operatiedatum': 'Treatment.Start'})
    .assign(**{'Treatment.End': lambda frame: frame['Treatment.Start']})
)

# Prescriptions and surgeries together form the intervention table.
intervention = pd.concat([treatment, surgery], axis=0, ignore_index=True)

# Load the patient ID keys used for anonymization.
keys = pd.read_csv('20230926 JADS Export Report Patients.csv')

# Drop the one non-numeric patient ID so the column can be cast to int.
# .copy() makes the filtered frame independent, so the assignment below is not
# a chained assignment on a slice.
keys = keys.loc[keys['Patient ID'] != '20401820xxx187484'].copy()
keys['Patient ID'] = keys['Patient ID'].astype(int)

# Replace the hospital patient number by the anonymized name.
intervention = (
    pd.merge(intervention, keys, left_on='Case.Patient', right_on='Patient ID', how='left')
    .drop(columns=['Case.Patient', 'Patient ID'])
    .rename(columns={'Anonymized Name': 'Case.Patient'})
)

intervention

## Add treatment interval indicator

In [None]:
# Interval boundaries of the lesion intervals as datetimes.
for date_column in ('Case.Date.Start', 'Case.Date.End'):
    df[date_column] = pd.to_datetime(df[date_column])

# Intervention boundaries as datetimes.
for date_column in ('Treatment.Start', 'Treatment.End'):
    intervention[date_column] = pd.to_datetime(intervention[date_column])

# Function to check if treatment interval overlaps with case interval
def check_overlap(row, treatment_df):
    """Return True if any intervention of the row's patient overlaps the row's interval.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Case.Patient', 'Case.Date.Start' and 'Case.Date.End'.
    treatment_df : pandas.DataFrame
        Must provide 'Case.Patient', 'Treatment.Start' and 'Treatment.End'.

    Returns
    -------
    bool
        True when at least one intervention of the same patient satisfies
        Treatment.Start <= Case.Date.End and Treatment.End >= Case.Date.Start
        (both boundaries inclusive). Interventions with a missing start or end
        never count as overlapping, because comparisons with NaT are False.
    """
    patient_rows = treatment_df[treatment_df['Case.Patient'] == row['Case.Patient']]
    # Vectorized interval test instead of a Python loop over iterrows().
    starts_before_end = patient_rows['Treatment.Start'] <= row['Case.Date.End']
    ends_after_start = patient_rows['Treatment.End'] >= row['Case.Date.Start']
    return bool((starts_before_end & ends_after_start).any())

# Flag every lesion interval that overlaps at least one intervention of its patient.
df['Treatment.Indicator'] = df.apply(check_overlap, axis=1, args=(intervention,))

df

## Filter lesions on Non-treatment vs Treatment Interval

In [None]:
# Intervals without any overlapping intervention; the flag itself is no longer needed.
dfnotreatment = df.loc[df['Treatment.Indicator'].eq(False)].drop(columns=['Treatment.Indicator'])
dfnotreatment


In [None]:
# Intervals that overlap at least one intervention; the flag itself is no longer needed.
dftreatment = df.loc[df['Treatment.Indicator'].eq(True)].drop(columns=['Treatment.Indicator'])
dftreatment

## Ensuring Continuity of Treatment Periods in Patient Data

### No treatment period

In [None]:
# Work on an explicit copy: pd.DataFrame(frame) does not copy, so the date
# conversion below could otherwise write through to dfnotreatment.
df = dfnotreatment.copy()
df['Case.Date.Start'] = pd.to_datetime(df['Case.Date.Start'])
df['Case.Date.End'] = pd.to_datetime(df['Case.Date.End'])

# Every interval only contributes its start measurement. When an interval's end
# date is not the start of another interval of the same lesion, its end
# measurement would be lost, so add it as an extra "closing" row.
new_rows = []

for group_id, group_df in df.groupby('Lesion.ID', sort=False):
    start_dates = set(group_df['Case.Date.Start'])
    # End dates are unique within a lesion (each comes from the next scan), so
    # every interval row supplies its own end volume.
    for end_date, volume_end in zip(group_df['Case.Date.End'], group_df['Volume.Mean.End']):
        if end_date not in start_dates:
            new_rows.append({
                'Case.Date.Start': end_date,
                'Case.Date.End': pd.NaT,        # closing row has no end date
                'Volume.Mean.Start': volume_end,
                'Volume.Mean.End': np.nan,      # numeric column: NaN, not NaT
                'Lesion.ID': group_id
            })

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
else:
    # Keep the schema stable so downstream cells can always read 'Volume.Mean.Start'.
    df = df.assign(**{'Volume.Mean.Start': np.nan}).reset_index(drop=True)

# Chronological order within each lesion.
dfnt = df.sort_values(by=['Lesion.ID', 'Case.Date.Start']).reset_index(drop=True)

dfnt

In [None]:
# One volume per lesion and date: 'Volume.Mean' for regular interval rows,
# falling back to 'Volume.Mean.Start' where 'Volume.Mean' is missing.
# Built with assign() so no column is written into (or dropped in place from)
# a slice of dfnt.
final_dfnotreatment = dfnt[['Lesion.ID', 'Case.Date.Start']].assign(
    volume_combined=dfnt['Volume.Mean'].combine_first(dfnt['Volume.Mean.Start'])
)
final_dfnotreatment

### Convert to CSV

In [None]:
# Export the non-treatment series (Lesion.ID, Case.Date.Start, volume_combined)
# without the index; this CSV file is used in R with the 'tumgr' library.
final_dfnotreatment.to_csv('grouped_lesions_nt.csv', index=False)

### Treatment period

In [None]:
# Work on an explicit copy: pd.DataFrame(frame) does not copy, so the date
# conversion below could otherwise write through to dftreatment.
df = dftreatment.copy()
df['Case.Date.Start'] = pd.to_datetime(df['Case.Date.Start'])
df['Case.Date.End'] = pd.to_datetime(df['Case.Date.End'])

# Every interval only contributes its start measurement. When an interval's end
# date is not the start of another interval of the same lesion, its end
# measurement would be lost, so add it as an extra "closing" row.
new_rows = []

for group_id, group_df in df.groupby('Lesion.ID', sort=False):
    start_dates = set(group_df['Case.Date.Start'])
    # End dates are unique within a lesion (each comes from the next scan), so
    # every interval row supplies its own end volume.
    for end_date, volume_end in zip(group_df['Case.Date.End'], group_df['Volume.Mean.End']):
        if end_date not in start_dates:
            new_rows.append({
                'Case.Date.Start': end_date,
                'Case.Date.End': pd.NaT,        # closing row has no end date
                'Volume.Mean.Start': volume_end,
                'Volume.Mean.End': np.nan,      # numeric column: NaN, not NaT
                'Lesion.ID': group_id
            })

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
else:
    # Keep the schema stable so downstream cells can always read 'Volume.Mean.Start'.
    df = df.assign(**{'Volume.Mean.Start': np.nan}).reset_index(drop=True)

# Chronological order within each lesion.
dft = df.sort_values(by=['Lesion.ID', 'Case.Date.Start']).reset_index(drop=True)

dft

In [None]:
# One volume per lesion and date: 'Volume.Mean' for regular interval rows,
# falling back to 'Volume.Mean.Start' where 'Volume.Mean' is missing.
# Built with assign() so no column is written into (or dropped in place from)
# a slice of dft.
final_dftreatment = dft[['Lesion.ID', 'Case.Date.Start']].assign(
    volume_combined=dft['Volume.Mean'].combine_first(dft['Volume.Mean.Start'])
)
final_dftreatment

In [None]:
# Export the treatment-interval series (Lesion.ID, Case.Date.Start, volume_combined)
# without the index; this CSV file is used in R with the 'tumgr' library.
final_dftreatment.to_csv('grouped_lesions_t_interval.csv', index=False)

## Tumor Grades

In [None]:
# Read the morphology description of every tumor from the 'Tumor' sheet.
tumordata = pd.read_excel('W_23_5814_2024-03-06.xlsx', sheet_name='Tumor')

# Clean the anonymization keys on an explicit copy, so casting the column is
# not a chained assignment on a filtered slice. `keys` comes from the
# treatment-loading cell; repeating the clean-up is harmless.
keys = keys.loc[keys['Patient ID'] != '20401820xxx187484'].copy()
keys['Patient ID'] = keys['Patient ID'].astype(int)

# Replace the hospital patient number by the anonymized name.
tumorgrades = (
    tumordata[['PATIENTNR', 'TRTU_Morfologie_oms']]
    .merge(keys, left_on='PATIENTNR', right_on='Patient ID', how='left')
    .drop(columns=['PATIENTNR', 'Patient ID'])
    .rename(columns={'Anonymized Name': 'Case.Patient'})
)

# Split the morphology text at the first ', ' into 'type' and 'tumorgrade'.
# n=1 plus reindex guarantees exactly two columns, even when a description
# contains several ', ' or none at all.
morphology = (
    tumorgrades['TRTU_Morfologie_oms']
    .str.split(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
tumorgrades['type'] = morphology[0]
tumorgrades['tumorgrade'] = morphology[1]

# The original morphology text is no longer needed.
tumorgrades = tumorgrades.drop(columns=['TRTU_Morfologie_oms'])
tumorgrades

# Growth rates Max, Mean and Range G

In [None]:
# Load the fitted growth parameters and keep only rows that have a growth rate g.
# (Disabled alternative filter: keep rows where both 'd' and 'phi' are missing.)
tumorgrowth = pd.read_csv('results_lesions_nt.csv')
tumorgrowth = tumorgrowth[tumorgrowth['g'].notna()]
tumorgrowth

In [None]:
# Remove the single row with the largest growth rate g.
# Re-running this cell removes one more row each time.
max_value_index = tumorgrowth['g'].idxmax()
tumorgrowth = tumorgrowth.drop(index=max_value_index)
tumorgrowth

In [None]:
# Attach the fitted growth parameters to the non-treatment lesions
# ('name' in the fit results is the lesion ID) and keep lesions that have a g.
growthrates = (
    dfnotreatment[['Case.Patient', 'Lesion.ID']]
    .merge(tumorgrowth, left_on='Lesion.ID', right_on='name', how='left')
    .drop(columns=['name', 'N', 'type', 'selectedFit'])
    .dropna(subset=['g'])
)

# Add the tumor grade of the patient and keep one row per lesion.
gandgrade = (
    growthrates
    .merge(tumorgrades[['Case.Patient', 'tumorgrade']], on='Case.Patient', how='left')
    .drop_duplicates(subset=['Lesion.ID'])
)

# Strip surrounding whitespace from the grade and recode 'NNO' as 'graad 3'.
gandgrade['tumorgrade'] = gandgrade['tumorgrade'].str.strip().replace({'NNO': 'graad 3'})

# Second name for the same lesion-level frame.
gandgrade1 = gandgrade
gandgrade

In [None]:
# Per patient: largest, average and spread (max - min) of the lesion growth rates.
agg_df = (
    gandgrade.groupby(['Case.Patient'])['g']
    .agg(
        max_g='max',
        mean_g='mean',
        range_g=lambda rates: rates.max() - rates.min(),
    )
    .reset_index()
)

# One row per patient: the grade from the patient's first row plus the aggregates.
gandgrade = (
    gandgrade[['Case.Patient', 'tumorgrade']]
    .merge(agg_df, on=['Case.Patient'])
    .drop_duplicates(subset=['Case.Patient'])
)

# Second name for the same patient-level frame.
gandgrade2 = gandgrade
gandgrade

## Survival Analysis: Mean, Max, Range Growth Rates

In [None]:
# Read diagnosis, death and follow-up information from the 'Tumor' sheet.
tumordata = pd.read_excel('W_23_5814_2024-03-06.xlsx', sheet_name='Tumor')

# .copy() makes the column selection independent of tumordata, so adding the
# 'overleden' column is not an assignment on a slice.
tumorgrades = tumordata[['PATIENTNR', 'Date_diagnosis (Stefano)', 'HiX_Date_of_death', 'TRTU_Laatste_follow_up_datum'
,'TRTU_Patientstatus_bij_laatste_follow_up_oms']].copy()

# Event flag: the last follow-up status mentions 'overleden' (case-insensitive);
# a missing status counts as not deceased.
tumorgrades['overleden'] = tumorgrades['TRTU_Patientstatus_bij_laatste_follow_up_oms'].str.contains('overleden', case=False, na=False)

# Clean the anonymization keys on an explicit copy (no chained assignment).
keys = keys.loc[keys['Patient ID'] != '20401820xxx187484'].copy()
keys['Patient ID'] = keys['Patient ID'].astype(int)

# Replace the hospital patient number by the anonymized name.
tumorgrades = (
    pd.merge(tumorgrades, keys, left_on='PATIENTNR', right_on='Patient ID', how='left')
    .drop(columns=['PATIENTNR', 'Patient ID'])
    .rename(columns={'Anonymized Name': 'Case.Patient'})
)

# Combine the per-patient growth summaries with the survival information and
# the date of the last scan.
gradesdeath = (
    gandgrade
    .merge(tumorgrades, on='Case.Patient')
    .merge(last_check_up_dates, on='Case.Patient')
)

# End of observation: date of death, else last follow-up date, else last scan date.
gradesdeath['end_date'] = (
    gradesdeath['HiX_Date_of_death']
    .combine_first(gradesdeath['TRTU_Laatste_follow_up_datum'])
    .combine_first(gradesdeath['Case.Date'])
)
gradesdeath

## Correlation first g with mean g

In [None]:
# First non-missing value of every column per patient, in the row order of gandgrade1.
first_lesion_per_patient = gandgrade1.groupby('Case.Patient', as_index=False).first()

# Put the first g next to the patient's mean g.
first_and_mean_g = (
    first_lesion_per_patient
    .merge(gandgrade2[['Case.Patient', 'mean_g']], on='Case.Patient')
    .rename(columns={'g': 'first_g'})
)

# Correlation between the two columns (pandas default method).
correlation = first_and_mean_g['mean_g'].corr(first_and_mean_g['first_g'])

print("Correlation between mean_g and first_g:", correlation)
first_and_mean_g


## Kaplan-Meier curves for max/mean/range g + validation with pairwise log-rank tests

In [None]:
from lifelines import KaplanMeierFitter
from lifelines.statistics import pairwise_logrank_test

# Cut points for the 'max_g' groups. Only the 33rd and 66th percentiles delimit
# groups: the last group is everything above the 66th percentile, the 99th is
# printed but not used as a boundary.
# `method=` replaces the `interpolation=` keyword deprecated since NumPy 1.22.
growth_rate_percentiles = np.percentile(gradesdeath['max_g'], [33, 66, 99], method='nearest')
print(growth_rate_percentiles)

# Time from diagnosis to end of observation in days; identical for every
# group, so it is computed once outside the loop.
followup_days = (gradesdeath['end_date'] - gradesdeath['Date_diagnosis (Stefano)']).dt.days

# A single fitter is re-fitted and plotted for each group.
kmf = KaplanMeierFitter()

# Durations, event flags and group labels collected for the log-rank test.
all_durations = []
all_event_observed = []
all_labels = []

colors = ['green', 'orange', 'red']  # one color per group

for i, percentile in enumerate(growth_rate_percentiles):
    if i == 0:
        mask = gradesdeath['max_g'] <= percentile
        label = f'<= {percentile:.3f}'
    elif i == len(growth_rate_percentiles) - 1:
        mask = gradesdeath['max_g'] > growth_rate_percentiles[i - 1]
        label = f'> {growth_rate_percentiles[i - 1]:.3f}'
    else:
        mask = (gradesdeath['max_g'] > growth_rate_percentiles[i - 1]) & (gradesdeath['max_g'] <= percentile)
        label = f'{growth_rate_percentiles[i - 1]:.3f}-{percentile:.3f}'

    if mask.any():  # skip empty groups
        durations = followup_days[mask]
        event_observed = gradesdeath['overleden'][mask]

        kmf.fit(durations=durations, event_observed=event_observed, label=label)
        kmf.plot(color=colors[i])

        all_durations.extend(durations)
        all_event_observed.extend(event_observed)
        all_labels.extend([label] * len(durations))

# Convert collected data to pandas Series
all_durations = pd.Series(all_durations)
all_event_observed = pd.Series(all_event_observed)
all_labels = pd.Series(all_labels)

# Pairwise log-rank test between the groups
results = pairwise_logrank_test(all_durations, all_labels, event_observed=all_event_observed)
print(results)

# Plot settings
plt.title('Kaplan-Meier Survival Curve by Growth Rate Percentile Max g')
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
plt.grid(True)
plt.legend(title='Growth Rate Percentile')
plt.show()


In [None]:
# Cut points for the 'range_g' groups. Only the 33rd and 66th percentiles
# delimit groups: the last group is everything above the 66th percentile, the
# 99th is printed but not used as a boundary.
# `method=` replaces the `interpolation=` keyword deprecated since NumPy 1.22.
growth_rate_percentiles = np.percentile(gradesdeath['range_g'], [33, 66, 99], method='nearest')
print(growth_rate_percentiles)

# Time from diagnosis to end of observation in days; identical for every
# group, so it is computed once outside the loop.
followup_days = (gradesdeath['end_date'] - gradesdeath['Date_diagnosis (Stefano)']).dt.days

# A single fitter is re-fitted and plotted for each group.
kmf = KaplanMeierFitter()

# Durations, event flags and group labels collected for the log-rank test.
all_durations = []
all_event_observed = []
all_labels = []

colors = ['green', 'orange', 'red']  # one color per group

for i, percentile in enumerate(growth_rate_percentiles):
    if i == 0:
        mask = gradesdeath['range_g'] <= percentile
        label = f'<= {percentile:.3f}'
    elif i == len(growth_rate_percentiles) - 1:
        mask = gradesdeath['range_g'] > growth_rate_percentiles[i - 1]
        label = f'> {growth_rate_percentiles[i - 1]:.3f}'
    else:
        mask = (gradesdeath['range_g'] > growth_rate_percentiles[i - 1]) & (gradesdeath['range_g'] <= percentile)
        label = f'{growth_rate_percentiles[i - 1]:.3f}-{percentile:.3f}'

    if mask.any():  # skip empty groups
        durations = followup_days[mask]
        event_observed = gradesdeath['overleden'][mask]

        kmf.fit(durations=durations, event_observed=event_observed, label=label)
        kmf.plot(color=colors[i])

        all_durations.extend(durations)
        all_event_observed.extend(event_observed)
        all_labels.extend([label] * len(durations))

# Convert collected data to pandas Series
all_durations = pd.Series(all_durations)
all_event_observed = pd.Series(all_event_observed)
all_labels = pd.Series(all_labels)

# Pairwise log-rank test between the groups
results = pairwise_logrank_test(all_durations, all_labels, event_observed=all_event_observed)
print(results)

# Plot settings
plt.title('Kaplan-Meier Survival Curve by Growth Rate Percentile Range g')
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
plt.grid(True)
plt.legend(title='Growth Rate Percentile')
plt.show()


In [None]:
from lifelines import KaplanMeierFitter
from lifelines.statistics import pairwise_logrank_test

# NOTE: the former in-place fill of missing 'HiX_Date_of_death' with
# pd.Timestamp.now() was removed. It wrote fabricated, run-dependent death
# dates into gradesdeath without affecting this analysis: durations use
# 'end_date' (computed earlier) and censoring is carried by 'overleden'.

# Cut points for the 'mean_g' groups. Only the 33rd and 66th percentiles
# delimit groups: the last group is everything above the 66th percentile, the
# 99th is printed but not used as a boundary.
# `method=` replaces the `interpolation=` keyword deprecated since NumPy 1.22.
growth_rate_percentiles = np.percentile(gradesdeath['mean_g'], [33, 66, 99], method='nearest')
print(growth_rate_percentiles)

# Time from diagnosis to end of observation in days; identical for every
# group, so it is computed once outside the loop.
followup_days = (gradesdeath['end_date'] - gradesdeath['Date_diagnosis (Stefano)']).dt.days

# A single fitter is re-fitted and plotted for each group.
kmf = KaplanMeierFitter()

# Durations, event flags and group labels collected for the log-rank test.
all_durations = []
all_event_observed = []
all_labels = []

colors = ['green', 'orange', 'red']  # one color per group
for i, percentile in enumerate(growth_rate_percentiles):
    if i == 0:
        mask = gradesdeath['mean_g'] <= percentile
        label = f'<= {percentile:.3f}'
    elif i == len(growth_rate_percentiles) - 1:
        mask = gradesdeath['mean_g'] > growth_rate_percentiles[i - 1]
        label = f'> {growth_rate_percentiles[i - 1]:.3f}'
    else:
        mask = (gradesdeath['mean_g'] > growth_rate_percentiles[i - 1]) & (gradesdeath['mean_g'] <= percentile)
        label = f'{growth_rate_percentiles[i - 1]:.3f}-{percentile:.3f}'

    if mask.any():  # skip empty groups
        durations = followup_days[mask]
        event_observed = gradesdeath['overleden'][mask]

        kmf.fit(durations=durations, event_observed=event_observed, label=label)
        kmf.plot(color=colors[i])

        all_durations.extend(durations)
        all_event_observed.extend(event_observed)
        all_labels.extend([label] * len(durations))

# Convert collected data to pandas Series
all_durations = pd.Series(all_durations)
all_event_observed = pd.Series(all_event_observed)
all_labels = pd.Series(all_labels)

# Pairwise log-rank test between the groups
results = pairwise_logrank_test(all_durations, all_labels, event_observed=all_event_observed)
print(results)

# Plot settings
plt.title('Kaplan-Meier Survival Curve by Growth Rate Percentile mean g')
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
plt.grid(True)
plt.legend(title='Growth Rate Percentile')
plt.show()


# Growth rates vs Grades - Non-treatment

In [None]:
# Load the fitted growth parameters and keep only rows that have a growth rate g.
# (Disabled alternative filter: keep rows where both 'd' and 'phi' are missing.)
tumorgrowth = pd.read_csv('results_groupedlesions_nt.csv')
tumorgrowth = tumorgrowth[tumorgrowth['g'].notna()]
tumorgrowth

In [None]:
# Outlier removal: drop the single row with the largest growth rate g.
# Re-running this cell removes one more row each time.
max_value_index = tumorgrowth['g'].idxmax()
tumorgrowth = tumorgrowth.drop(index=max_value_index)
tumorgrowth

In [None]:
# Read the morphology description of every tumor from the 'Tumor' sheet.
tumordata = pd.read_excel('W_23_5814_2024-03-06.xlsx', sheet_name='Tumor')

# Clean the anonymization keys on an explicit copy, so casting the column is
# not a chained assignment on a filtered slice. `keys` comes from the
# treatment-loading cell; repeating the clean-up is harmless.
keys = keys.loc[keys['Patient ID'] != '20401820xxx187484'].copy()
keys['Patient ID'] = keys['Patient ID'].astype(int)

# Replace the hospital patient number by the anonymized name.
tumorgrades = (
    tumordata[['PATIENTNR', 'TRTU_Morfologie_oms']]
    .merge(keys, left_on='PATIENTNR', right_on='Patient ID', how='left')
    .drop(columns=['PATIENTNR', 'Patient ID'])
    .rename(columns={'Anonymized Name': 'Case.Patient'})
)

# Split the morphology text at the first ', ' into 'type' and 'tumorgrade'.
# n=1 plus reindex guarantees exactly two columns, even when a description
# contains several ', ' or none at all.
morphology = (
    tumorgrades['TRTU_Morfologie_oms']
    .str.split(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
tumorgrades['type'] = morphology[0]
tumorgrades['tumorgrade'] = morphology[1]

# The original morphology text is no longer needed.
tumorgrades = tumorgrades.drop(columns=['TRTU_Morfologie_oms'])
tumorgrades

In [None]:
# White-grid seaborn styling for the figure.
sns.set(style="whitegrid")

# Histogram (100 bins) of the growth rates with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(tumorgrowth['g'], bins=100, kde=True, color='blue', edgecolor='black', ax=ax)

# Title and axis labels.
ax.set_title('Distribution of Tumor Growth Rates of all the individual lesions', fontsize=16)
ax.set_xlabel('Tumor Growth Rate', fontsize=14)
ax.set_ylabel('Number of Lesions', fontsize=14)

plt.show()

## Validation: Growth Rates merged with Tumor Grades

In [None]:
# Attach the fitted growth parameters to the non-treatment lesions
# ('name' in the fit results is the lesion ID) and keep lesions that have a g.
growthrates = (
    dfnotreatment[['Case.Patient', 'Lesion.ID']]
    .merge(tumorgrowth, left_on='Lesion.ID', right_on='name', how='left')
    .drop(columns=['name', 'N', 'type', 'selectedFit'])
    .dropna(subset=['g'])
)

# Add the tumor grade of the patient and keep one row per lesion.
gandgrade = (
    growthrates
    .merge(tumorgrades[['Case.Patient', 'tumorgrade']], on='Case.Patient', how='left')
    .drop_duplicates(subset=['Lesion.ID'])
)

# Strip surrounding whitespace from the grade and recode 'NNO' as 'graad 3'.
gandgrade['tumorgrade'] = gandgrade['tumorgrade'].str.strip().replace({'NNO': 'graad 3'})
gandgrade


In [None]:
# Mean growth rate per tumor grade.
mean_by_grade = gandgrade.groupby('tumorgrade')['g'].mean()
print(mean_by_grade)

# Set the style of the visualization
sns.set(style="whitegrid")

# One fixed color per tumor grade.
palette = {
    'graad 1': 'green',
    'graad 2': 'orange',
    'graad 3': 'red'
}

# Stacked histogram of the growth rates, colored by tumor grade.
plt.figure(figsize=(10, 6))
sns.histplot(
    gandgrade,
    x='g',
    hue='tumorgrade',
    kde=True,
    multiple='stack',
    bins=100,
    edgecolor='black',
    palette=palette
)

# Title and labels. gandgrade holds one row per lesion, so the y axis counts
# lesions (the label previously said patients).
plt.title('Distribution of Tumor Growth Rates individual lesions non-treatment intervals', fontsize=16)
plt.xlabel('Tumor Growth Rate', fontsize=14)
plt.ylabel('Number of lesions', fontsize=14)


plt.show()

In [None]:
# One fixed color per tumor grade.
palette = {'graad 1': 'green', 'graad 2': 'orange', 'graad 3': 'red'}

# Box plot of the growth rate for each tumor grade.
ax = sns.boxplot(data=gandgrade, x='tumorgrade', y='g', palette=palette)

# Title and axis labels.
ax.set_title('Distribution of Tumor Growth Rates individual lesions non-treatment intervals', fontsize=16)
ax.set_xlabel('Tumor Grade', fontsize=14)
ax.set_ylabel('Tumor Growth Rate', fontsize=14)

# Growth rates vs Grades - Treatment Interval

In [None]:
# Load the fit results for the treatment intervals (replaces the non-treatment
# results held in `tumorgrowth`) and show the table size as (rows, columns).
tumorgrowth = pd.read_csv('results_lesions_t.csv')
tumorgrowth.shape

## Validation: Growth Rates merged with Tumor Grades

In [None]:
# Attach the fitted parameters to the treatment-interval lesions
# ('name' in the fit results is the lesion ID); rows without g are kept here.
growthrates = (
    dftreatment[['Case.Patient', 'Lesion.ID']]
    .merge(tumorgrowth, left_on='Lesion.ID', right_on='name', how='left')
    .drop(columns=['name', 'N', 'type', 'selectedFit'])
)

# Add the tumor grade of the patient and keep one row per lesion.
gandgrade = (
    growthrates
    .merge(tumorgrades[['Case.Patient', 'tumorgrade']], on='Case.Patient', how='left')
    .drop_duplicates(subset=['Lesion.ID'])
)

# Strip surrounding whitespace from the grade and recode 'NNO' as 'graad 3'.
gandgrade['tumorgrade'] = gandgrade['tumorgrade'].str.strip().replace({'NNO': 'graad 3'})
gandgrade

In [None]:
# Mean growth rate per tumor grade.
mean_growth_by_grade = gandgrade.groupby('tumorgrade')['g'].mean()
print(mean_growth_by_grade)

# One fixed color per tumor grade.
palette = {
    'graad 1': 'green',
    'graad 2': 'orange',
    'graad 3': 'red'
}

# Stacked histogram of the growth rates, colored by tumor grade.
plt.figure(figsize=(10, 6))
sns.histplot(
    gandgrade,
    x='g',
    hue='tumorgrade',
    kde=True,
    multiple='stack',
    bins=20,
    edgecolor='black',
    palette=palette
)

# Title and labels. gandgrade holds one row per lesion, so the y axis counts
# lesions (the label previously said patients).
plt.title('Distribution of Tumor Growth Rates individual lesions treatment intervals', fontsize=16)
plt.xlabel('Tumor Growth Rate', fontsize=14)
plt.ylabel('Number of lesions', fontsize=14)


plt.show()


In [None]:
# Box plot of the growth rate for each tumor grade; `palette` is the
# grade-to-color mapping defined in the preceding histogram cell.
ax = sns.boxplot(data=gandgrade, x='tumorgrade', y='g', palette=palette)

# Title and axis labels.
ax.set_title('Distribution of Tumor Growth Rates individual lesions in treatment interval', fontsize=16)
ax.set_xlabel('Tumor Grade', fontsize=14)
ax.set_ylabel('Tumor Growth Rate', fontsize=14)

In [None]:
# Mean decay rate per tumor grade.
mean_decrease_by_grade = gandgrade.groupby('tumorgrade')['d'].mean()
print(mean_decrease_by_grade)

# One fixed color per tumor grade.
palette = {
    'graad 1': 'green',
    'graad 2': 'orange',
    'graad 3': 'red'
}

# Stacked histogram of the decay rates, colored by tumor grade.
plt.figure(figsize=(10, 6))
sns.histplot(
    gandgrade,
    x='d',
    hue='tumorgrade',
    kde=True,
    multiple='stack',
    bins=20,
    edgecolor='black',
    palette=palette
)

# Title and labels. The x axis shows the decay rate 'd' (the label previously
# said growth rate) and each row is a lesion (the label previously said patients).
plt.title('Distribution of Tumor Decay Rates individual lesions treatment intervals', fontsize=16)
plt.xlabel('Tumor Decay Rate', fontsize=14)
plt.ylabel('Number of lesions', fontsize=14)

plt.show()
