## Create all pairwise combinations of treatment groups

In [12]:
import pandas as pd
import os 

def read_data(file_path, sheet_name, save_path):
    """Load one sheet of an Excel workbook and split it by treatment code.

    The columns 'Mouse_ID', 'Tumor_Side' and 'Age' are removed, the rows are
    partitioned on the value of the 'Treatment' column, and 'Treatment' itself
    is removed from each partition.  The comma-joined column names of the
    frame *before* the per-group split (so 'Treatment' is still included) are
    written to ``<save_path>/columns.txt``.

    Parameters
    ----------
    file_path : str
        Path to the Excel workbook.
    sheet_name : str
        Name of the sheet to read.
    save_path : str
        Directory that receives ``columns.txt``; created if it does not exist.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sham_df, ir_df, aspirin_df, ir_aspirin_df)``.

    Raises
    ------
    KeyError
        If any of the dropped columns or 'Treatment' is missing from the sheet.
    """
    # Treatment code assigned to each returned group.
    # NOTE(review): 'aspirin' is matched to code 11 and 'ir_aspirin' to code 10.
    # If the codes are digit flags (1 = IR, 10 = aspirin) these two would be
    # swapped -- confirm against the spreadsheet legend before relying on it.
    treatment_codes = {'sham': 0, 'ir': 1, 'aspirin': 11, 'ir_aspirin': 10}

    # Read the requested sheet and discard the identifier columns
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    df = df.drop(columns=['Mouse_ID', 'Tumor_Side', 'Age'])

    # One frame per treatment code, without the 'Treatment' column
    groups = {
        group: df[df['Treatment'] == code].drop(columns='Treatment')
        for group, code in treatment_codes.items()
    }

    # Make sure the output directory exists; open() would otherwise fail
    os.makedirs(save_path, exist_ok=True)

    # Save column names to a separate file
    columns_path = os.path.join(save_path, "columns.txt")
    with open(columns_path, 'w') as f:
        f.write(','.join(df.columns))

    return groups['sham'], groups['ir'], groups['aspirin'], groups['ir_aspirin']


def _with_origin(frame, label):
    """Return a copy of ``frame`` whose first column 'Origin' holds ``label``."""
    tagged = frame.copy()
    tagged['Origin'] = label
    return tagged[['Origin'] + [col for col in tagged.columns if col != 'Origin']]


def combine_dfs(sham_df, ir_df, aspirin_df, ir_aspirin_df, save_path):
    """Write one headerless CSV per unordered pair of treatment frames.

    The frames are labelled 'sham', 'ir', 'aspirin' and 'ir+aspirin'.  For
    every pair whose first label sorts before the second (string comparison),
    the two frames are stacked with a leading 'Origin' column set to 0 for
    rows of the first frame and 1 for rows of the second, and saved as
    ``<save_path>/<name1>_<name2>.csv`` without index or header.

    Parameters
    ----------
    sham_df, ir_df, aspirin_df, ir_aspirin_df : pandas.DataFrame
        The per-treatment frames to pair up.
    save_path : str
        Output directory; created if it does not exist.

    Returns
    -------
    None
        The function only writes files and prints one line per file.
    """
    # Store DataFrames in a dictionary for better management
    dfs = {
        'sham': sham_df,
        'ir': ir_df,
        'aspirin': aspirin_df,
        'ir+aspirin': ir_aspirin_df
    }

    # to_csv does not create missing directories
    os.makedirs(save_path, exist_ok=True)

    merged_dataframes = []
    for name1, df1 in dfs.items():
        for name2, df2 in dfs.items():
            # Lexicographic check keeps each pair once and skips self-pairs;
            # it also decides which frame of the pair gets Origin 0.
            if name1 < name2:
                merged_df = pd.concat([_with_origin(df1, 0), _with_origin(df2, 1)])
                merged_dataframes.append((f"{name1}_{name2}", merged_df))

    for name, df in merged_dataframes:
        # Save DataFrame to CSV without headers
        full_path = os.path.join(save_path, f"{name}.csv")
        df.to_csv(full_path, index=False, header=False)
        print(f"Saved DataFrame '{name}' to '{full_path}'")




# Input workbook, sheet to analyse and output directory.
# Built with os.path.join instead of backslash literals: the previous strings
# contained the invalid escape sequences "\F" and "\o" and only worked on Windows.
file_path = os.path.join('.', 'raw_data', 'Final_Spreadsheet_separated_by_age.xlsx')
sheet_name = 'Age_W5'  # Replace with your actual sheet name
save_path = os.path.join('.', 'data', 'output_preprocess')

# Create the source frames for sham, IR, Aspirin and IR+Aspirin
sham_df, ir_df, aspirin_df, ir_aspirin_df = read_data(file_path, sheet_name, save_path)

# Write one CSV per pair of treatment groups into save_path
combine_dfs(sham_df, ir_df, aspirin_df, ir_aspirin_df, save_path)


Saved DataFrame 'ir_sham' to '.\data\output_preprocess\ir_sham.csv'
Saved DataFrame 'ir_ir+aspirin' to '.\data\output_preprocess\ir_ir+aspirin.csv'
Saved DataFrame 'aspirin_sham' to '.\data\output_preprocess\aspirin_sham.csv'
Saved DataFrame 'aspirin_ir' to '.\data\output_preprocess\aspirin_ir.csv'
Saved DataFrame 'aspirin_ir+aspirin' to '.\data\output_preprocess\aspirin_ir+aspirin.csv'
Saved DataFrame 'ir+aspirin_sham' to '.\data\output_preprocess\ir+aspirin_sham.csv'


In [24]:
# Row count of each treatment frame, in the order sham, IR, aspirin, IR+aspirin
print(*(len(group) for group in (sham_df, ir_df, aspirin_df, ir_aspirin_df)))
# Number of columns remaining in the sham frame
print(sham_df.shape[1])

12 10 10 10
29


## Add the feature (column) names to the output of the R code

In [18]:
import pandas as pd
import os

def add_feature_names_csv(names_file_path, csv_directory_path, save_directory_path):
    """Annotate every CSV in a directory with a 'Names' column derived from 'V1'.

    Each line of ``names_file_path`` is one name; line ``k`` (1-based) is the
    name for ``V1 == k``.  'V1' is coerced to integers (non-numeric or missing
    values become 0) and any value outside ``1..len(names)`` is labelled
    'Index out of range'.  Each table is written to
    ``<save_directory_path>/modified_<original file name>``.

    NOTE(review): every line of the names file is counted, including a header
    line if the file has one -- confirm the file layout matches the indices
    produced upstream.

    Parameters
    ----------
    names_file_path : str
        Text file with one name per line.
    csv_directory_path : str
        Directory scanned for ``*.csv`` inputs; a trailing separator is optional.
    save_directory_path : str
        Output directory; created if it does not exist.

    Returns
    -------
    None
        The function only writes files and prints one line per file.

    Raises
    ------
    KeyError
        If an input table has no 'V1' column.
    """
    # Read names from the file into a list
    with open(names_file_path, 'r') as file:
        names = [line.strip() for line in file]

    # 1-based index -> name; a dict lookup works for Python and numpy integers alike
    name_by_index = dict(enumerate(names, start=1))

    # to_csv does not create missing directories
    os.makedirs(save_directory_path, exist_ok=True)

    # Sorted so the processing order does not depend on the file system
    csv_files = sorted(f for f in os.listdir(csv_directory_path) if f.endswith('.csv'))

    # Process each CSV file
    for csv_file in csv_files:
        # os.path.join tolerates a directory path with or without trailing separator
        current_table = pd.read_csv(os.path.join(csv_directory_path, csv_file))
        # Convert 'V1' to integers; values that cannot be parsed become 0
        current_table['V1'] = pd.to_numeric(current_table['V1'], errors='coerce').fillna(0).astype(int)
        # Map the 'V1' column to names, flagging indices with no matching name
        current_table['Names'] = current_table['V1'].map(
            lambda idx: name_by_index.get(idx, 'Index out of range')
        )

        # Save the modified table
        modified_file_path = os.path.join(save_directory_path, 'modified_' + csv_file)
        current_table.to_csv(modified_file_path, index=False)
        print(f"Processed and saved modified data to {modified_file_path}")

# Locations used for this run: the names list, the directory holding the R
# output tables, and the directory that receives the annotated copies.
names_file_path = 'C:\\Shabani\\Projects\\tumor_latency\\data\\column_names.csv'
csv_directory_path = 'C:\\Shabani\\Projects\\tumor_latency\\data\\output_r\\'
save_directory_path = 'C:\\Shabani\\Projects\\tumor_latency\\data\\output_r_joint_features\\'

# Annotate every table found in csv_directory_path
add_feature_names_csv(
    names_file_path=names_file_path,
    csv_directory_path=csv_directory_path,
    save_directory_path=save_directory_path,
)


Processed and saved modified data to C:\Shabani\Projects\tumor_latency\data\output_r_joint_features\modified_modified_output_aspirin_ir+aspirin.csv
Processed and saved modified data to C:\Shabani\Projects\tumor_latency\data\output_r_joint_features\modified_output_aspirin_ir+aspirin.csv
Processed and saved modified data to C:\Shabani\Projects\tumor_latency\data\output_r_joint_features\modified_output_aspirin_ir.csv
Processed and saved modified data to C:\Shabani\Projects\tumor_latency\data\output_r_joint_features\modified_output_aspirin_sham.csv
Processed and saved modified data to C:\Shabani\Projects\tumor_latency\data\output_r_joint_features\modified_output_ir+aspirin_sham.csv
Processed and saved modified data to C:\Shabani\Projects\tumor_latency\data\output_r_joint_features\modified_output_ir_ir+aspirin.csv
Processed and saved modified data to C:\Shabani\Projects\tumor_latency\data\output_r_joint_features\modified_output_ir_sham.csv


## Aggregate the p-values

In [34]:
import pandas as pd
import os

# Set the directory path for the CSV files
csv_directory_path = 'C:\\Shabani\\Projects\\tumor_latency\\data\\output_r_joint_features\\'

# Set the path for the names
names_path = "C:\\Shabani\\Projects\\tumor_latency\\data\\column_names.csv"

# Prefix of the annotated table files; removing it leaves the comparison label.
# (Replaces the former hard-coded slice [16:], which is the length of this prefix.)
file_prefix = 'modified_output_'

# List all CSV files in the directory, sorted so "the first file" is deterministic
csv_files = sorted(f for f in os.listdir(csv_directory_path) if f.endswith('.csv'))

# Read the first p-value table
current_table = pd.read_csv(os.path.join(csv_directory_path, csv_files[0]))

# Read the names from the column table
names = pd.read_csv(names_path)

# Extract the base name without the '.csv' extension for use in naming the column
table_name = os.path.splitext(csv_files[0])[0]

# Strip the prefix only when it is really there, so unexpected file names
# are kept whole instead of being cut at an arbitrary position
if table_name.startswith(file_prefix):
    column_label = table_name[len(file_prefix):]
else:
    column_label = table_name

# Merge the names DataFrame with the current table on the 'Names' column
result_df = pd.merge(names, current_table[['Names', 'V7']], on='Names', how='left')

# Rename the 'V7' column to the comparison label of the current CSV file
result_df = result_df.rename(columns={'V7': column_label})

# Display the result
print(result_df)


                                Names  aspirin_ir+aspirin
0                          K14+_Freq.                 NaN
1                          K18+_Freq.        3.616800e-42
2                   Lymphocytes_Freq.        3.413917e-57
3                         CD3+_Lympho        1.507059e-35
4                         CD8+_Lympho       1.545586e-267
5               Double_Negative_Freq.                 NaN
6        cells_with_lymphos_neighbors                 NaN
7     cells_with_othercells_neighbors                 NaN
8   equal_neighbors(lympho_and_other)        0.000000e+00
9                          Small_area                 NaN
10                         Large_area                 NaN
11                   Pleomorphism_Low                 NaN
12                  Pleomorphism_High                 NaN
13              Crircular_cells_Freq.        0.000000e+00
14              Elongated_cells_Freq.                 NaN
15               low_mean_DAPI_signal        0.000000e+00
16            

In [28]:
# Display the `names` table (last expression of the cell, so the notebook renders it)
names

Unnamed: 0,Names
0,K14+_Freq.
1,K18+_Freq.
2,Lymphocytes_Freq.
3,CD3+_Lympho
4,CD8+_Lympho
5,Double_Negative_Freq.
6,cells_with_lymphos_neighbors
7,cells_with_othercells_neighbors
8,equal_neighbors(lympho_and_other)
9,Small_area


In [29]:
# Display the `current_table` frame (last expression of the cell, so the notebook renders it)
current_table

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,Names
0,2,-10.252017,0.753412,-11.728678,-8.775356,185.162493,3.6167999999999995e-42,K18+_Freq.
1,3,9.539266,0.598497,8.366233,10.712299,254.0421,3.413917e-57,Lymphocytes_Freq.
2,4,-324.361893,26.065665,-375.449658,-273.274128,154.853858,1.507059e-35,CD3+_Lympho
3,5,55.023312,1.574572,51.937208,58.109415,1221.148869,1.5455860000000002e-267,CD8+_Lympho
4,9,-151.14668,2.965075,-156.958121,-145.33524,2598.518462,0.0,equal_neighbors(lympho_and_other)
5,14,62.936919,0.820099,61.329555,64.544284,5889.500704,0.0,Crircular_cells_Freq.
6,16,-28.185069,0.457653,-29.082053,-27.288085,3792.847741,0.0,low_mean_DAPI_signal
7,17,0.042615,0.618667,-1.169949,1.255179,0.004745,0.9450834,high_mean_DAPI_signal
8,20,-9.132828,0.724025,-10.551892,-7.713765,159.11206,1.768707e-36,pSMAD+_Freq.
9,21,-49.288821,1.500596,-52.229935,-46.347707,1078.870726,1.29197e-236,Ki67+_Freq.


In [38]:
import pandas as pd
import os

# Set the directory path for the CSV files
csv_directory_path = 'C:\\Shabani\\Projects\\tumor_latency\\data\\output_r_joint_features\\'

# Directory that receives the aggregated table
saved_path = 'C:\\Shabani\\Projects\\tumor_latency\\data\\analysis\\'
# Set the path for the names
names_path = "C:\\Shabani\\Projects\\tumor_latency\\data\\column_names.csv"

# Prefix of the annotated table files; removing it leaves the comparison label.
# (Replaces the former hard-coded slice [16:], which is the length of this prefix.)
file_prefix = 'modified_output_'

# Read the names from the column table
names = pd.read_csv(names_path)

# List all CSV files in the directory, sorted so the column order is deterministic
csv_files = sorted(f for f in os.listdir(csv_directory_path) if f.endswith('.csv'))

# Initialize the result DataFrame with names
result_df = names.copy()

# Loop through each file in the csv_files list
for csv_file in csv_files:
    # Read the current p-value table
    current_table = pd.read_csv(os.path.join(csv_directory_path, csv_file))

    # Extract the base name without the '.csv' extension for use in naming the column
    table_name = os.path.splitext(csv_file)[0]

    # Strip the prefix only when it is really there, so unexpected file names
    # are kept whole instead of being cut at an arbitrary position
    if table_name.startswith(file_prefix):
        column_label = table_name[len(file_prefix):]
    else:
        column_label = table_name

    # Look up each name's 'V7' value by key rather than by row position:
    # a merge followed by positional assignment misaligns rows as soon as a
    # table lists the same name twice. The first occurrence of a name wins.
    p_values = current_table.drop_duplicates(subset='Names').set_index('Names')['V7']

    # Add the new column to the result DataFrame (NaN where the table has no row)
    result_df[column_label] = names['Names'].map(p_values)

# Display the result
print(result_df)

# Make sure the output directory exists; to_csv does not create it
os.makedirs(saved_path, exist_ok=True)

# Specify the file path where the result should be saved
save_result_path = os.path.join(saved_path, 'analysis_w5.csv')

# Save the result DataFrame to a CSV file
result_df.to_csv(save_result_path, index=False)

                                Names  aspirin_ir+aspirin  aspirin_ir  \
0                          K14+_Freq.                 NaN         NaN   
1                          K18+_Freq.        3.616800e-42         NaN   
2                   Lymphocytes_Freq.        3.413917e-57         NaN   
3                         CD3+_Lympho        1.507059e-35         NaN   
4                         CD8+_Lympho       1.545586e-267         NaN   
5               Double_Negative_Freq.                 NaN         NaN   
6        cells_with_lymphos_neighbors                 NaN    0.577677   
7     cells_with_othercells_neighbors                 NaN         NaN   
8   equal_neighbors(lympho_and_other)        0.000000e+00         NaN   
9                          Small_area                 NaN         NaN   
10                         Large_area                 NaN    0.318564   
11                   Pleomorphism_Low                 NaN    0.003889   
12                  Pleomorphism_High              