In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
###here I remove too small datasets with 27k sites and filter the remain data to include only those sites which located in all datasets
import os
import pandas as pd
from tqdm import tqdm
import sys

def process_feather_files(input_folder, output_folder, min_columns=28000):
    feather_files = [f for f in os.listdir(input_folder) if f.endswith('.feather')]
    valid_files = []
    
    print("Filtering valid files...")
    for file in tqdm(feather_files, desc="Filtering valid files", file=sys.stdout):
        file_path = os.path.join(input_folder, file)
        df = pd.read_feather(file_path)
        if df.shape[1] >= min_columns:
            valid_files.append(file_path)

    if not valid_files:
        print("No valid files found with the minimum column requirement.")
        return

    common_columns = None
    print("Finding common columns across valid files...")
    for file_path in tqdm(valid_files, desc="Finding common columns", file=sys.stdout):
        df = pd.read_feather(file_path)
        if common_columns is None:
            common_columns = set(df.columns)
        else:
            common_columns.intersection_update(df.columns)

    if not common_columns:
        print("No common columns found. Exiting.")
        return

    print("Processing and saving files...")
    for file_path in tqdm(valid_files, desc="Processing files", file=sys.stdout):
        df = pd.read_feather(file_path)
        df_common = df[list(common_columns)]
        
        output_path = os.path.join(output_folder, os.path.basename(file_path))
        df_common.to_feather(output_path)
        print(f"Processed and saved: {output_path}")

input_folder = "/home/vpal/hobotnica/All_datasets/datasets/data_feather_no_age_big_ds"
output_folder = "/home/vpal/hobotnica/All_datasets/datasets/feather_ds_no_age_39_ds_only_interception"
process_feather_files(input_folder, output_folder)

Filtering valid files...
Filtering valid files: 100%|██████████| 41/41 [06:10<00:00,  9.03s/it]
Finding common columns across valid files...
Finding common columns: 100%|██████████| 39/39 [06:14<00:00,  9.60s/it]
Processing and saving files...
Processing files:   0%|          | 0/39 [00:00<?, ?it/s]Processed and saved: /home/vpal/hobotnica/All_datasets/datasets/feather_ds_no_age_39_ds_only_interception/GSE134429.feather
Processing files:   3%|▎         | 1/39 [00:30<19:25, 30.68s/it]Processed and saved: /home/vpal/hobotnica/All_datasets/datasets/feather_ds_no_age_39_ds_only_interception/GSE131752.feather
Processing files:   5%|▌         | 2/39 [01:01<19:03, 30.90s/it]Processed and saved: /home/vpal/hobotnica/All_datasets/datasets/feather_ds_no_age_39_ds_only_interception/GSE87640.feather
Processing files:   8%|▊         | 3/39 [01:27<16:59, 28.33s/it]Processed and saved: /home/vpal/hobotnica/All_datasets/datasets/feather_ds_no_age_39_ds_only_interception/GSE217633.feather
Processing fi

In [None]:
# import os
# import pandas as pd

# def extract_unique_sites(file_path):
#     try:
#         df = pd.read_csv(file_path, sep='\t', usecols=[0, 1, 2], engine='c', low_memory=False)
#         return set(df.iloc[:, 0]).union(set(df.iloc[:, 1]), set(df.iloc[:, 2]))
#     except Exception as e:
#         print(f"Error processing {file_path}: {e}")
#         return set()

# def find_common_sites(folder_path, output_file, threshold=1.0):
#     files = [f for f in os.listdir(folder_path)]

#     site_counts = {}
#     total_files = len(files)

#     for filename in files:
#         file_path = os.path.join(folder_path, filename)
#         unique_sites = extract_unique_sites(file_path)
        
#         for site in unique_sites:
#             if site in site_counts:
#                 site_counts[site] += 1
#             else:
#                 site_counts[site] = 1

#         print(f"Processed {filename}, current unique sites: {len(site_counts)}")

#     min_occurrences = int(threshold * total_files)

#     common_sites = [site for site, count in site_counts.items() if count >= min_occurrences]
#     if common_sites:
#         with open(output_file, 'w') as f:
#             for site in sorted(common_sites):
#                 f.write(f"{site}\n")
#         print(f"Common sites found in at least {threshold*100}% of datasets saved to {output_file}")
#     else:
#         print(f"No common sites found in at least {threshold*100}% of datasets.")

# folder_path = '/home/vpal/hobotnica/All_datasets/triplets_h_score_bigger_than_0.5'
# output_file = '/home/vpal/hobotnica/All_datasets/analysis_of_triplets_res/common_sites_threshold.txt'

# find_common_sites(folder_path, output_file, threshold=0.7)

Processed GSE118469, current unique sites: 170691
Processed GSE59685, current unique sites: 238824
Processed GSE111629, current unique sites: 296757
Processed GSE134429, current unique sites: 301989
Processed GSE32148, current unique sites: 307124
Processed GSE118468, current unique sites: 317649
Processed GSE56581, current unique sites: 318620
Processed GSE130030, current unique sites: 321960
Processed GSE214297, current unique sites: 323269
Processed GSE42861, current unique sites: 323698
Processed GSE193836, current unique sites: 328408
Processed GSE87640, current unique sites: 328413
Processed GSE143942, current unique sites: 328414
Processed GSE107143, current unique sites: 332075
Processed GSE56606, current unique sites: 332184
Processed GSE77696, current unique sites: 335328
Processed GSE72338, current unique sites: 335916
Processed GSE106648, current unique sites: 335916
Processed GSE72774, current unique sites: 335966
Processed GSE99624, current unique sites: 336403
Processed 

In [18]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# def visualize_overlap_matrix(csv_file, output_image):
#     """Visualize the overlap matrix as a heatmap without annotations."""
#     # Load the overlap matrix from the CSV file
#     overlap_matrix = pd.read_csv(csv_file, index_col=0)
    
#     # Create a heatmap using Seaborn without annotations
#     plt.figure(figsize=(18, 15))
#     sns.heatmap(
#         overlap_matrix, 
#         annot=False,    # Disable annotations
#         cmap='viridis', 
#         linewidths=0.3, 
#         square=True, 
#         cbar_kws={'label': 'Number of Common Columns'}
#     )
    
#     # Add titles and labels
#     plt.title("Column Overlap Between Datasets", fontsize=16)
#     plt.xlabel("Datasets")
#     plt.ylabel("Datasets")
    
#     # Rotate x-axis labels for better readability
#     plt.xticks(rotation=90)
#     plt.yticks(rotation=0)
    
#     # Save the heatmap as an image file
#     plt.savefig(output_image, format='png', dpi=300, bbox_inches='tight')
#     plt.show()

# # Define the paths to your CSV file and the output image
# csv_file = '/home/vpal/hobotnica/All_datasets/analysis_of_triplets_res/overlap_matrix.csv'
# output_image = '/home/vpal/hobotnica/All_datasets/analysis_of_triplets_res/overlap_heatmap_no_numbers.png'

# # Visualize the overlap matrix
# visualize_overlap_matrix(csv_file, output_image)
# print(f"Heatmap saved as {output_image}")

In [43]:
# import os
# import pandas as pd
# from collections import defaultdict

# def count_site_occurrences(folder_path):
#     site_counts = defaultdict(int)

#     for filename in os.listdir(folder_path):
#         file_path = os.path.join(folder_path, filename)
#         df = pd.read_csv(file_path, sep='\t', engine='python')

#         all_sites = set(df['Column1']).union(set(df['Column2']), set(df['Column3']))
#         for site in all_sites:
#             site_counts[site] += 1

#     return site_counts

# def filter_rows_by_site_count(folder_path, site_counts, min_threshold=20):

#     for filename in os.listdir(folder_path):
#         file_path = os.path.join(folder_path, filename)
#         df = pd.read_csv(file_path, sep='\t', engine='python')
        
#         filtered_df = df[
#             (df['Column1'].apply(lambda x: site_counts[x] >= min_threshold)) &
#             (df['Column2'].apply(lambda x: site_counts[x] >= min_threshold)) &
#             (df['Column3'].apply(lambda x: site_counts[x] >= min_threshold))
#         ]

#         filtered_df.to_csv(file_path, sep='\t', index=False)

# folder_path = '/home/vpal/hobotnica/All_datasets/triplets_h_score_bigger_than_0.5'

# site_counts = count_site_occurrences(folder_path)

# filter_rows_by_site_count(folder_path, site_counts)


In [None]:
# import os
# import pandas as pd

# def filter_files(input_folder, output_folder):
#     os.makedirs(output_folder, exist_ok=True)
    
#     # Iterate through all .txt files in the input folder
#     for filename in os.listdir(input_folder):
#         input_path = os.path.join(input_folder, filename)
        
#         # Read the file into a DataFrame
#         df = pd.read_csv(input_path, sep='\s+', engine='python')
        
#         # Filter out rows where the h-score (4th column) is less than 0.5
#         filtered_df = df[df['Result'] >= 0.5]
        
#         # Save the filtered DataFrame to the output folder
#         output_path = os.path.join(output_folder, filename)
#         filtered_df.to_csv(output_path, sep='\t', index=False)
#         print(f"Processed {filename}, saved to {output_path}")

# # Define the input and output folders
# input_folder = '/home/vpal/hobotnica/All_datasets/H_scores_triplets'
# output_folder = '/home/vpal/hobotnica/All_datasets/triplets_h_score_bigger'

# # Run the function
# filter_files(input_folder, output_folder)

Processed GSE118469, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE118469
Processed GSE59685, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE59685
Processed GSE111629, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE111629
Processed GSE134429, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE134429
Processed GSE32148, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE32148
Processed GSE118468, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE118468
Processed GSE56581, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE56581
Processed GSE130030, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE130030
Processed GSE214297, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE214297
Processed GSE42861, saved to /home/vpal/hobotnica/All_datasets/triplets_h_score_bigger/GSE42861
Processed GSE193836, saved t

In [3]:
import os
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import itertools
from tqdm import tqdm


def get_common_columns(datasets):
    common_columns = set(datasets[0])
    for columns in datasets[1:]:
        common_columns.intersection_update(columns)
    return common_columns

def get_all_columns(datasets):
    all_columns = set()
    for columns in datasets:
        all_columns.update(columns)
    return all_columns

def analyze_column_intersections(folder_path):
    datasets = {}
    files = [f for f in os.listdir(folder_path) if f.endswith('.feather')]
    
    for filename in tqdm(files, desc="Processing files", unit="file"):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_feather(file_path)
        datasets[filename] = set(df.columns)
        print(f"Processed: {filename} with {len(df.columns)} columns")

    dataset_names = list(datasets.keys())
    dataset_columns = list(datasets.values())

    common_columns = get_common_columns(dataset_columns)
    all_columns = get_all_columns(dataset_columns)
    
    print(f"\nCommon Columns across all datasets: {len(common_columns)}")
    print(f"Unique Columns across all datasets: {len(all_columns)}")

    with open('column_analysis.txt', 'w') as f:
        f.write(f"Common Columns across all datasets: {len(common_columns)}\n")
        f.write(f"Unique Columns across all datasets: {len(all_columns)}\n")
        f.write("\nList of common columns:\n")
        f.write(", ".join(common_columns))
        f.write("\n\nList of all columns:\n")
        f.write(", ".join(all_columns))

    overlap_matrix = pd.DataFrame(
        index=dataset_names, columns=dataset_names, dtype=int
    )
    print("\nCalculating column overlap...")
    
    for (name1, cols1), (name2, cols2) in tqdm(itertools.combinations(datasets.items(), 2), 
                                               desc="Calculating overlaps", unit="pair"):
        overlap = len(cols1.intersection(cols2))
        overlap_matrix.loc[name1, name2] = overlap
        overlap_matrix.loc[name2, name1] = overlap

    plt.figure(figsize=(12, 10))
    plt.title("Column Overlap between Datasets")
    plt.imshow(overlap_matrix.fillna(0), cmap="viridis", interpolation="nearest")
    plt.colorbar(label="Number of Common Columns")
    plt.xticks(range(len(dataset_names)), dataset_names, rotation=90)
    plt.yticks(range(len(dataset_names)), dataset_names)
    plt.tight_layout()
    plt.savefig('column_overlap.pdf')
    plt.close()

    print("\nAnalysis completed. Results saved as 'column_analysis.txt' and 'column_overlap.pdf'.")
    return overlap_matrix, common_columns, all_columns

folder_path = '/home/vpal/hobotnica/All_datasets/data_feather_no_age_big_ds'
overlap_matrix, common_columns, all_columns = analyze_column_intersections(folder_path)

Processing files:   0%|          | 0/41 [00:00<?, ?file/s]

Processing files:   2%|▏         | 1/41 [00:13<09:14, 13.87s/file]

Processed: GSE134429.feather with 766019 columns


Processing files:   5%|▍         | 2/41 [00:28<09:17, 14.31s/file]

Processed: GSE131752.feather with 816982 columns


Processing files:   7%|▋         | 3/41 [00:37<07:29, 11.84s/file]

Processed: GSE87640.feather with 485482 columns


Processing files:  10%|▉         | 4/41 [00:53<08:21, 13.54s/file]

Processed: GSE217633.feather with 835168 columns


Processing files:  12%|█▏        | 5/41 [01:02<07:05, 11.82s/file]

Processed: GSE87648.feather with 460305 columns


Processing files:  15%|█▍        | 6/41 [01:10<06:15, 10.72s/file]

Processed: GSE81961.feather with 485417 columns


Processing files:  17%|█▋        | 7/41 [01:19<05:42, 10.07s/file]

Processed: GSE59685.feather with 485482 columns


Processing files:  20%|█▉        | 8/41 [01:27<05:06,  9.28s/file]

Processed: GSE144858.feather with 410891 columns


Processing files:  22%|██▏       | 9/41 [01:35<04:50,  9.09s/file]

Processed: GSE99624.feather with 485478 columns


Processing files:  24%|██▍       | 10/41 [01:44<04:39,  9.00s/file]

Processed: GSE118468.feather with 485479 columns


Processing files:  27%|██▋       | 11/41 [01:53<04:30,  9.02s/file]

Processed: GSE106648.feather with 485417 columns


Processing files:  29%|██▉       | 12/41 [02:02<04:21,  9.02s/file]

Processed: GSE111223.feather with 485479 columns


Processing files:  32%|███▏      | 13/41 [02:11<04:11,  8.96s/file]

Processed: GSE67751.feather with 485417 columns


Processing files:  34%|███▍      | 14/41 [02:27<04:56, 10.99s/file]

Processed: GSE219293.feather with 865768 columns


Processing files:  37%|███▋      | 15/41 [02:42<05:19, 12.28s/file]

Processed: GSE122244.feather with 822800 columns


Processing files:  39%|███▉      | 16/41 [02:51<04:41, 11.25s/file]

Processed: GSE130029.feather with 469664 columns


Processing files:  41%|████▏     | 17/41 [03:00<04:15, 10.64s/file]

Processed: GSE77696.feather with 485417 columns


Processing files:  44%|████▍     | 18/41 [03:09<03:51, 10.06s/file]

Processed: GSE130030.feather with 471338 columns


Processing files:  46%|████▋     | 19/41 [03:21<03:54, 10.65s/file]

Processed: GSE42861.feather with 485482 columns


Processing files:  49%|████▉     | 20/41 [03:29<03:25,  9.80s/file]

Processed: GSE193836.feather with 414249 columns


Processing files:  51%|█████     | 21/41 [03:37<03:09,  9.46s/file]

Processed: GSE71841.feather with 485482 columns


Processing files:  54%|█████▎    | 22/41 [03:53<03:37, 11.43s/file]

Processed: GSE145714.feather with 865999 columns


Processing files:  56%|█████▌    | 23/41 [04:02<03:11, 10.64s/file]

Processed: GSE72776.feather with 485417 columns


Processing files:  59%|█████▊    | 24/41 [04:11<02:50, 10.04s/file]

Processed: GSE72338.feather with 485476 columns


Processing files:  61%|██████    | 25/41 [04:20<02:38,  9.88s/file]

Processed: GSE72774.feather with 485417 columns


Processing files:  63%|██████▎   | 26/41 [04:29<02:23,  9.59s/file]

Processed: GSE175364.feather with 485417 columns


Processing files:  66%|██████▌   | 27/41 [04:38<02:11,  9.38s/file]

Processed: GSE118469.feather with 485481 columns


Processing files:  68%|██████▊   | 28/41 [04:46<01:54,  8.84s/file]

Processed: GSE166611.feather with 409889 columns


Processing files:  71%|███████   | 29/41 [04:55<01:46,  8.87s/file]

Processed: GSE131989.feather with 407514 columns


Processing files:  73%|███████▎  | 30/41 [05:04<01:38,  8.94s/file]

Processed: GSE67705.feather with 485417 columns


Processing files:  76%|███████▌  | 31/41 [05:13<01:29,  8.94s/file]

Processed: GSE32148.feather with 485482 columns


Processing files:  78%|███████▊  | 32/41 [05:28<01:37, 10.85s/file]

Processed: GSE182991.feather with 825171 columns


Processing files:  80%|████████  | 33/41 [05:40<01:28, 11.07s/file]

Processed: GSE111629.feather with 485417 columns


Processing files:  83%|████████▎ | 34/41 [05:49<01:13, 10.46s/file]

Processed: GSE56581.feather with 485482 columns


Processing files:  85%|████████▌ | 35/41 [05:49<00:45,  7.51s/file]

Processed: GSE56606.feather with 27580 columns


Processing files:  88%|████████▊ | 36/41 [05:57<00:38,  7.74s/file]

Processed: GSE107143.feather with 460297 columns


Processing files:  90%|█████████ | 37/41 [06:08<00:34,  8.58s/file]

Processed: GSE56046.feather with 485482 columns


Processing files:  93%|█████████▎| 38/41 [06:16<00:25,  8.54s/file]

Processed: GSE156994.feather with 403356 columns


Processing files:  95%|█████████▌| 39/41 [06:17<00:12,  6.17s/file]

Processed: GSE49909.feather with 27580 columns


Processing files:  98%|█████████▊| 40/41 [06:26<00:06,  7.00s/file]

Processed: GSE143942.feather with 485417 columns


Processing files: 100%|██████████| 41/41 [06:36<00:00,  9.66s/file]

Processed: GSE214297.feather with 518476 columns






Common Columns across all datasets: 15300
Unique Columns across all datasets: 900354

Calculating column overlap...


Calculating overlaps: 820pair [01:01, 13.26pair/s]



Analysis completed. Results saved as 'column_analysis.txt' and 'column_overlap.pdf'.


In [2]:
# ###Diagrams Euler

# import os
# import pandas as pd
# import matplotlib.pyplot as plt
# from matplotlib_venn import venn2, venn3
# from upsetplot import UpSet
# from tqdm import tqdm

# def get_column_sets(folder_path):
#     datasets = {}
#     column_counts = []
    
#     files = [f for f in os.listdir(folder_path) if f.endswith('.feather')]
    
#     print("Reading datasets...")
#     for filename in tqdm(files, desc="Processing files", unit="file"):
#         file_path = os.path.join(folder_path, filename)
#         df = pd.read_feather(file_path)
#         column_count = len(df.columns)
#         datasets[filename] = set(df.columns)
#         column_counts.append({'Dataset': filename, 'Number of Columns': column_count})
    
#     # Convert the column count data to a DataFrame
#     column_counts_df = pd.DataFrame(column_counts)
#     column_counts_df.sort_values(by='Number of Columns', ascending=True, inplace=True)
    
#     # Save the table to a text file
#     column_counts_df.to_csv('column_counts.txt', index=False, sep='\t')
    
#     print("\nColumn counts saved to 'column_counts.txt'.")
#     return datasets, column_counts_df

# def draw_venn_diagram(datasets):
#     if len(datasets) == 2:
#         names, sets = list(datasets.keys()), list(datasets.values())
#         venn2(sets, set_labels=names)
#         plt.title("Venn Diagram of Dataset Column Overlaps")
#         plt.show()
#     elif len(datasets) == 3:
#         names, sets = list(datasets.keys()), list(datasets.values())
#         venn3(sets, set_labels=names)
#         plt.title("Venn Diagram of Dataset Column Overlaps")
#         plt.show()
#     else:
#         print("Venn diagrams are only supported for 2 or 3 sets. Consider using an UpSet plot.")

# def draw_upset_plot(datasets, max_categories=20):
#     """
#     Draw an UpSet plot for multiple datasets, limiting to the top `max_categories` intersections.
#     """
#     from collections import defaultdict
#     column_presence = defaultdict(list)
    
#     all_columns = set()
#     for columns in datasets.values():
#         all_columns.update(columns)
    
#     # Create a DataFrame where each row represents a column and each column is a dataset
#     df = pd.DataFrame(
#         {dataset: [col in columns for col in all_columns] 
#          for dataset, columns in datasets.items()},
#         index=list(all_columns)
#     )
    
#     # Group by the dataset columns to get intersection counts
#     upset_data = df.groupby(list(datasets.keys())).size()
    
#     # Limit the data to the top `max_categories` intersections
#     upset_data = upset_data.nlargest(max_categories)
   
#     plt.figure(figsize=(12, 6))
#     upset = UpSet(upset_data)
#     upset.plot()
#     plt.title("UpSet Plot of Dataset Column Overlaps (Top Intersections)")
#     plt.savefig('upset_plot.pdf', format='pdf')
#     plt.close()

    
# def visualize_overlaps(folder_path):
#     datasets, column_counts_df = get_column_sets(folder_path)
    
#     # Display the sorted column counts table
#     print("\nTable of Column Counts:")
#     print(column_counts_df)
    
#     # Save the table to a text file
#     with open('column_counts_sorted.txt', 'w') as f:
#         f.write(column_counts_df.to_string(index=False))
    
#     if len(datasets) <= 3:
#         draw_venn_diagram(datasets)
#     else:
#         draw_upset_plot(datasets)

# folder_path = '/home/vpal/hobotnica/All_datasets/data_feather_no_age_big_ds'
# visualize_overlaps(folder_path)

In [None]:
# ##removing repeats from txt files triplets
# import pandas as pd
# import os
# folder_path = '/home/vpal/hobotnica/All_datasets/H_scores_triplets_its_txt_format_not_feather'

# for filename in os.listdir(folder_path):
#     if filename.endswith('.feather'):
#         file_path = os.path.join(folder_path, filename)
#         data = pd.read_csv(file_path, delimiter='\t') 
#         data = data.drop_duplicates()
#         data.to_csv(file_path, sep='\t', index=False)

Duplicate rows removed from all files in the folder.


In [12]:
###save ds to feather format
import pandas as pd
import os
directory = "/home/vpal/hobotnica/All_datasets/no_age_multiple_regression_big_datasets"
output_path = "/home/vpal/hobotnica/All_datasets/data_feather"

files = os.listdir(directory)
for file in files:
    file_path = os.path.join(directory, file)  
    data = pd.read_pickle(file_path)
    base_name = os.path.splitext(file)[0]
    output_file_name = base_name + ".feather"
    output_file_path = os.path.join(output_path, output_file_name)
    data.to_feather(output_file_path)
    

In [21]:
#Get the final version of result table with H-score and pval
file1_path = '/home/vpal/hobotnica/PhenoAgeV2_res_imputed_rand_signature/PhenoAgeV2_H_scores_no_age.csv' #I change the name of files after result of the code
file2_path = '/home/vpal/hobotnica/PhenoAgeV2_res_imputed_rand_signature/PhenoAgeV2_H_scores_with_age_only.csv' #I change the name of files after result of the code

df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

df12 = pd.merge(df2, df1, on="Dataset_ID", how='outer')
df12 = df12.rename(columns={"H_score_x" :"H_score", "H_score_y" :"H_score_no_age", "p_value_x": "p_value", "p_value_y": "p_value_no_age"})

df3 = pd.read_csv("/home/vpal/hobotnica/ds_info_for_res.csv")
final_df = pd.merge(df3, df12, on="Dataset ID", how='outer')

final_df.to_csv('/home/vpal/hobotnica/PhenoAgeV2_res_imputed_rand_signature/PhenoAgeV2_final_res.csv', index=False)

In [27]:
##round h_scores
columns_to_round = ['H_score', 'H_score_no_age'] 
final_df = pd.read_csv("/home/vpal/hobotnica/PhenoAgeV2_res_imputed_rand_signature/PhenoAgeV2_final_res.csv")
final_df["H_score"] = final_df["H_score"].round(3)
final_df["H_score_no_age"] = final_df["H_score_no_age"].round(3)
final_df.to_csv('/home/vpal/hobotnica/PhenoAgeV2_res_imputed_rand_signature/PhenoAgeV2_final_res.csv', index=False)


In [12]:
#sorting by column
final_df = pd.read_csv("/home/vpal/hobotnica/PhenoAgeV2_res_imputed_rand_signature/PhenoAgeV2_final_res.csv")
final_df = final_df.sort_values(by = "H_score_no_age", ascending=False) 
final_df.to_csv('/home/vpal/hobotnica/PhenoAgeV2_res_imputed_rand_signature/PhenoAgeV2_final_res.csv', index=False)

In [None]:
# ###Multiple linear regression with intercept subtraction

# def substract_age(file_path):
#     data = pd.read_csv(file_path)
#     data_1 = data.copy()

#     data_1['Condition'] = data_1['Condition'].apply(lambda x: 0 if x == 'HC' else 1)
    
#     for site in data_1.columns[1:-3]:
#         y_train = data_1[site] 
#         X_train = data_1[['Age', 'Condition']] 

#         model = LinearRegression()
#         model.fit(X_train, y_train)
        
#         age_coef = model.coef_[0] # Coefficient b1
#         intercept = model.intercept_
#         data_1[site] = data_1[site] - (age_coef * data_1['Age'] + intercept)
        
#     data_1["Condition"] = data["Condition"]
    
#     return data_1

# datasets = '/tank/projects/vpalagina_hobotnica/hobotnica/clocks/GrimAgeV1/datasets_GrimAgeV1'
# PhenoAge_mult_regr_age = '/tank/projects/vpalagina_hobotnica/hobotnica/clocks/GrimAgeV1/no_age'

# for filename in os.listdir(datasets):
#     file_path = os.path.join(datasets, filename)
#     processed_df = substract_age(file_path)               
#     output_path = os.path.join(PhenoAge_mult_regr_age, filename)
#     processed_df.to_csv(output_path, index=False)

In [1]:
# import os
# import cudf
# import cudf
# import pandas as pd
# from cuml import LinearRegression
# from tqdm import tqdm

# def substract_age(file_path):
#     data = pd.read_pickle(file_path)
#     data = cudf.DataFrame.from_pandas(data)
#     data_1 = data.copy()
    
#     data_1['Condition'] = data_1['Condition'].applymap(lambda x: 0 if x == 'HC' else 1)

#     for idx, site in tqdm(enumerate(data_1.columns[1:-2])):
#         y_train = data_1[site]
#         X_train = data_1[['Age', 'Condition']]

#         model = LinearRegression()
#         model.fit(X_train, y_train)

#         age_coef = model.coef_[0]
#         intercept = model.intercept_
#         data_1[site] = data_1[site] - (age_coef * data_1['Age'] + intercept)
        
#         # every 1000th site name for progress tracking
#         if (idx + 1) % 1000 == 0:
#             print(f'Processed site: {site}')
            
#     data_1["Condition"] = data["Condition"]
    
#     return data_1

# def process_and_save(file_path, output_folder):
#     print(f'Starting processing of dataset: {os.path.basename(file_path)}')
#     output_path = os.path.join(output_folder, os.path.basename(file_path))

#     if os.path.exists(output_path):
#         print(f'Skipping already processed file: {output_path}')
#         return
    
#     # Save
#     processed_df = substract_age(file_path)
#     processed_df = processed_df.to_pandas()
#     processed_df.to_pickle(output_path)
#     print(f'Processed and saved: {output_path}')

# if __name__ == '__main__':
#     initial_datasets = '/home/vpal/hobotnica/All_datasets/data_imputed_with_meta_for_regression_pkl'
#     substracted_age = '/home/vpal/hobotnica/All_datasets/gpu'

#     files = [os.path.join(initial_datasets, filename) for filename in os.listdir(initial_datasets) if filename.endswith('.pickle')]

#     for file_path in tqdm(files):
#         process_and_save(file_path, substracted_age)