***Imports:***

---



In [None]:
import os
import pandas as pd
import glob

***🧬 Merging Significant SNPs from Multiple Folders:***

---




In [None]:
# Merge every per-chromosome "significant SNPs" CSV found under
#   <base_dir>/<phenotype>/<feature>/significant_snps_chrom*_feature*.csv
# into a single CSV written directly inside base_dir. Each row is tagged with
# the phenotype folder, feature folder and file it came from.

# Base directory containing all phenotype folders
base_dir = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/"

# Phenotype directories directly under base_dir.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the combined CSV reproducible between runs.
phenotype_dirs = sorted(
    d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
)

# Successfully loaded per-file DataFrames, and the paths that failed to load
all_dataframes = []
failed_files = []

for pheno in phenotype_dirs:
    pheno_path = os.path.join(base_dir, pheno)

    # Feature directories within the current phenotype directory (sorted, see above)
    feature_dirs = sorted(
        d for d in os.listdir(pheno_path) if os.path.isdir(os.path.join(pheno_path, d))
    )

    for feat in feature_dirs:
        feat_path = os.path.join(pheno_path, feat)

        # Escape the directory part so that folder names containing glob
        # metacharacters ([, ], *, ?) are matched literally; only the file
        # name part is treated as a pattern. glob.glob() order is arbitrary too.
        pattern = os.path.join(glob.escape(feat_path), "significant_snps_chrom*_feature*.csv")
        csv_files = sorted(glob.glob(pattern))

        for csv_file in csv_files:
            try:
                df = pd.read_csv(csv_file)
                # Metadata columns: phenotype name, feature name, original filename
                df["phenotype"] = pheno
                df["feature"] = feat
                df["source_file"] = os.path.basename(csv_file)

                all_dataframes.append(df)
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"❌ Error reading {csv_file}: {e}")
                failed_files.append(csv_file)

# Make partial results visible instead of hiding them behind the success message
if failed_files:
    print(f"⚠️ {len(failed_files)} file(s) could not be read and were skipped.")

# If any dataframes were collected, concatenate and save to a combined CSV
if all_dataframes:
    final_df = pd.concat(all_dataframes, ignore_index=True)
    output_path = os.path.join(base_dir, "ALL_significant_snps_combined.csv")
    final_df.to_csv(output_path, index=False)
    print(f"✅ Combined CSV saved to: {output_path}")
else:
    # Nothing matched (or nothing could be read)
    print("⚠️ No CSV files found.")


✅ Combined CSV saved to: /sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/ALL_significant_snps_combined.csv


***Extract the full ID column for HaploReg:***

---



In [None]:
# Dump every non-missing value of the 'ID' column of the combined CSV into a
# single comma-separated text file that sits next to the CSV.

# Location of the combined significant-SNP table
csv_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/ALL_significant_snps_combined.csv"

df = pd.read_csv(csv_path)

# Fail early with a readable message when the expected column is absent
has_id_column = "ID" in df.columns
if not has_id_column:
    raise ValueError("❌ Column 'ID' not found in the CSV file. Please check the column name.")

# Drop missing IDs, stringify the rest, and collect them in file order
non_null_ids = df["ID"].dropna()
id_values = non_null_ids.astype(str).tolist()

# One comma-separated line (duplicates are kept at this stage)
id_string = ",".join(id_values)

# The text file is written into the same directory as the CSV
csv_dir = os.path.dirname(csv_path)
output_txt_path = os.path.join(csv_dir, "significant_IDs_list.txt")
with open(output_txt_path, "w") as out_file:
    out_file.write(id_string)

# Confirmation message
print(f"✅ ID list saved to: {output_txt_path}")


✅ ID list saved to: /sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/significant_IDs_list.txt


***Extract unique IDs for HaploReg:***

---


In [None]:
# De-duplicate the comma-separated ID list and write the sorted unique IDs
# to a new comma-separated text file.

# Path to the original text file containing comma-separated IDs
input_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/significant_IDs_list.txt"

# Path to save the cleaned file after removing duplicates
output_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/unique_significant_IDs.txt"

# Read the entire content of the input file
with open(input_path, "r") as f:
    text = f.read()

# Split on commas and trim whitespace. Empty tokens (empty file, trailing
# comma/newline, doubled commas) are discarded so that "" is never written
# out as if it were an ID.
ids = [token for token in (x.strip() for x in text.split(",")) if token]

# Remove duplicate IDs using a set, then sort them alphabetically
unique_ids = sorted(set(ids))

# Join the unique IDs back into a single comma-separated string
cleaned_text = ",".join(unique_ids)

# Write the cleaned string to the output file
with open(output_path, "w") as f:
    f.write(cleaned_text)

# Print confirmation message
print("✅ Done. The cleaned file has been saved to:", output_path)


✅ סיום. הקובץ החדש שמור ב: /sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/unique_significant_IDs.txt


***Unique IDs excluding Affx-type IDs:***

---



In [None]:
# Remove every ID that starts with "Affx" (case-insensitive) from the unique
# ID list and write the remaining IDs, sorted, to a new text file.

# Input file that contains a list of unique SNP identifiers
input_unique_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/unique_significant_IDs.txt"

# Output file to save the filtered IDs (excluding those that start with "Affx")
output_no_affx_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/unique_no_affx_IDs.txt"

# Read the content of the input file
with open(input_unique_path, "r") as f:
    text = f.read()

# Split on commas and trim whitespace. Empty tokens are dropped: "" does not
# start with "affx", so without this it would slip through the filter below
# and be written out as a bogus ID.
ids = [token for token in (x.strip() for x in text.split(",")) if token]

# Keep only IDs that do not start with "Affx" (case-insensitive), sorted
filtered_ids = sorted(id_ for id_ in ids if not id_.lower().startswith("affx"))

# Write the filtered IDs to the output file as a comma-separated string
with open(output_no_affx_path, "w") as f:
    f.write(",".join(filtered_ids))

# Print confirmation messages
print("✅ Done. The file without Affx IDs has been created and saved as:")
print(output_no_affx_path)


✅ סיום. הקובץ ללא מזהי Affx נוצר ונשמר בשם:
/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/unique_no_affx_IDs.txt


In [None]:
# Report how many non-empty IDs the filtered, unique ID file contains.

# Comma-separated file holding the filtered, unique SNP IDs
file_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/unique_no_affx_IDs.txt"

# Load the whole file as one string
with open(file_path, "r") as handle:
    text = handle.read()

# Collect each comma-separated token, trimmed; blank tokens are skipped
ids = []
for raw_token in text.split(","):
    token = raw_token.strip()
    if token:
        ids.append(token)

# Total number of valid IDs
num_ids = len(ids)

print(f"🔢 Number of IDs in the file: {num_ids}")


🔢 מספר המזהים בקובץ: 675699


In [None]:
# Build the "top 1000" SNP table: rank all significant SNPs by the 17th column
# of the combined CSV, keep the best-ranked row per ID, drop Affx-prefixed IDs,
# and save the first 1000 rows as a CSV plus a comma-separated ID list.

# Path to the original CSV file containing all significant SNPs
csv_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/ALL_significant_snps_combined.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_path)

# Zero-based position of the ranking column.
# NOTE(review): assumed to be -LOG10(P) — confirm against the CSV header.
sort_col_index = 16
if len(df.columns) <= sort_col_index:
    raise IndexError(
        f"Expected at least {sort_col_index + 1} columns in {csv_path}, "
        f"found {len(df.columns)}."
    )
sort_column = df.columns[sort_col_index]

# Sort in descending order. A stable algorithm is requested so that rows with
# equal values keep their relative order, which makes the row kept by
# drop_duplicates() below deterministic.
df_sorted = df.sort_values(by=sort_column, ascending=False, kind="mergesort")

# Rows without an ID cannot be de-duplicated or exported, and a missing ID
# would break the string filter and the join below, so drop them first.
# Then keep only the first (highest-ranked) occurrence of each ID.
df_unique = df_sorted.dropna(subset=["ID"]).drop_duplicates(subset="ID")

# Filter out rows where the 'ID' starts with 'Affx' (case-insensitive).
# astype(str) keeps this working if the IDs were parsed as non-string values;
# na=False guarantees a purely boolean mask.
is_affx = df_unique["ID"].astype(str).str.lower().str.startswith("affx", na=False)
df_filtered = df_unique[~is_affx]

# Select the top 1000 rows after filtering and deduplication
top_1000 = df_filtered.head(1000)

# Save the top 1000 SNPs to a new CSV file
output_csv_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/top_1000_unique_snps.csv"
top_1000.to_csv(output_csv_path, index=False)

# Extract the SNP IDs of the top 1000 as strings and join them with commas
unique_ids = top_1000["ID"].astype(str).tolist()
ids_text = ",".join(unique_ids)

# Save the list of IDs to a plain text file
output_txt_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/top_1000_ids.txt"
with open(output_txt_path, "w") as f:
    f.write(ids_text)

# Print success messages with paths to the output files
print("✅ Files created successfully:")
print("🔹 CSV:", output_csv_path)
print("🔹 TXT:", output_txt_path)


✅ קבצים נוצרו בהצלחה:
🔹 CSV: /sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/top_1000_unique_snps.csv
🔹 TXT: /sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/top_1000_ids.txt


***Extract top 1000 IDs:***

---



In [None]:
# Re-create the comma-separated ID list from the saved top-1000 SNP table.

# Source table (top 1000 SNPs) and destination text file for the ID list
input_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/top_1000_unique_snps.csv"
output_txt_path = "/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/top_1000_ids.txt"

# Load the table
df = pd.read_csv(input_path)

# Take the 'ID' column as a plain list; no further de-duplication is done here
id_column = df['ID']
unique_ids = id_column.tolist()

# Build one comma-separated string from the IDs
ids_text = ",".join(unique_ids)

# Write the ID string to the output text file
with open(output_txt_path, "w") as out_file:
    out_file.write(ids_text)

# Confirmation message followed by the output location
print("✅ הקובץ עם רשימת ה־IDs נוצר בהצלחה:")
print(output_txt_path)

✅ הקובץ עם רשימת ה־IDs נוצר בהצלחה:
/sise/nadav-group/nadavrap-group/ECGs/Final Project/P-VALUE/Manhattan_plots_without_norm/top_1000_ids.txt
