In [1]:
import pandas as pd 
import numpy as np

In [24]:
# Read the four CSV inputs in one pass; the tuple order matches the names on the left.
# `betas` supplies the row labels for the loadings below, and `cpg_info` supplies the
# hg38 coordinate columns used in the merges. `scores` is not used in the cells shown here.
loadings, scores, betas, cpg_info = map(
    pd.read_csv,
    (
        "pca_loadings_first_100_components.csv",
        "pca_scores_first_100_components.csv",
        "cpgs_imputed.csv",
        "methylation_coordinates.csv",
    ),
)

In [14]:
# Components singled out for inspection.
pcs = ['PC21', 'PC34', 'PC48', 'PC70', 'PC75']

# Keep only those components and label each row with the matching column name of `betas`.
# NOTE(review): this relies on the rows of `loadings` being in the same order as the
# columns of `betas` — confirm against how the loadings file was produced.
pc_filtered = loadings[pcs].set_axis(betas.columns, axis=0)
pc_filtered

Unnamed: 0,PC21,PC34,PC48,PC70,PC75
cg00036119,-0.030925,0.073188,0.069427,0.114177,-0.105159
cg00075967,-0.029608,0.043089,-0.001892,-0.024294,-0.039122
cg00079056,0.009712,0.004943,-0.004128,-0.013443,0.014534
cg00083937,-0.044440,0.016526,0.001832,-0.005026,0.042565
cg00091693,-0.008233,0.019964,0.003456,-0.056930,-0.031724
...,...,...,...,...,...
cg27628536,0.009611,0.012768,0.034609,-0.011773,-0.017238
cg27637521,0.014055,0.019473,-0.029700,0.013558,-0.017190
cg27655905,-0.007172,0.004569,-0.004003,0.003140,-0.002307
ch.13.39564907R,-0.003116,-0.001455,0.003032,0.003059,-0.002867


In [37]:
# TOP 10 / BOTTOM 10 CPGS PER COMPONENT, RANKED BY LOADING

# One record per selected CpG. For every numeric column the 10 largest loadings come
# first, followed by the 10 smallest.
results = [
    {'Column': component, 'Index': cpg, 'Value': loading}
    for component in pc_filtered.columns
    if pd.api.types.is_numeric_dtype(pc_filtered[component])
    for extremes in (
        pc_filtered[component].nlargest(10),
        pc_filtered[component].nsmallest(10),
    )
    for cpg, loading in extremes.items()
]

# Long-format table of the selected loadings.
result_df = pd.DataFrame(results)

# Attach the hg38 coordinates by matching "Index" against cpg_info's "Name". The left
# join keeps every selected CpG even when no coordinate row matches, and the join key
# "Name" is dropped afterwards because it duplicates "Index".
coordinate_columns = ['Name', 'CHR_hg38', 'Start_hg38', 'End_hg38', 'Strand_hg38']
merged_df = (
    result_df
    .merge(cpg_info[coordinate_columns], left_on='Index', right_on='Name', how='left')
    .drop(columns=['Name'])
)

# Write the table to disk (nothing is displayed by this cell).
merged_df.to_csv("imp_PCs_Loadings.csv", index = False)


In [34]:
# Preview of the 3 highest and 3 lowest loadings per component.
#
# This cell deliberately uses its own variable names. `merged_df` (the top-10 table
# built in the cell above) is what the gene-annotation cells further down consume;
# rebinding it to this smaller preview would silently change their results whenever the
# notebook is run from top to bottom.
n_preview = 3
top3_results = []

# Loop through each column in pc_filtered
for col in pc_filtered.columns:
    if pd.api.types.is_numeric_dtype(pc_filtered[col]):
        # Highest `n_preview` loadings, keyed by CpG identifier
        for idx, value in pc_filtered[col].nlargest(n_preview).items():
            top3_results.append({'Column': col, 'Index': idx, 'Value': value})

        # Lowest `n_preview` loadings, keyed by CpG identifier
        for idx, value in pc_filtered[col].nsmallest(n_preview).items():
            top3_results.append({'Column': col, 'Index': idx, 'Value': value})

# Convert the results into a DataFrame
top3_result_df = pd.DataFrame(top3_results)

# Merge on "Index" (preview table) == "Name" (cpg_info). The left join keeps every
# selected CpG even if it has no coordinate row.
top3_merged_df = top3_result_df.merge(
    cpg_info[['Name', 'CHR_hg38', 'Start_hg38', 'End_hg38', 'Strand_hg38']],
    left_on='Index',
    right_on='Name',
    how='left'
)

# Drop the redundant "Name" column after the merge
top3_merged_df = top3_merged_df.drop(columns=['Name'])
top3_merged_df


Unnamed: 0,Column,Index,Value,CHR_hg38,Start_hg38,End_hg38,Strand_hg38
0,PC21,cg13499318,0.355602,chr9,135262338.0,135262340.0,-
1,PC21,cg13077366,0.198232,chr18,37328662.0,37328664.0,-
2,PC21,cg27152890,0.159408,chr19,45396982.0,45396984.0,+
3,PC21,cg26074100,-0.199226,chr8,140558612.0,140558614.0,+
4,PC21,cg01883195,-0.127117,chr6,117481237.0,117481239.0,+
5,PC21,cg02735486,-0.108151,chr4,113049778.0,113049780.0,-
6,PC34,cg17886420,0.138896,chr10,1754547.0,1754549.0,+
7,PC34,cg01637125,0.137355,chr14,77121039.0,77121041.0,-
8,PC34,cg23159337,0.112916,chr3,193554988.0,193554990.0,-
9,PC34,cg12893697,-0.204076,chr11,970388.0,970390.0,+


In [38]:
import pandas as pd
import requests

# Function to get gene annotation from Ensembl REST API
def get_gene_annotation(chromosome, start, end, strand, timeout=30):
    """Return the names of genes overlapping a region, via the Ensembl REST API.

    Parameters
    ----------
    chromosome : str
        Region name, inserted verbatim into the request URL.
    start, end : number
        Region bounds; converted with ``int()`` before being placed in the URL.
    strand : str
        ``"+"`` keeps only features whose ``strand`` is 1, ``"-"`` keeps only
        features whose ``strand`` is -1. Any other value disables the filter.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    list or None
        The ``external_name`` of every returned feature that carries one
        (features without that key are skipped). ``None`` when either
        coordinate is missing or the server answers with a non-OK status.
    """
    # Coordinates are NaN for CpGs that found no match in the coordinate table
    # (left merge upstream); int(NaN) would raise, so report "no result" instead.
    if pd.isna(start) or pd.isna(end):
        return None

    server = "https://rest.ensembl.org"
    ext = f"/overlap/region/human/{chromosome}:{int(start)}-{int(end)}?feature=gene"

    headers = {"Content-Type": "application/json"}
    response = requests.get(server + ext, headers=headers, timeout=timeout)

    if not response.ok:
        return None
    data = response.json()

    # Filter by strand if provided
    if strand == "+":
        data = [d for d in data if d.get("strand") == 1]
    elif strand == "-":
        data = [d for d in data if d.get("strand") == -1]

    # Return gene names only; entries lacking "external_name" are left out
    return [d.get("external_name") for d in data if "external_name" in d]

# Annotate merged_df with gene information
# The same CpG can appear under several components, so successful lookups are cached per
# (chromosome, start, end, strand) to avoid repeating identical requests to the API.
# Failed lookups (None) are not cached, so a later duplicate row gets another attempt.
annotation_cache = {}
unresolved_rows = 0
gene_annotations = []
for chrom, start, end, strand in zip(
    merged_df['CHR_hg38'],
    merged_df['Start_hg38'],
    merged_df['End_hg38'],
    merged_df['Strand_hg38'],
):
    region_key = (chrom, start, end, strand)
    if region_key in annotation_cache:
        genes = annotation_cache[region_key]
    else:
        genes = get_gene_annotation(chrom, start, end, strand)
        if genes is not None:
            annotation_cache[region_key] = genes

    # None means the lookup itself produced no result, which is different from a
    # lookup that succeeded and found no gene; count it so it is not silently hidden.
    if genes is None:
        unresolved_rows += 1
    gene_annotations.append(", ".join(genes) if genes else "No Gene Found")

if unresolved_rows:
    print(
        f"Warning: {unresolved_rows} row(s) could not be looked up "
        "and are labelled 'No Gene Found'."
    )

# Add gene annotations to the DataFrame
merged_df['Gene_Annotation'] = gene_annotations

# Save the updated DataFrame to a new CSV file
merged_df.to_csv('annotated_merged_df.csv', index=False)

# Display the updated DataFrame
print(merged_df.head())


  Column       Index     Value CHR_hg38   Start_hg38     End_hg38 Strand_hg38  \
0   PC21  cg13499318  0.355602     chr9  135262338.0  135262340.0           -   
1   PC21  cg13077366  0.198232    chr18   37328662.0   37328664.0           -   
2   PC21  cg27152890  0.159408    chr19   45396982.0   45396984.0           +   
3   PC21  cg23159337  0.148115     chr3  193554988.0  193554990.0           -   
4   PC21  cg13406893  0.123751    chr12  131084101.0  131084103.0           -   

  Gene_Annotation  
0   No Gene Found  
1           CELF4  
2   No Gene Found  
3         ATP13A4  
4   No Gene Found  


In [47]:
# Show the first ten rows of the annotated table, all columns.
merged_df.head(10)

Unnamed: 0,Column,Index,Value,CHR_hg38,Start_hg38,End_hg38,Strand_hg38,Gene_Annotation
0,PC21,cg13499318,0.355602,chr9,135262338.0,135262340.0,-,No Gene Found
1,PC21,cg13077366,0.198232,chr18,37328662.0,37328664.0,-,CELF4
2,PC21,cg27152890,0.159408,chr19,45396982.0,45396984.0,+,No Gene Found
3,PC21,cg23159337,0.148115,chr3,193554988.0,193554990.0,-,ATP13A4
4,PC21,cg13406893,0.123751,chr12,131084101.0,131084103.0,-,No Gene Found
5,PC21,cg14371731,0.102218,chr10,79243417.0,79243419.0,+,ZMIZ1
6,PC21,cg01412762,0.098927,chr2,29529101.0,29529103.0,-,ALK
7,PC21,cg12980795,0.0959,chr5,85760728.0,85760730.0,-,No Gene Found
8,PC21,cg17310258,0.089196,chr11,31825625.0,31825627.0,-,No Gene Found
9,PC21,cg14009688,0.087866,chr7,134779490.0,134779492.0,+,CALD1


In [49]:
# Export the annotated table to an Excel workbook, leaving out the row index.
merged_df.to_excel(excel_writer="annotated_important_cpgs.xlsx", index=False)

In [51]:
# Annotation strings of every row whose annotation does not contain "No Gene Found"
# (missing annotations count as not containing it). Duplicates are kept.
annotated_mask = ~merged_df["Gene_Annotation"].str.contains("No Gene Found", na=False)
merged_df.loc[annotated_mask, "Gene_Annotation"].values

array(['CELF4', 'ATP13A4', 'ZMIZ1', 'ALK', 'CALD1', 'DCBLD1', 'ANK2-AS1',
       'TRAF3', 'TRIP10', 'LINC02733', 'SMPD3', 'ATP13A4', 'TRAF3',
       'CALD1', 'AP2A2', 'SMPD3', 'TNR', 'ZMIZ1', 'FLRT3', 'TRIP10',
       'ICAM5', 'CFAP74', 'TRAF3', 'SMPD3', 'SNX9', 'FHAD1', 'RBFOX1',
       'NGEF', 'P2RY6', 'KLHL35', 'FHAD1', 'CACNA1A', 'GCC1', 'LINC02733',
       'RBFOX1'], dtype=object)