In [1]:
import pandas as pd
original_df = pd.read_csv("gwas_3_scz_original_files/gwas_3_scz_supp_table_2.csv")

original_df['left_border'] = original_df['left_border'] .str.replace(' ', '', regex=False)
original_df['right_border']  = original_df['right_border'] .str.replace(' ', '', regex=False)
original_df['left_border'] = pd.to_numeric(original_df['left_border'])
original_df['right_border'] = pd.to_numeric(original_df['right_border'])

starts_with_chr = original_df[original_df["snp"].astype(str).str.startswith('chr')].reset_index()
starts_with_chr['pos_actual'] = starts_with_chr['snp'].str.extract(r'_(\d+)_')
print(starts_with_chr)


    index   Rank                snp   A12  frq_case  frq_control chr  \
0       1   57.0     chr1_8424984_D   12D    0.3190       0.3010   1   
1      13   78.0   chr1_243881945_I   12D    0.6380       0.6190   1   
2      17   62.0   chr2_146436222_I   12D    0.1760       0.1630   2   
3      18   95.0   chr2_149429178_D   12D    0.9550       0.9610   2   
4      24   10.0   chr2_200825237_I   12D    0.7410       0.7540   2   
5      33   28.0   chr3_180594593_I   I2D    0.1960       0.2080   3   
6      45  128.0   chr5_140143664_I  I12D    0.4860       0.4750   5   
7      53   53.0    chr6_84280274_D   12D    0.5240       0.5050   6   
8      55    7.0     chr7_2025096_I   DI3    0.4050       0.4230   7   
9      56  109.0    chr7_24747494_D   DI3    0.1040       0.0959   7   
10     74   13.0  chr10_104957618_I   120    0.0654       0.0756  10   
11     76   26.0   chr11_46350213_D   12D    0.8350       0.8500  11   
12    111   43.0   chr18_52749216_D   I2D    0.5890       0.5670

In [2]:
first_file_df = pd.read_csv('./gwas_3_scz_matching/result_SAD0.csv', low_memory = False)
starts_with_chr['chr'] = starts_with_chr['chr'].astype(int)
starts_with_chr['pos_actual'] = starts_with_chr['pos_actual'].astype(int)
merged_ids = pd.merge( first_file_df[['snp', 'chr', 'pos']], starts_with_chr, left_on=['chr', 'pos'], right_on = ['chr', 'pos_actual'] , how='right')
print(merged_ids)


          snp_x   chr        pos_x  index   Rank              snp_y   A12  \
0    rs34269918   1.0    8424984.0      1   57.0     chr1_8424984_D   12D   
1     rs5782266   1.0  243881945.0     13   78.0   chr1_243881945_I   12D   
2    rs56807175   2.0  146436222.0     17   62.0   chr2_146436222_I   12D   
3   rs200327371   2.0  149429178.0     18   95.0   chr2_149429178_D   12D   
4           NaN   2.0          NaN     24   10.0   chr2_200825237_I   12D   
5    rs11411529   3.0  180594593.0     33   28.0   chr3_180594593_I   I2D   
6   rs111896713   5.0  140143664.0     45  128.0   chr5_140143664_I  I12D   
7   rs540827191   6.0   84280274.0     53   53.0    chr6_84280274_D   12D   
8    rs10650434   7.0    2025096.0     55    7.0     chr7_2025096_I   DI3   
9   rs149009306   7.0   24747494.0     56  109.0    chr7_24747494_D   DI3   
10          NaN  10.0          NaN     74   13.0  chr10_104957618_I   120   
11   rs61126341  11.0   46350213.0     76   26.0   chr11_46350213_D   12D   

In [3]:
def replace_chr_with_snp_x(row, merged_ids):
    if row.startswith('chr'):
        # Lookup the corresponding 'snp_x' value in merged_ids DataFrame
        replacement = merged_ids.loc[merged_ids['snp_y'] == row, 'snp_x']
        if not replacement.empty:
            return replacement.iloc[0]
    return row

# Create a dictionary from first_file_df
p_value_dict = first_file_df.set_index('snp')['P'].to_dict()

# Map the P values to the original_df using the snp column
original_df['P'] = 0

original_df['P'] = original_df['snp'].map(p_value_dict)
print(original_df.head())

original_df['snp'] = original_df['snp'].astype(str).apply(replace_chr_with_snp_x, merged_ids=merged_ids)
original_df["P"] = pd.to_numeric(original_df["P"])
print(original_df.head())

   Rank             snp  A12  frq_case  frq_control chr  \
0  54.0       rs4648845   TC     0.533        0.527   1   
1  57.0  chr1_8424984_D  12D     0.319        0.301   1   
2  65.0       rs1498232   TC     0.311        0.296   1   
3  50.0      rs11210892   AG     0.659        0.677   1   
4  22.0      rs12129573   AC     0.377        0.358   1   

                      pos left_border_prev right_border_prev  left_border  \
0         2372401-2402501          2372401           2402501      2372401   
1     8,411,184-8,638,984        8,411,184         8,638,984      8411184   
2   30,412,551-30,437,271       30,412,551        30,437,271     30412551   
3  44,029,384-44, 128,084       44,029,384       44, 128,084     44029384   
4   73,766,426-73,991 366       73,766,426        73,991 366     73766426   

   right_border          P  
0     2402501.0  4.033e-09  
1     8638984.0        NaN  
2    30437271.0  1.284e-09  
3    44128084.0   4.97e-10  
4    73991366.0  2.346e-10  
   Rank 

In [4]:
original_df.head()

def merge_loci(df):
    # Sort the DataFrame by chromosome and left_border
    df = df.sort_values(by=['chr', 'left_border']).reset_index(drop=True)
    merged_loci = []
    current_locus = df.iloc[0].copy()  # Make a copy to avoid modifying the original DataFrame
    
    for i in range(1, len(df)):
        next_locus = df.iloc[i]
        
        if (current_locus['chr'] == next_locus['chr']) and \
        (next_locus['left_border'] <= current_locus['right_border'] + 250000):
            
            # Update borders
            current_locus['left_border'] = min(current_locus['left_border'], next_locus['left_border'])
            current_locus['right_border'] = max(current_locus['right_border'], next_locus['right_border'])
            
            # Update SNP and P value if next_locus has a lower P value
            if next_locus['P'] < current_locus['P']:
                current_locus['snp'] = next_locus['snp']
                current_locus['P'] = next_locus['P']
        
        else:
            # Append the merged locus to the list
            merged_loci.append(current_locus)
            current_locus = next_locus.copy()  # Start a new current locus
    
    # Append the last locus
    merged_loci.append(current_locus)
    
    # Convert the list of merged loci back to a DataFrame
    merged_df = pd.DataFrame(merged_loci)
    
    return merged_df

original_df_loci = merge_loci(original_df)
original_df_loci.to_csv('gwas_3_scz_intermediate_files/original_df_merging')


In [5]:
our_df = pd.read_csv('./gwas_3_scz_result_files/filtered_snps_sd=0.0.csv')
our_df.head()
print(len(our_df))

95


In [6]:
def count_overlaps(df1, df2):
        
        overlap_count = 0
    
        # Iterate over all pairs of intervals
        for _, row1 in df1.iterrows():
            chr1 = int(row1['chr'])
            for _, row2 in df2.iterrows():
                chr2 = int(row2['chr'])
                # Check if the intervals overlap on the same chromosome
                if chr1==chr2 and row1['right_border'] >= row2['left_border'] and row1['left_border'] <= row2['right_border']:
                    overlap_count += 1
                    
        return overlap_count

original_df_loci["chr"] = original_df_loci["chr"].replace("X", 23)
print(len(original_df_loci))
print(len(our_df))

num_matches = original_df_loci['snp'].isin(first_file_df['snp']).sum()
print(f"Number of potential matches: {num_matches}")

109
95
Number of potential matches: 104


In [7]:
count_overlaps(original_df_loci, our_df)

88

In [8]:
print(set(our_df['snp']).intersection(set(original_df_loci['snp'])))
print(len(set(our_df['snp']).intersection(set(original_df_loci['snp']))))

{'rs117074560', 'rs2068012', 'rs11210892', 'rs1702294', 'rs9607782', 'rs7432375', 'rs6984242', 'rs6065094', 'rs950169', 'rs7819570', 'rs12826178', 'rs1106568', 'rs10791097', 'rs12129573', 'rs2514218', 'rs10503253', 'rs75968099', 'rs6704641', 'rs4648845', 'rs2332700', 'rs11411529', 'rs13240464', 'rs1498232', 'rs4129585', 'rs3735025', 'rs77149735', 'rs36068923', 'rs73229090', 'rs77502336', 'rs11027857', 'rs3849046', 'rs2973155', 'rs12325245', 'rs6434928', 'rs16867576', 'rs2851447', 'rs6704768', 'rs7801375', 'rs6002655', 'rs2693698', 'rs7893279', 'rs4766428', 'rs59979824', 'rs4523957', 'rs2053079', 'rs11682175', 'rs12903146', 'rs11693094', 'rs8044995', 'rs2007044', 'rs12887734', 'rs7405404', 'rs2535627', 'rs10520163', 'rs12704290', 'rs34269918', 'rs9922678', 'rs12691307', 'rs75059851', 'rs8042374', 'rs6466055', 'rs1501357', 'rs56807175', 'rs4702', 'rs17194490', 'rs11139497', 'rs215411', 'rs4391122', 'rs55661361', 'rs8082590', 'rs56205728', 'rs10650434', 'rs200327371', 'rs11685299', 'rs117