In [80]:
import copy
import csv
import ast
import pandas as pd
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import chi2
import requests
import json
import re
from ast import literal_eval

In [81]:
# read in the results text file to a dictionary
def read_results(results_fp):
    with open(results_fp, 'r') as f:
        data = f.read()
    # reconstructing the data as a dictionary
    tmp = ast.literal_eval(data)
    # removing the get_URLs
    control_dict = remove_D_get(tmp)
    return(control_dict)

# remove the get_URL from the dictionary
def remove_D_get(data):
    new_dict = copy.deepcopy(data)
    for pid in new_dict:
        for drug in new_dict[pid]['drugs']:
            new_dict[pid]['drugs'][drug].pop('get_URL', None)
    return new_dict

# reformat the dictionary to be a CSV file with each drug-reaction pair as a row
def reformat_to_csv(data):
    # open the file to write to
    with open("/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/results.csv", "w") as myfile:
        writer = csv.writer(myfile, delimiter='$', quoting=csv.QUOTE_NONE)
        
        # add header row
        header = ['Report_ID', 'Group', 'Drug_name','D_number', 'D_group', 'D_target', 'D_pathway', 'Reaction']
        writer.writerow(header)

        # make a row for each drug-reaction pair by adding relevant information to a list
        for pid in data:
            # iterate through each drug
            for drug in data[pid]['drugs']:
                # iterate through each reaction and write a row for each
                for reaction in data[pid]['reactions_MedDRA']:
                    row = []
                    row.append(pid)
                    row.append(data[pid]['group'])
                    row.append(drug)
                    # if D_number is empty, append "NA"
                    if not data[pid]['drugs'][drug]['D_number']:
                        row.append("NA")
                        row.append("NA")
                        row.append("NA")
                        row.append("NA")
                        row.append(reaction)
                        writer.writerow(row)
                    else :
                        row.append(data[pid]['drugs'][drug]['D_number'])
                        # if classes row is "set()", append "NA"
                        if str(data[pid]['drugs'][drug]['Classes']) == "set()":
                            row.append("NA")
                        else : 
                            temp = data[pid]['drugs'][drug]['Classes']
                            # convert the set to a list
                            temp = list(temp)
                            row.append(temp)
                        # if target list is empty, append "NA"
                        if str(data[pid]['drugs'][drug]['Target']) == "[]":
                            row.append("NA")
                        else :
                            row.append(data[pid]['drugs'][drug]['Target'])                            
                        # if pathway list is empty, append "NA"
                        if str(data[pid]['drugs'][drug]['Pathway']) == "[]":
                            row.append("NA")
                        else :
                            row.append(data[pid]['drugs'][drug]['Pathway'])
                        row.append(reaction)
                        writer.writerow(row)


In [82]:
#data_fp = '/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/results_dictionary.txt'
#results_dict = read_results(data_fp)
#reformat_to_csv(results_dict)

In [83]:
# function to create contingency table for each drug-reaction pair
def drug_contingency():
    
    '''
    [A] = The number of report_IDs (in the given group) containing the drug of interest and containing the adverse event of interest
    [B] = The number of report_IDs (in the given group) containing the drug of interest, but not containing the  adverse event of interest
    [C] = The number of report_IDs (in the given group) containing not containing the drug of interest, but containing the adverse event of interest
    [D] = The number of report_IDs (in the given group) containing neither the drug of interest nor the adverse event of interest
    '''
    
    # Open the input file and create a CSV reader
    with open("/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/results.csv", "r") as infile:
        reader = csv.reader(infile, delimiter="$")
        next(reader)  # Skip header

        # Create nested defaultdicts to store the data and report counts
        data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        group_report_counts = defaultdict(int)
        group_drug_report_counts = defaultdict(lambda: defaultdict(int))
        group_reaction_report_counts = defaultdict(lambda: defaultdict(int))

        # Iterate through the rows in the input file
        for row in reader:
            report_id, group, drug_name, d_number, D_group, D_target, D_pathway, reaction = row
            # Increment the count for the current combination of group, drug, and reaction
            data[group][(drug_name, d_number, D_group, reaction)]["A"] += 1
            group_report_counts[group] += 1
            group_drug_report_counts[group][(drug_name, d_number)] += 1
            group_reaction_report_counts[group][reaction] += 1

    # Open the output file and create a CSV writer
    with open("/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_contingency.csv", "w") as outfile:
        writer = csv.writer(outfile, delimiter="$")
        # Write the header row
        writer.writerow(["Group", "Drug_name", "D_number", "D_Group", "Reaction", "A", "B", "C", "D"])

        # Iterate through the groups in the data
        for group in data:

            # Iterate through the drug_name, d_number, and reaction combinations in the group
            for (drug_name, d_number, D_group, reaction), counts in data[group].items():
                a = counts["A"]  # The number of reports with the drug and reaction
                b = group_drug_report_counts[group][(drug_name, d_number)] - a  # The number of reports with the drug but not the reaction
                c = group_reaction_report_counts[group][reaction] - a  # The number of reports without the drug but with the reaction
                d = group_report_counts[group] - a - b - c  # The number of reports without the drug and reaction

                # writer.writerow([group, drug_name, d_number, D_group, reaction, a, b, c, d])

                # Write the row if A, B, C, and D are all greater than 3
                if a > 3 and b > 3 and c > 3 and d > 3:
                    writer.writerow([group, drug_name, d_number, D_group, reaction, a, b, c, d])
                #else:
                #    writer.writerow([group, drug_name, d_number, D_group, reaction, a, b, c, d])

drug_contingency()

In [84]:
# function to create contingency table for drug group-reaction pair
def dg_contingency():
    # Open the input file and create a CSV reader
    with open("/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/results.csv", "r") as infile:
        reader = csv.reader(infile, delimiter="$")
        next(reader)  # Skip header

        # Create nested defaultdicts to store the data and report counts
        data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        group_report_counts = defaultdict(int)
        group_d_group_report_counts = defaultdict(lambda: defaultdict(int))
        group_reaction_report_counts = defaultdict(lambda: defaultdict(int))

        # Iterate through the rows in the input file
        for row in reader:
            report_id, group, _, _, d_group_str, _, _, reaction = row

            try:
                # Convert the string representation of the d_group list to a list, 
                d_groups = ast.literal_eval(d_group_str)
            
                for d_group_tuple in d_groups:
                    d_group = str(d_group_tuple)
                    # Increment the count for the current combination of group, d_group, and reaction
                    data[group][(d_group, reaction)]["A"] += 1
                    group_report_counts[group] += 1
                    group_d_group_report_counts[group][d_group] += 1
                    group_reaction_report_counts[group][reaction] += 1
            except ValueError:
                # print(f"Error parsing d_group string: {d_group_str}")
                continue
            
    # Open the output file and create a CSV writer
    with open("/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_contingency.csv", "w") as outfile:
        writer = csv.writer(outfile, delimiter="$")
        # Write the header row
        writer.writerow(["Group", "D_group", "Reaction", "A", "B", "C", "D"])

        # Iterate through the groups in the data
        for group in data:
            # Iterate through the d_group and reaction combinations in the group
            for (d_group, reaction), counts in data[group].items():
                a = counts["A"]  # The number of reports with the d_group and reaction
                b = group_d_group_report_counts[group][d_group] - a  # The number of reports with the d_group but not the reaction
                c = group_reaction_report_counts[group][reaction] - a  # The number of reports without the d_group but with the reaction
                d = group_report_counts[group] - a - b - c  # The number of reports without the d_group and reaction

                # writer.writerow([group, d_group, reaction, a, b, c, d])

                # Write the row if A, B, C, and D are all greater than 3
                if a > 3 and b > 3 and c > 3 and d > 3:
                    writer.writerow([group, d_group, reaction, a, b, c, d])
                #else:
                #    writer.writerow([group, d_group, reaction, "null","null" ,"null" ,"null" ])


dg_contingency()

In [85]:
# function to create contingency table for each drug pathway-reaction pair
def dp_contingency():
    # Open the input file and create a CSV reader
    with open("/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/results.csv", "r") as infile:
        reader = csv.reader(infile, delimiter="$")
        next(reader)  # Skip header

        # Create nested defaultdicts to store the data and report counts
        data = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        group_report_counts = defaultdict(int)
        group_d_pathway_report_counts = defaultdict(lambda: defaultdict(int))
        group_reaction_report_counts = defaultdict(lambda: defaultdict(int))

        # Iterate through the rows in the input file
        for row in reader:
            report_id, group, _, _, _, _, d_pathway_str, reaction = row

            try:
                # Convert the string representation of the d_pathway list to a list
                d_pathways = ast.literal_eval(d_pathway_str)

                for d_pathway in d_pathways:
                    # Increment the count for the current combination of group, d_pathway, and reaction
                    data[group][(d_pathway, reaction)]["A"] += 1
                    group_report_counts[group] += 1
                    group_d_pathway_report_counts[group][d_pathway] += 1
                    group_reaction_report_counts[group][reaction] += 1
            except ValueError:
                # print(f"Error parsing d_pathway string: {d_pathway_str}")
                continue

    # Open the output file and create a CSV writer
    with open("/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_contingency.csv", "w") as outfile:
        writer = csv.writer(outfile, delimiter="$")
        # Write the header row
        writer.writerow(["Group", "D_pathway", "Reaction", "A", "B", "C", "D"])

        # Iterate through the groups in the data
        for group in data:
            # Iterate through the d_pathway and reaction combinations in the group
            for (d_pathway, reaction), counts in data[group].items():
                a = counts["A"]  # The number of reports with the d_pathway and reaction
                b = group_d_pathway_report_counts[group][d_pathway] - a  # The number of reports with the d_pathway but not the reaction
                c = group_reaction_report_counts[group][reaction] - a  # The number of reports without the d_pathway but with the reaction
                d = group_report_counts[group] - a - b - c  # The number of reports without the d_pathway and reaction

                # writer.writerow([group, d_pathway, reaction, a, b, c, d])

                # Write the row if A, B, C, and D are all greater than 3
                if a > 3 and b > 3 and c > 3 and d > 3:
                    writer.writerow([group, d_pathway, reaction, a, b, c, d])
                #else:
                #    writer.writerow([group, d_pathway, reaction, "null","null" ,"null" ,"null" ])


dp_contingency()

In [86]:
# perform drug disproportionality analysis and write the results to a new file called drug_disproportionality.csv
def drug_disproportionality_analysis():    
    '''
    [A] = The number of report_IDs (in the given group) containing the drug of interest and containing the adverse event of interest
    [B] = The number of report_IDs (in the given group) containing the drug of interest, but not containing the  adverse event of interest
    [C] = The number of report_IDs (in the given group) not containing the drug of interest, but containing the adverse event of interest
    [D] = The number of report_IDs (in the given group) containing neither the drug of interest nor the adverse event of interest
    '''
    with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_contingency.csv', 'r') as file:
        reader = csv.reader(file, delimiter='$', quoting=csv.QUOTE_NONE)
        with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_disproportionality.csv', 'w') as outfile:
            writer = csv.writer(outfile, delimiter='$', quoting=csv.QUOTE_NONE)
            # write header row
            writer.writerow(['Group', 'Drug', 'D_number', 'D_group', 'Reaction', 'N_observed', 'PRR'])
            # skip the header row in the reader
            next(reader)
            # iterate through each row in contingency.csv
            for row in reader:
                # PRR = (A / (A+B)) / (C/ (C+D))
                A, B, C, D = int(row[5]), int(row[6]), int(row[7]), int(row[8])

                # calculate the PRR if A, B, C, and D are all greater than or equal to 3
                if A >= 3 and B >= 3 and C >= 3 and D >= 3:
                    PRR = (A/(A+B)) / (C/(C+D))
                    # write the row to the file
                    writer.writerow([row[0], row[1], row[2], row[3], row[4], A, PRR])
                # else, write the row to the file with a PRR = nan
                #else:
                #    writer.writerow([row[0], row[1], row[2], row[3], row[4], A, 'nan'])

# perform dg disproportionality analysis and write the results to a new file called dg_disproportionality.csv
def dg_disproportionality_analysis():
    with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_contingency.csv', 'r') as file:
        reader = csv.reader(file, delimiter='$', quoting=csv.QUOTE_NONE)
        with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_disproportionality.csv', 'w') as outfile:
            writer = csv.writer(outfile, delimiter='$', quoting=csv.QUOTE_NONE)
            # write header row
            writer.writerow(['Group', 'KEGG_Dgroup', 'Reaction', 'N_observed', 'PRR'])
            # skip the header row in the reader
            next(reader)
            # iterate through each row in contingency.csv
            for row in reader:
                
                A, B, C, D = int(row[3]), int(row[4]), int(row[5]), int(row[6])
                
                # calculate the PRR if A, B, C, and D are all greater than or equal to 3
                if A >= 3 and B >= 3 and C >= 3 and D >= 3:
                    PRR = (A/(A+B)) / (C/(C+D))
                    # write the row to the file
                    writer.writerow([row[0], row[1], row[2], A, PRR])
                # else, write the row to the file with PRR = nan
                #else:
                #    writer.writerow([row[0], row[1], row[2], A, 'nan'])


drug_disproportionality_analysis()
dg_disproportionality_analysis()

In [87]:
# perform dp disproportionality analysis and write the results to a new file called dp_disproportionality.csv
def dp_disproportionality_analysis():
    with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_contingency.csv', 'r') as file:
        reader = csv.reader(file, delimiter='$', quoting=csv.QUOTE_NONE)
        with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_disproportionality.csv', 'w') as outfile:
            writer = csv.writer(outfile, delimiter='$', quoting=csv.QUOTE_NONE)
            # write header row
            writer.writerow(['Group', 'D_pathway', 'Reaction', 'N_observed', 'PRR'])
            # skip the header row in the reader
            next(reader)
            # iterate through each row in contingency.csv
            for row in reader:

                A, B, C, D = int(row[3]), int(row[4]), int(row[5]), int(row[6])

                # calculate the PRR if A, B, C, and D are all greater than or equal to 3
                if A >= 3 and B >= 3 and C >= 3 and D >= 3:
                    PRR = (A/(A+B)) / (C/(C+D))
                    # write the row to the file
                    writer.writerow([row[0], row[1], row[2], A, PRR])
                # else, write the row to the file with a PRR = nan
                #else:
                #    writer.writerow([row[0], row[1], row[2], A, 'nan'])

dp_disproportionality_analysis()

In [88]:
# add a chi-squared (with Yates' correction) and p-value column to the drug_disproportionality.csv file
def drug_chi_squared_yates(a, b, c, d):
    row_total1 = a + b
    row_total2 = c + d
    col_total1 = a + c
    col_total2 = b + d
    total_obs = a + b + c + d

    expected_a = (row_total1 * col_total1) / total_obs
    expected_b = (row_total1 * col_total2) / total_obs
    expected_c = (row_total2 * col_total1) / total_obs
    expected_d = (row_total2 * col_total2) / total_obs

    # apply chi-squared with Yates' correction
    chi_squared = (np.abs(a - expected_a) - 0.5) ** 2 / expected_a + \
                  (np.abs(b - expected_b) - 0.5) ** 2 / expected_b + \
                  (np.abs(c - expected_c) - 0.5) ** 2 / expected_c + \
                  (np.abs(d - expected_d) - 0.5) ** 2 / expected_d
    
    deg_freedom = 1  # degrees of freedom
    p_value = chi2.sf(chi_squared, deg_freedom)  # survival function (1 - CDF)
                  
    return chi_squared, p_value


# Read the CSV files, separator = $
contingency_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_contingency.csv', sep='$')
disproportionality_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_disproportionality.csv', sep='$')

# Calculate chi-squared and p-values for rows with PRR != nan
chi_squared_values = []
p_values = []
for _, row in contingency_df.iterrows():
    #if np.isnan(row['PRR']):
    #    chi_squared_values.append('nan')
    #    p_values.append('nan')
    #    continue
    chi_squared, p_value = drug_chi_squared_yates(row['A'], row['B'], row['C'], row['D'])
    chi_squared_values.append(chi_squared)
    p_values.append(p_value)

# Add chi-squared and p-values to the disproportionality_df
disproportionality_df['Chi_squared'] = chi_squared_values
disproportionality_df['P_value'] = p_values

# drop rows with PRR = "nan"
# disproportionality_df = disproportionality_df[disproportionality_df['PRR'] != 'nan']

# Save the updated disproportionality_df to a csv file, replacing the old one
disproportionality_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_disproportionality.csv', sep='$', index=False)

In [89]:
# add a chi-squared (with Yates' correction) column to the dg_disproportionality.csv file
def dg_chi_squared_yates(a, b, c, d):
    row_total1 = a + b
    row_total2 = c + d
    col_total1 = a + c
    col_total2 = b + d
    total_obs = a + b + c + d

    expected_a = (row_total1 * col_total1) / total_obs
    expected_b = (row_total1 * col_total2) / total_obs
    expected_c = (row_total2 * col_total1) / total_obs
    expected_d = (row_total2 * col_total2) / total_obs

    # apply chi-squared with Yates' correction
    chi_squared = (np.abs(a - expected_a) - 0.5) ** 2 / expected_a + \
                  (np.abs(b - expected_b) - 0.5) ** 2 / expected_b + \
                  (np.abs(c - expected_c) - 0.5) ** 2 / expected_c + \
                  (np.abs(d - expected_d) - 0.5) ** 2 / expected_d

    deg_freedom = 1  # degrees of freedom
    p_value = chi2.sf(chi_squared, deg_freedom)  # survival function (1 - CDF)              
    
    return chi_squared, p_value

# Read the CSV files, separator = $
contingency_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_contingency.csv', sep='$')
disproportionality_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_disproportionality.csv', sep='$')

# Calculate chi-squared and p-values for rows with PRR != nan
chi_squared_values = []
p_values = []
for _, row in contingency_df.iterrows():
    #if np.isnan(row['PRR']):
    #    chi_squared_values.append('nan')
    #    p_values.append('nan')
    #    continue
    chi_squared, p_value = dg_chi_squared_yates(row['A'], row['B'], row['C'], row['D'])
    chi_squared_values.append(chi_squared)
    p_values.append(p_value)

# Add chi-squared and p-values to the disproportionality_df
disproportionality_df['Chi_squared'] = chi_squared_values
disproportionality_df['P_value'] = p_values

# drop rows with PRR = "nan"
# disproportionality_df = disproportionality_df[disproportionality_df['PRR'] != 'nan']

# Save the updated disproportionality_df, replacing the old dg_disproportionality.csv file
disproportionality_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_disproportionality.csv', sep='$', index=False)

In [90]:
# add a chi-squared (with Yates' correction) column to the dp_disproportionality.csv file
def dp_chi_squared_yates(a, b, c, d):
    row_total1 = a + b
    row_total2 = c + d
    col_total1 = a + c
    col_total2 = b + d
    total_obs = a + b + c + d

    expected_a = (row_total1 * col_total1) / total_obs
    expected_b = (row_total1 * col_total2) / total_obs
    expected_c = (row_total2 * col_total1) / total_obs
    expected_d = (row_total2 * col_total2) / total_obs

    # apply chi-squared with Yates' correction
    chi_squared = (np.abs(a - expected_a) - 0.5) ** 2 / expected_a + \
                  (np.abs(b - expected_b) - 0.5) ** 2 / expected_b + \
                  (np.abs(c - expected_c) - 0.5) ** 2 / expected_c + \
                  (np.abs(d - expected_d) - 0.5) ** 2 / expected_d

    deg_freedom = 1  # degrees of freedom
    p_value = chi2.sf(chi_squared, deg_freedom)  # survival function (1 - CDF)              
    
    return chi_squared, p_value

# Read the CSV files, separator = $
contingency_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_contingency.csv', sep='$')
disproportionality_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_disproportionality.csv', sep='$')

# Calculate chi-squared and p-values for rows with PRR != nan
chi_squared_values = []
p_values = []
for _, row in contingency_df.iterrows():
    #if np.isnan(row['PRR']):
    #    chi_squared_values.append('nan')
    #    p_values.append('nan')
    #    continue
    chi_squared, p_value = dp_chi_squared_yates(row['A'], row['B'], row['C'], row['D'])
    chi_squared_values.append(chi_squared)
    p_values.append(p_value)

# Add chi-squared and p-values to the disproportionality_df
disproportionality_df['Chi_squared'] = chi_squared_values
disproportionality_df['P_value'] = p_values

# drop rows with PRR = "nan"
# disproportionality_df = disproportionality_df[disproportionality_df['PRR'] != 'nan']

# Save the updated disproportionality_df, replacing the old dp_disproportionality.csv file
disproportionality_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_disproportionality.csv', sep='$', index=False)

In [91]:
# compare the PRRs of the two groups in drug_disproportionality and write the results to a new file called drug_comparison.csv
def compare_drug_PRRs():
    # read drug_disproportionality.csv into a pandas dataframe
    disp_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_disproportionality.csv', delimiter='$')
    
    # create a new dataframe with only the rows from group 0 and remove the group column
    group_0 = disp_df.loc[disp_df['Group'] == 0].drop('Group', axis=1)
    # create a new dataframe with only the rows from group 1 and remove the group column
    group_1 = disp_df.loc[disp_df['Group'] == 1].drop('Group', axis=1)
    
    # sort the rows in each dataframe by drug name, D_number, D_group, and reaction
    group_0 = group_0.sort_values(by=['Drug', 'D_number', 'D_group', 'Reaction'])
    group_1 = group_1.sort_values(by=['Drug', 'D_number', 'D_group', 'Reaction'])


    # merge the two dataframes on drug name and reaction for each drug reaction pair in group 1
    merged_df = pd.merge(group_0, group_1, on=['Drug', 'Reaction', 'D_number'], how='inner')

    # rename the P_value_x column to P_value_0 and the P_value_y column to P_value_1
    merged_df = merged_df.rename(columns={'P_value_x': 'P_value_0', 'P_value_y': 'P_value_1'})
    
    # rename the N_observed_y column to N_observed_hw and the N_observed_x column to N_observed_control
    merged_df = merged_df.rename(columns={'N_observed_y': 'N_observed_hw', 'N_observed_x': 'N_observed_control'})

    # add a column for the quotient of PRRs: PRRR = PRR_y / PRR_x
    merged_df['PRRR'] = merged_df['PRR_y'] / merged_df['PRR_x']

    # rename the PRR_x column to PRR_0 and the PRR_y column to PRR_1
    merged_df = merged_df.rename(columns={'PRR_x': 'PRR_0', 'PRR_y': 'PRR_1'})
    
    # sort the dataframe by PRRR
    merged_df = merged_df.sort_values(by=['PRRR'], ascending=False)

    # rename the Chi_squared_x column to Chi_squared_0 and the Chi_squared_y column to Chi_squared_1
    merged_df = merged_df.rename(columns={'Chi_squared_x': 'Chi_squared_0', 'Chi_squared_y': 'Chi_squared_1'})


    # round PRR_0, PRR_1, PRRR, Chi_squared_0, and Chi_squared_1, P_value_0, and P_value_1
    merged_df['PRR_0'] = merged_df['PRR_0'].round(3)
    merged_df['PRR_1'] = merged_df['PRR_1'].round(3)
    merged_df['PRRR'] = merged_df['PRRR'].round(3)
    merged_df['Chi_squared_0'] = merged_df['Chi_squared_0'].round(3)
    merged_df['Chi_squared_1'] = merged_df['Chi_squared_1'].round(3)
    #merged_df['P_value_0'] = merged_df['P_value_0'].round(5)
    #merged_df['P_value_1'] = merged_df['P_value_1'].round(5)


    # drop the column D_group_x
    merged_df = merged_df.drop('D_group_x', axis=1)
    # rename the D_group_y column to D_group
    merged_df = merged_df.rename(columns={'D_group_y': 'D_group'})

    merged_df = merged_df[['D_number', 'D_group', 'Drug', 'Reaction', 'N_observed_control', 'N_observed_hw', 'Chi_squared_0', 'Chi_squared_1', 'P_value_0', 'P_value_1', 'PRR_0', 'PRR_1', 'PRRR']]


    # write the merged dataframe to a new file called comparison.csv
    merged_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison.csv', sep='$', index=False)

compare_drug_PRRs()


In [92]:
# compare the PRRs of the two groups in dg_disproportionality and write the results to a new file called dg_comparison.csv
def compare_dg_PRRs():
    # read dg_disproportionality.csv into a pandas dataframe
    dg_disp_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_disproportionality.csv', delimiter='$')
    
    
    # create a new dataframe with only the rows from group 0 and remove the group column
    group_0 = dg_disp_df.loc[dg_disp_df['Group'] == 0].drop('Group', axis=1)
    # create a new dataframe with only the rows from group 1 and remove the group column
    group_1 = dg_disp_df.loc[dg_disp_df['Group'] == 1].drop('Group', axis=1)

    # sort the rows in each dataframe by drug group and reaction
    group_0 = group_0.sort_values(by=['KEGG_Dgroup', 'Reaction'])
    group_1 = group_1.sort_values(by=['KEGG_Dgroup', 'Reaction'])

    # merge the two dataframes on drug group and reaction for each drug reaction pair in group 1
    merged_df = pd.merge(group_0, group_1, on=['KEGG_Dgroup', 'Reaction'], how='inner')

    # rename the P_value_x column to P_value_0 and the P_value_y column to P_value_1
    merged_df = merged_df.rename(columns={'P_value_x': 'P_value_0', 'P_value_y': 'P_value_1'})
    # rename the N_observed_y column to N_observed_hw and the N_observed_x column to N_observed_control
    merged_df = merged_df.rename(columns={'N_observed_y': 'N_observed_hw', 'N_observed_x': 'N_observed_control'})

    # add a column for the quotient of PRRs: PRRR = PRR_y / PRR_x
    merged_df['PRRR'] = merged_df['PRR_y'] / merged_df['PRR_x']

    # rename the PRR_x column to PRR_0 and the PRR_y column to PRR_1
    merged_df = merged_df.rename(columns={'PRR_x': 'PRR_0', 'PRR_y': 'PRR_1'})
    # rename the Chi_squared_x column to Chi_squared_0 and the Chi_squared_y column to Chi_squared_1
    merged_df = merged_df.rename(columns={'Chi_squared_x': 'Chi_squared_0', 'Chi_squared_y': 'Chi_squared_1'})
    
    # round PRR_0, PRR_1, PRRR, Chi_squared_0, Chi_squared_1, P_value_0, and P_value_1
    merged_df['PRR_0'] = merged_df['PRR_0'].round(3)
    merged_df['PRR_1'] = merged_df['PRR_1'].round(3)
    merged_df['PRRR'] = merged_df['PRRR'].round(3)
    merged_df['Chi_squared_0'] = merged_df['Chi_squared_0'].round(3)
    merged_df['Chi_squared_1'] = merged_df['Chi_squared_1'].round(3)
    #merged_df['P_value_0'] = merged_df['P_value_0'].round(5)
    #merged_df['P_value_1'] = merged_df['P_value_1'].round(5)
    
    # sort the dataframe by PRRR
    merged_df = merged_df.sort_values(by=['PRRR'], ascending=False)

    # reorder the columns
    merged_df = merged_df[['KEGG_Dgroup', 'Reaction', 'N_observed_control', 'N_observed_hw', 'Chi_squared_0', 'Chi_squared_1', 'P_value_0', 'P_value_1', 'PRR_0', 'PRR_1', 'PRRR']]

    # write the merged dataframe to a new file called dg_comparison.csv
    merged_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison.csv', sep='$', index=False)

compare_dg_PRRs()

In [93]:
# compare the PRRs of the two groups in dp_disproportionality and write the results to a new file called dp_comparison.csv
def compare_dp_PRRs():
    # read dg_disproportionality.csv into a pandas dataframe
    dp_disp_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_disproportionality.csv', delimiter='$')

    # create a new dataframe with only the rows from group 0 and remove the group column
    group_0 = dp_disp_df.loc[dp_disp_df['Group'] == 0].drop('Group', axis=1)
    # create a new dataframe with only the rows from group 1 and remove the group column
    group_1 = dp_disp_df.loc[dp_disp_df['Group'] == 1].drop('Group', axis=1)

    # sort the rows in each dataframe by drug pathway and reaction
    group_0 = group_0.sort_values(by=['D_pathway', 'Reaction'])
    group_1 = group_1.sort_values(by=['D_pathway', 'Reaction'])

    # merge the two dataframes on drug pathway and reaction for each pair in group 1
    merged_df = pd.merge(group_0, group_1, on=['D_pathway', 'Reaction'], how='inner')

    # rename the P_value_x column to P_value_0 and the P_value_y column to P_value_1
    merged_df = merged_df.rename(columns={'P_value_x': 'P_value_0', 'P_value_y': 'P_value_1'})
    # rename the N_observed_y column to N_observed_hw and the N_observed_x column to N_observed_control
    merged_df = merged_df.rename(columns={'N_observed_y': 'N_observed_hw', 'N_observed_x': 'N_observed_control'})

    # add a column for the quotient of PRRs: PRRR = PRR_y / PRR_x
    merged_df['PRRR'] = merged_df['PRR_y'] / merged_df['PRR_x']

    # rename the PRR_x column to PRR_0 and the PRR_y column to PRR_1
    merged_df = merged_df.rename(columns={'PRR_x': 'PRR_0', 'PRR_y': 'PRR_1'})
    # rename the Chi_squared_x column to Chi_squared_0 and the Chi_squared_y column to Chi_squared_1
    merged_df = merged_df.rename(columns={'Chi_squared_x': 'Chi_squared_0', 'Chi_squared_y': 'Chi_squared_1'})

    # round PRR_0, PRR_1, PRRR, Chi_squared_0, Chi_squared_1, P_value_0, and P_value_1
    merged_df['PRR_0'] = merged_df['PRR_0'].round(3)
    merged_df['PRR_1'] = merged_df['PRR_1'].round(3)
    merged_df['PRRR'] = merged_df['PRRR'].round(3)
    merged_df['Chi_squared_0'] = merged_df['Chi_squared_0'].round(3)
    merged_df['Chi_squared_1'] = merged_df['Chi_squared_1'].round(3)
    #merged_df['P_value_0'] = merged_df['P_value_0'].round(5)
    #merged_df['P_value_1'] = merged_df['P_value_1'].round(5)

    # sort the dataframe by PRRR
    merged_df = merged_df.sort_values(by=['PRRR'], ascending=False)

    # reorder the columns
    merged_df = merged_df[['D_pathway', 'Reaction', 'N_observed_control', 'N_observed_hw', 'Chi_squared_0', 'Chi_squared_1', 'P_value_0', 'P_value_1', 'PRR_0', 'PRR_1', 'PRRR']]

    # write the merged dataframe to a new file called dp_comparison.csv
    merged_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_comparison.csv', sep='$', index=False)

compare_dp_PRRs()

In [94]:
def create_final_comparisons():
    # read drug_comparison.csv into a pandas dataframe
    drug_comparison_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison.csv', delimiter='$')
    # read dg_comparison.csv into a pandas dataframe
    dg_comparison_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison.csv', delimiter='$')
    # read dp_comparison.csv into a pandas dataframe
    dp_comparison_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_comparison.csv', delimiter='$')

    # drop rows where PRR_1 is less than 2
    drug_comparison_df = drug_comparison_df.loc[drug_comparison_df['PRR_1'] >= 2]
    dg_comparison_df = dg_comparison_df.loc[dg_comparison_df['PRR_1'] >= 2]
    dp_comparison_df = dp_comparison_df.loc[dp_comparison_df['PRR_1'] >= 2]

    # drop rows where PRRR is less than 2
    drug_comparison_df = drug_comparison_df.loc[drug_comparison_df['PRRR'] >= 2]
    dg_comparison_df = dg_comparison_df.loc[dg_comparison_df['PRRR'] >= 2]
    dp_comparison_df = dp_comparison_df.loc[dp_comparison_df['PRRR'] >= 2]

    # drop rows where Chi_squared_1 is less than 4
    drug_comparison_df = drug_comparison_df.loc[drug_comparison_df['Chi_squared_1'] >= 4]
    dg_comparison_df = dg_comparison_df.loc[dg_comparison_df['Chi_squared_1'] >= 4]
    dp_comparison_df = dp_comparison_df.loc[dp_comparison_df['Chi_squared_1'] >= 4]

    # replace all commas in all columns with semicolons
    drug_comparison_df = drug_comparison_df.replace(',', ';', regex=True)
    dg_comparison_df = dg_comparison_df.replace(',', ';', regex=True)
    dp_comparison_df = dp_comparison_df.replace(',', ';', regex=True)

    # drop the columns N_observed_control
    drug_comparison_df = drug_comparison_df.drop(columns=['N_observed_control'])
    dg_comparison_df = dg_comparison_df.drop(columns=['N_observed_control'])
    dp_comparison_df = dp_comparison_df.drop(columns=['N_observed_control'])
    
    # write the dataframes to new files
    drug_comparison_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison_formatted.csv', sep='$', index=False)
    dg_comparison_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison_formatted.csv', sep='$', index=False)
    dp_comparison_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_comparison_formatted.csv', sep='$', index=False)

create_final_comparisons()

In [95]:
# Function to get the DG_name from KEGG API
def get_dg_name(dg_id):
    url = f"https://rest.kegg.jp/get/dg:{dg_id}"
    response = requests.get(url)

    if response.status_code == 200:
        for line in response.text.split('\n'):
            if line.startswith("NAME"):
                return line.split("NAME")[1].strip()
    return ""

def add_names():
    # Input and output file names
    input_file = "/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison_formatted.csv"
    output_file = "/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison_final.csv"

    # Dictionary to store DGroup-DG_name pairs
    dg_name_dict = {}

    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        csv_reader = csv.reader(infile, delimiter='$')
        csv_writer = csv.writer(outfile, delimiter='$')
        
        # Add the new column header (assuming there's a header row)
        header = next(csv_reader)
        header.append("DG_name")
        csv_writer.writerow(header)
        
        rowcount = 0
        
        for row in csv_reader:
            
            if rowcount % 100 == 0:
                print(f"Processed {rowcount} rows")
            rowcount += 1

            dg_id = row[0].split(";")[0].replace("(", "").replace("'", "")
            
            if dg_id not in dg_name_dict:
                dg_name = get_dg_name(dg_id)
                # print(f"DG_name for {dg_id} is {dg_name}")
                dg_name_dict[dg_id] = dg_name
                # replace commas with semicolons
                dg_name = dg_name.replace(",", ";")
            else:
                dg_name = dg_name_dict[dg_id]
                # print("Found in cache")
            
            row.append(dg_name)
            csv_writer.writerow(row)

add_names()


Processed 0 rows
Processed 100 rows
Processed 200 rows
Processed 300 rows
Processed 400 rows
Processed 500 rows
Processed 600 rows
Processed 700 rows
Processed 800 rows
Processed 900 rows
Processed 1000 rows
Processed 1100 rows
Processed 1200 rows
Processed 1300 rows
Processed 1400 rows
Processed 1500 rows
Processed 1600 rows
Processed 1700 rows
Processed 1800 rows
Processed 1900 rows
Processed 2000 rows
Processed 2100 rows
Processed 2200 rows
Processed 2300 rows


In [96]:
# read dg_comparison_final.csv into a dataframe
dg_comparison_final_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison_final.csv', sep='$', header=0)
# columns: KEGG_Dgroup$Reaction$N_observed_hw$Chi_squared_0$Chi_squared_1$P_value_0$P_value_1$PRR_0$PRR_1$PRRR$DG_name
# reorder columns: KEGG_Dgroup,DG_name,Reaction,N_observed_hw,Chi_squared_0,Chi_squared_1,P_value_0,P_value_1,PRR_0,PRR_1,PRRR
dg_comparison_final_df = dg_comparison_final_df[['KEGG_Dgroup', 'DG_name', 'Reaction', 'N_observed_hw', 'Chi_squared_0', 'Chi_squared_1', 'P_value_0', 'P_value_1', 'PRR_0', 'PRR_1', 'PRRR']]
# write the dataframe to a csv file, sep = '$'
dg_comparison_final_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison_final.csv', sep='$', index=False)


In [97]:
def get_pathway_name(pathway_id):
    url = f"https://rest.kegg.jp/get/pathway:{pathway_id}"
    response = requests.get(url)

    if response.status_code == 200:
        for line in response.text.split('\n'):
            if line.startswith("NAME"):
                pathway_name = line.split("NAME")[1].strip()
                return pathway_name
    return ""

def add_pw_names():

    # Input and output file names
    input_file = "/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_comparison_formatted.csv"
    output_file = "/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_comparison_final.csv"

    # Dictionary to store D_pathway-pathway_name pairs
    pathway_name_dict = {}

    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        csv_reader = csv.reader(infile, delimiter='$')
        csv_writer = csv.writer(outfile, delimiter='$')
        
        # Add the new column header (assuming there's a header row)
        header = next(csv_reader)
        header.append("pathway_name")
        csv_writer.writerow(header)
        
        rowcount = 0
        for row in csv_reader:
            if rowcount % 100 == 0:
                print(f"Processed {rowcount} rows")
            rowcount += 1
            
            pathway_id = row[0]
            
            if pathway_id not in pathway_name_dict:
                pathway_name = get_pathway_name(pathway_id)
                pathway_name_dict[pathway_id] = pathway_name
                # replace commas with semicolons
                pathway_name = pathway_name.replace(",", ";")
            else:
                pathway_name = pathway_name_dict[pathway_id]
                # print("Found in cache")
            
            row.append(pathway_name)
            csv_writer.writerow(row)

add_pw_names()


Processed 0 rows
Processed 100 rows
Processed 200 rows
Processed 300 rows
Processed 400 rows
Processed 500 rows
Processed 600 rows
Processed 700 rows
Processed 800 rows


In [98]:
# read dp_comparison_final_with_names.csv into a dataframe
dp_comparison_final_with_names_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_comparison_final.csv', sep='$', header=0)
# columns: D_pathway,Reaction,Chi_squared_0,Chi_squared_1,P_value_0,P_value_1,PRR_0,PRR_1,PRRR,pathway_name
# reorder columns: D_pathway,pathway_name,Reaction,Chi_squared_0,Chi_squared_1,P_value_0,P_value_1,PRR_0,PRR_1,PRRR
dp_comparison_final_with_names_df = dp_comparison_final_with_names_df[['D_pathway', 'pathway_name', 'Reaction', 'Chi_squared_0', 'Chi_squared_1', 'P_value_0', 'P_value_1', 'PRR_0', 'PRR_1', 'PRRR']]
# write the dataframe to a csv file, sep = '$'
dp_comparison_final_with_names_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dp_comparison_final.csv', sep='$', index=False)

In [99]:
# read dg_comparison_final.csv into a dataframe
dg_comparison_final_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison_final.csv', sep='$', header=0)

dg_comparison_final_df['DG_level'] = dg_comparison_final_df['KEGG_Dgroup'].apply(lambda x: x.split(';')[1].strip())
dg_comparison_final_df['DG_level'] = dg_comparison_final_df['DG_level'].apply(lambda x: x[:-1] if x.endswith(')') else x)

dg_comparison_final_df['KEGG_Dgroup'] = dg_comparison_final_df['KEGG_Dgroup'].apply(lambda x: x.split(';')[0].strip("'()"))

dg_comparison_final_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/dg_comparison_final_1.csv', sep='$', index=False)



In [100]:
# Read the CSV file
drug_comparison_final_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison_formatted.csv', sep='$', header=0)

# replace all instances of the string ");" with the string ")," in the column 'D_group'
drug_comparison_final_df['D_group'] = drug_comparison_final_df['D_group'].str.replace(r"\);", "),")
drug_comparison_final_df['D_group'] = drug_comparison_final_df['D_group'].str.replace(";", ",")

drug_comparison_final_df_copy = drug_comparison_final_df.copy()

# drop columns 'D_group'$Chi_squared_0$Chi_squared_1$P_value_0$P_value_1$PRR_0$PRR_1
drug_comparison_final_df_copy = drug_comparison_final_df_copy.drop(columns=['D_group', 'Chi_squared_0', 'Chi_squared_1', 'P_value_0', 'P_value_1', 'PRR_0', 'PRR_1'])
drug_comparison_final_df_copy.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison_final_0.csv', sep='$', index=False)


The default value of regex will change from True to False in a future version.



In [101]:
with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/brite_chem.json') as f:
    brite_chem = json.load(f)
with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/brite.json') as f:
    brite = json.load(f)
with open('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/brite_class.json') as f:
    brite_class = json.load(f)


brite_dict_list = [brite_chem, brite, brite_class]


In [108]:
dc_df = pd.read_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison_final_0.csv', sep='$', header=0, index_col=None)

def find_paths(brite_dict, d_number):
    paths = []

    def traverse(node, current_path):
        if 'name' in node and d_number in node['name']:
            paths.append(current_path + [node['name']])
        elif 'children' in node:
            for child in node['children']:
                traverse(child, current_path + [node['name']])

    traverse(brite_dict, [])
    return paths


# capitalize the first letter in the column 'Drug' in dc_df
dc_df['Drug'] = dc_df['Drug'].str.capitalize()
# create a column 'group_paths' in dc_df
dc_df['group_paths'] = ''


# iterate through each row in dc_df
for index, row in dc_df.iterrows():

    drug_paths_brite = find_paths(brite, row['D_number'])
    drug_paths_brite_class = find_paths(brite_class, row['D_number'])
    drg = row['Drug']


    if len(drug_paths_brite) > 0:
        for sub_list in drug_paths_brite:
            # remove the first element in the list
            sub_list.pop(0)
            # remove the last element in the list
            sub_list.pop(-1)
            last_element = sub_list[-1]
            # if drg is in last_element, remove last element
            if drg in last_element:
                sub_list.pop(-1)


    if len(drug_paths_brite_class) > 0:
        for sub_list in drug_paths_brite_class:
            # remove the first element in the list
            sub_list.pop(0)
            # remove the last element in the list
            sub_list.pop(-1)
            last_element = sub_list[-1]
            # if drg is in last_element, remove last element
            if drg in last_element:
                sub_list.pop(-1)


    # combine the two lists
    drug_paths = drug_paths_brite + drug_paths_brite_class

    for list in drug_paths:
        while len(list) > 1:
            # remove the second element in the list
            list.pop(1)
    
    ind = 0
    if True:    
        for list in drug_paths:
            # if list is the same as a previous list, remove it
            while drug_paths.count(list) > 1:
                drug_paths.pop(ind)
            ind += 1

    # set the value of the column 'group_paths' to the list drug_paths
    dc_df.at[index, 'group_paths'] = drug_paths

dc_df = dc_df[dc_df.astype(str)['group_paths'] != '[]']
dc_df1 = dc_df.copy()

print(dc_df.head())


dc_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison_final_1.csv', sep='$', index=False)


  D_number         Drug              Reaction  N_observed_hw    PRRR  \
0   D07704   Citalopram        Rhabdomyolysis              7  79.281   
1   D08410  Pravastatin         Hyponatraemia             19  47.598   
3   D03257  Trastuzumab          Hypokalaemia              7  40.980   
4   D00235     Atenolol  Condition aggravated              8  39.510   
5   D00354  Lamotrigine            Aggression              4  34.204   

                                         group_paths  
0  [[Neuropsychiatric agent], [Metabolizing enzym...  
1   [[Hypolipidemic agent], [Transporter substrate]]  
3                                 [[Antineoplastic]]  
4                           [[Cardiovascular agent]]  
5  [[Metabolizing enzyme substrate], [Neuropsychi...  


In [119]:
dc_df = dc_df1.copy()

dc_df['group_paths'] = dc_df['group_paths'].astype(str)

# drop rows with irrelevant reactions
dc_df = dc_df[dc_df['Reaction'] != 'Condition aggravated']
dc_df = dc_df[dc_df['Reaction'] != 'Drug abuse']
dc_df = dc_df[dc_df['Reaction'] != 'Accidental overdose']
dc_df = dc_df[dc_df['Reaction'] != 'Drug interaction']

dc_df['drug_count'] = dc_df.groupby(['Drug'])['Drug'].transform('count')
dc_df['reaction_count'] = dc_df.groupby(['Reaction'])['Reaction'].transform('count')
dc_df['group_paths_count'] = dc_df.groupby(['group_paths'])['group_paths'].transform('count')


# drop rows where the column 'reaction_count' is less than 5
# dc_df = dc_df[dc_df['reaction_count'] >= 5]

# drop rows where the column 'group_paths_count' is less than 5
#dc_df = dc_df[dc_df['group_paths_count'] >= 5]

# drop rows where the column 'drug_count' is less than 5
# dc_df = dc_df[dc_df['drug_count'] >= 5]

# drop duplicate rows
dc_df['group_paths'] = dc_df['group_paths'].apply(lambda x: ast.literal_eval(x))

# drop rows with PRRR < 200
dc_df = dc_df[dc_df['PRRR'] >= 20]
# drop rows with N_observed_hw < 5
dc_df = dc_df[dc_df['N_observed_hw'] >= 5]

# order by N_observed_hw descending
dc_df = dc_df.sort_values(by=['N_observed_hw'], ascending=False)


print(len(dc_df))

dc_df.to_csv('/Users/loaner/Documents/GitHub/Symbolic-Methods-FAERS-Project/text_files/drug_comparison_final_2.csv', sep='$', index=False)



18


In [120]:
import plotly.graph_objects as go

# Extract the drug IDs, drug names, reactions, group_paths, and number_observed values
drug_ids = dc_df["D_number"].tolist()
drug_names = dc_df["Drug"].tolist()
reactions = dc_df["Reaction"].tolist()
group_paths = dc_df["group_paths"].tolist()
number_observed = dc_df["N_observed_hw"].tolist()

# Process the group_paths and reactions for the Sankey diagram
labels = []
source = []
target = []
value = []

# Iterate through the drug IDs, drug names, reactions, and number_observed values
for drug_id, drug_name, reaction, group_path, observed in zip(drug_ids, drug_names, reactions, group_paths, number_observed):
    # Convert string representation of list to an actual list
    # group_path = eval(group_path)

    for path in group_path:
        # Create a new list for each drug group, drug, and reaction
        node_path = path + [drug_name, reaction]

        # Iterate through the node_path
        for i in range(len(node_path) - 1):
            # Add the nodes to the labels list
            if node_path[i] not in labels:
                labels.append(node_path[i])
            # Add the nodes to the labels list
            if node_path[i + 1] not in labels:
                labels.append(node_path[i + 1])

            source_idx = labels.index(node_path[i])
            target_idx = labels.index(node_path[i + 1])

            source.append(source_idx)
            target.append(target_idx)
            value.append(observed)

# Update the labels with node sizes
for idx, label in enumerate(labels):
    mysum = 0
    for i, source_idx in enumerate(source):
        if source_idx == idx:
            mysum += value[i]
    node_value = mysum
    
    if node_value == 0:
        # set the node value to the sum of values for the first row with that label as the reaction
        node_value = sum(value[j] for j, target_idx in enumerate(target) if labels[target_idx] == label)

    if node_value > 0:
        node_value = int(node_value)

    labels[idx] = f"{label}, {node_value}"
    labels[idx] = f"{label}, {node_value}"


# Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5), label=labels),
    link=dict(source=source, target=target, value=value)
)])
# bold font for the labels
fig.update_layout(title_text="Sankey Diagram of Drug Groups, Drugs, and Reactions \n PRRR > 20, N>5", width=1600, height=1200, font_size=16, font_family="Arial", title_font_size=24)
fig.show()
