# Imports & Datasets

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Relative location of the raw data files. Forward slashes are used on purpose:
# they resolve on Windows, macOS and Linux alike, whereas a backslash is only
# treated as a path separator on Windows.
DATA_DIR = '../data'


def _readSnpFrequency(fileName):
    """Load one SNP allele-frequency TSV from DATA_DIR (see Note 2).

    The file is read as tab-separated text, its first 12 lines are skipped,
    the next line supplies the column names, and the '#Study' column is
    dropped from the result.
    """
    return pd.read_csv(f'{DATA_DIR}/{fileName}', sep='\t',
                       skiprows=12, header=0).drop(columns=['#Study'])


## Note 1.
hiefData = pd.read_csv(f'{DATA_DIR}/hiefData.csv', comment='#')
usVeteransFKBP5 = pd.read_csv(f'{DATA_DIR}/usVeteransFKBP5.csv', comment='#')
ptsdWorldMentalHealthSurvey = \
    pd.read_csv(f'{DATA_DIR}/ptsdWorldMentalHealthSurvey.csv', comment='#')
healthCarePriceIndex = \
    pd.read_csv(f'{DATA_DIR}/healthCarePriceIndex.csv', comment='#')
## Note 2.
rs1360780Data = _readSnpFrequency('rs1360780_frequency.tsv')
rs3800373Data = _readSnpFrequency('rs3800373_frequency.tsv')
rs9296158Data = _readSnpFrequency('rs9296158_frequency.tsv')
rs9470080Data = _readSnpFrequency('rs9470080_frequency.tsv')
## Note 3.
# Missing survey answers are replaced with 0 immediately after loading.
WorldValuesSurveyData = \
    pd.read_csv(f'{DATA_DIR}/WVS_Cross-National_Wave_7_inverted_csv_v6_0.csv',
                low_memory=False).fillna(0)

# Applied Analysis

In [3]:
# Lookup used to normalise country names: each ISO code (the key) lists every
# country/region spelling that should be collapsed onto that code.
countryScope = dict(
    COL=["Colombia", "Colombia (Medellin)"],
    IRQ=["Iraq"],
    PER=["Peru"],
    CHN=["PRC China", "China"],
    UKR=["Ukraine"],
    BRA=["Brazil"],
    BGR=["Bulgaria"],
    MEX=["Mexico"],
    ROU=["Romania"],
    ZAF=["South Africa"],
    AUS=["Australia"],
    BEL=["Belgium"],
    DEU=["Germany", "German Federal Republic"],
    ISR=["Israel"],
    ITA=["Italy"],
    JPN=["Japan"],
    NZL=["New Zealand"],
    GBR=["United Kingdom", "Northern Ireland"],
    PRT=["Portugal"],
    ESP=["Spain", "Spain (Murcia)"],
    NLD=["The Netherlands", "Netherlands"],
    USA=["The USA", "USA", "United States of America"],
)

# The helper below standardizes the country and region names in a DataFrame column to the target country's ISO code.
# It uses the countryScope dictionary as its lookup, so differently spelled names are normalized to one key before merges.

In [4]:
import pandas as pd
class Countryparser:
    """Helpers for normalising country names to ISO codes."""

    @staticmethod
    def standardizeCountryName(data_Frame, country_Col, countryScope):
        """Replace recognised country names in one column with their ISO code.

        Parameters
        ----------
        data_Frame : pandas.DataFrame
            Frame holding the column to normalise. It is modified in place.
        country_Col : str
            Name of the column that contains country names.
        countryScope : dict
            Mapping of ISO code -> list of country-name spellings that should
            be converted to that code.

        Returns
        -------
        pandas.DataFrame
            The same object that was passed in, with ``country_Col``
            overwritten. Names that do not appear in ``countryScope`` are
            left unchanged.
        """
        # Invert the scope so that every known spelling points at its ISO code.
        standardizedCountries = {
            name: iso_Code
            for iso_Code, names in countryScope.items()
            for name in names
        }
        # .map() yields NaN for names missing from the lookup; .fillna() then
        # restores the original value so unknown names pass through untouched.
        data_Frame[country_Col] = (
            data_Frame[country_Col]
            .map(standardizedCountries)
            .fillna(data_Frame[country_Col])
        )
        return data_Frame


class Bioparser:
    """Helpers for summarising allele-frequency tables per population."""

    @staticmethod
    def splitAlleleToDict(data_Frame, ref_allele_col='Ref Allele', alt_allele_col='Alt Allele', population_col='Population'):
        """Summarise allele and expected genotype frequencies per population.

        Each cell of the two allele columns is expected to look like
        ``'<letter>=<frequency>'``. The numeric part is extracted, averaged per
        population, and the expected genotype frequencies are derived from the
        averaged values using the Hardy-Weinberg terms p^2, q^2 and 2pq.

        Parameters
        ----------
        data_Frame : pandas.DataFrame
            Input table. It is only read; no columns are added to or removed
            from the caller's frame.
        ref_allele_col, alt_allele_col : str
            Names of the columns holding the reference / alternate allele cells.
        population_col : str
            Name of the column to group by.

        Returns
        -------
        dict
            ``{population: {'Ref Allele': p, 'Alt Allele': q,
            'Expected Ref/Ref': p**2, 'Expected Alt/Alt': q**2,
            'Expected Ref/Alt': 2*p*q, 'HWE': p**2 + q**2 + 2*p*q}}``.
            Populations whose mean reference or alternate frequency is missing
            are omitted.
        """
        def alleleFrequency(allele):
            """Return the number after '=' in a cell, or NaN when it cannot be parsed."""
            # Blank fields arrive as NaN (a float), which has no string methods.
            if not isinstance(allele, str):
                return float('nan')
            _, separator, value = allele.partition('=')
            if not separator:
                return float('nan')
            try:
                return float(value)
            except ValueError:
                return float('nan')

        # Build a separate working table so the caller's DataFrame is untouched.
        frequencies = pd.DataFrame({
            population_col: data_Frame[population_col].to_numpy(),
            'Ref Allele Value': pd.Series(
                [alleleFrequency(cell) for cell in data_Frame[ref_allele_col]],
                dtype='float64'),
            'Alt Allele Value': pd.Series(
                [alleleFrequency(cell) for cell in data_Frame[alt_allele_col]],
                dtype='float64'),
        })

        # Mean reference / alternate frequency per population (NaN cells are skipped).
        allele_grouped = (
            frequencies
            .groupby(population_col)[['Ref Allele Value', 'Alt Allele Value']]
            .mean()
        )

        populationAlleleFrequency = {}
        for population, p, q in allele_grouped.itertuples(name=None):
            # A population with no usable value in either column is left out.
            if pd.isna(p) or pd.isna(q):
                continue
            p, q = float(p), float(q)

            ref_ref = p ** 2
            alt_alt = q ** 2
            ref_alt = 2 * p * q

            populationAlleleFrequency[population] = {
                'Ref Allele': p,
                'Alt Allele': q,
                'Expected Ref/Ref': ref_ref,
                'Expected Alt/Alt': alt_alt,
                'Expected Ref/Alt': ref_alt,
                # Sum of the three genotype terms, i.e. (p + q)^2; it equals 1
                # only when the two allele frequencies themselves sum to 1.
                'HWE': ref_ref + alt_alt + ref_alt
            }
        return populationAlleleFrequency


# Demonstration: build the per-population frequency dictionary for rs1360780 and print it
result = Bioparser.splitAlleleToDict(data_Frame=rs1360780Data)
print(result)

{'ACPOP': {'Ref Allele': 0.283, 'Alt Allele': 0.717, 'Expected Ref/Ref': 0.08008899999999998, 'Expected Alt/Alt': 0.5140889999999999, 'Expected Ref/Alt': 0.40582199999999996, 'HWE': 0.9999999999999998}, 'Africa': {'Ref Allele': 0.434, 'Alt Allele': 0.566, 'Expected Ref/Ref': 0.188356, 'Expected Alt/Alt': 0.3203559999999999, 'Expected Ref/Alt': 0.49128799999999995, 'HWE': 0.9999999999999999}, 'African': {'Ref Allele': 0.41395400000000004, 'Alt Allele': 0.5851824999999999, 'Expected Ref/Ref': 0.17135791411600004, 'Expected Alt/Alt': 0.3424385583062499, 'Expected Ref/Alt': 0.48447727321, 'HWE': 0.9982737456322499}, 'AfricanAmerican': {'Ref Allele': 0.41254, 'Alt Allele': 0.58746, 'Expected Ref/Ref': 0.1701892516, 'Expected Alt/Alt': 0.3451092516, 'Expected Ref/Alt': 0.4847014968, 'HWE': 1.0}, 'America': {'Ref Allele': 0.204, 'Alt Allele': 0.796, 'Expected Ref/Ref': 0.04161599999999999, 'Expected Alt/Alt': 0.6336160000000001, 'Expected Ref/Alt': 0.324768, 'HWE': 1.0}, 'American': {'Ref All

# Applied Logistic Regression Model

# Reference Notes
1. The `comment='#'` argument marks `#` as the comment character, so `read_csv` ignores the comment text embedded in each CSV.
2. TSV files can be read with the CSV reader, but the call must specify that columns are separated by tabs rather than commas via the `sep='\t'` argument. We also skip the first 12 rows of each file (`skiprows=12`) because of the files' unusual structure, and set `header=0` so that the first line remaining after the skipped rows is used as the column names.
3. The `low_memory` argument is set to `False` to resolve the mixed-type issue that occurs when the file is processed in chunks rather than as a whole.