## Create DB and analyze

In [6]:
import sqlite3
import csv
import math
from Bio import AlignIO
from collections import Counter
from collections import defaultdict

In [5]:
# CONNECT to the database
# The path is relative to the current working directory. ``conn`` and ``cursor``
# are module-level objects; the functions defined below read them as globals.
conn = sqlite3.connect('compar_gen_data.db')
cursor = conn.cursor()

In [7]:
# all functions

# Create tables
def create_tables():
    """Create every table of the schema that does not exist yet, then commit.

    Runs one ``CREATE TABLE IF NOT EXISTS`` statement per table (Gene, Species,
    Mutation, DMS, MSA, IntegratedData, SummaryStatistics) on a fresh cursor of
    the module-level ``conn``. Tables that already exist are left untouched.
    """
    cursor = conn.cursor()

    # Create Gene table
    # (no ';' inside the column list: it terminates the statement early and
    # makes SQLite reject the DDL with a syntax error)
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS Gene (
            gene_id INTEGER PRIMARY KEY AUTOINCREMENT,
            gene_name TEXT UNIQUE,
            target_seq TEXT
        )
    ''')

    # Create Species table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS Species (
            species_id INTEGER PRIMARY KEY AUTOINCREMENT,
            species_name TEXT UNIQUE
        )
    ''')

    # Create Mutation table
    # Mutation.edit_distance is the number of nucleotide substitutions needed in
    # a codon to move from the ancestral amino acid to the variant amino acid.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS Mutation (
            mutation_id INTEGER PRIMARY KEY AUTOINCREMENT,
            gene_id INTEGER,
            species_id INTEGER,
            position INTEGER,
            ancestral_residue TEXT,
            variant_residue TEXT,
            edit_distance INTEGER,
            FOREIGN KEY (gene_id) REFERENCES Gene(gene_id),
            FOREIGN KEY (species_id) REFERENCES Species(species_id)
        )
    ''')

    # Create DMS table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS DMS (
            dms_id INTEGER PRIMARY KEY AUTOINCREMENT,
            mutation_id INTEGER,
            score REAL,
            FOREIGN KEY (mutation_id) REFERENCES Mutation(mutation_id)
        )
    ''')

    # Create MSA table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS MSA (
            msa_id INTEGER PRIMARY KEY AUTOINCREMENT,
            mutation_id INTEGER,
            shannon_entropy REAL,
            jsd REAL,
            phylop REAL,
            phastcons REAL,
            gerp REAL,
            percentage_identity REAL,
            ci REAL,
            variant_percentage_residue REAL,
            FOREIGN KEY (mutation_id) REFERENCES Mutation(mutation_id)
        )
    ''')

    # Create IntegratedData table
    # NOTE(review): every foreign key below except mutation_id references a
    # parent column that is neither PRIMARY KEY nor UNIQUE. SQLite accepts the
    # declaration, but per its foreign-key documentation writes would fail with
    # "foreign key mismatch" if enforcement (PRAGMA foreign_keys=ON) were ever
    # enabled -- confirm whether these constraints are wanted at all.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS IntegratedData (
            integrated_data_id INTEGER PRIMARY KEY AUTOINCREMENT,
            mutation_id INTEGER,
            dms_score REAL,
            shannon_entropy REAL,
            jsd REAL,
            phylop REAL,
            phastcons REAL,
            gerp REAL,
            percentage_identity REAL,
            ci REAL,
            variant_percentage_residue REAL,
            FOREIGN KEY (mutation_id) REFERENCES Mutation(mutation_id),
            FOREIGN KEY (dms_score) REFERENCES DMS(score),
            FOREIGN KEY (shannon_entropy) REFERENCES MSA(shannon_entropy),
            FOREIGN KEY (jsd) REFERENCES MSA(jsd),
            FOREIGN KEY (phylop) REFERENCES MSA(phylop),
            FOREIGN KEY (phastcons) REFERENCES MSA(phastcons),
            FOREIGN KEY (gerp) REFERENCES MSA(gerp),
            FOREIGN KEY (percentage_identity) REFERENCES MSA(percentage_identity),
            FOREIGN KEY (ci) REFERENCES MSA(ci),
            FOREIGN KEY (variant_percentage_residue) REFERENCES MSA(variant_percentage_residue)
        )
    ''')

    # Create SummaryStatistics table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS SummaryStatistics (
            summary_id INTEGER PRIMARY KEY AUTOINCREMENT,
            gene_id INTEGER,
            mean_conservation_score REAL,
            mean_discordance REAL,
            shannon_entropy REAL,
            FOREIGN KEY (gene_id) REFERENCES Gene(gene_id)
        )
    ''')
    conn.commit()

# Insert genes into the Gene table
def insert_genes(gene):
    """Insert gene names into the Gene table and commit.

    Names that already exist are skipped (``INSERT OR IGNORE`` on the UNIQUE
    ``gene_name`` column).

    Args:
        gene: An iterable of gene-name strings. A single bare string is
            treated as one gene name instead of being iterated character by
            character.
    """
    names = [gene] if isinstance(gene, str) else gene
    cursor.executemany('''
        INSERT OR IGNORE INTO Gene (gene_name)
        VALUES (?)
    ''', ((name,) for name in names))
    conn.commit()

# Parse Clustal W file and extract mutation data
def parse_clustalw(gene, file_path):
    """Parse a Clustal alignment into per-column residue rows for one gene.

    The first record of the alignment is used as the ancestral sequence: every
    record (the first one included) is zipped against it column by column.

    The gene is looked up in the Gene table and inserted when missing. That
    insert is not committed here; it becomes permanent with the next
    ``conn.commit()``.

    Args:
        gene: Gene name the alignment belongs to.
        file_path: Path of the alignment file, read with
            ``AlignIO.read(file_path, "clustal")``.

    Returns:
        A list of ``(gene_id, species_name, position, ancestral_residue,
        residue)`` tuples, where ``position`` is the 1-based alignment column.
    """
    alignment = AlignIO.read(file_path, "clustal")

    # The gene is the same for every record, so resolve its id once.
    cursor.execute('SELECT gene_id FROM Gene WHERE gene_name = ?', (gene,))
    result = cursor.fetchone()
    if result:
        gene_id = result[0]
    else:
        # Insert gene name into Gene table if it doesn't exist
        cursor.execute('INSERT INTO Gene (gene_name) VALUES (?)', (gene,))
        gene_id = cursor.lastrowid

    alignment_data = []
    for record in alignment:
        # The species label is the part of the record id before the first '/';
        # an id without '/' is used as-is instead of raising on unpacking.
        species_name = record.id.split('/', 1)[0]
        for position, (ancestral_residue, residue) in enumerate(zip(alignment[0].seq, record.seq), start=1):
            alignment_data.append((gene_id, species_name, position, ancestral_residue, residue))
    return alignment_data

# Insert mutation data into the Mutation table
def insert_alignment_data(alignment_data):
    """Write parsed alignment rows to the Species, Mutation and MSA tables.

    For each ``(gene_id, species_name, position, ancestral_residue, residue)``
    tuple the species is looked up (and inserted when unknown), one Mutation
    row is added, and one MSA row with all score columns left NULL is attached
    to it. Everything is committed once at the end.
    """
    # Placeholder for the seven MSA score columns; replace with real data when available.
    empty_scores = (None,) * 7

    for gene_id, species_name, position, ancestral_residue, residue in alignment_data:
        # Resolve the species id, creating the species on first sight.
        cursor.execute('SELECT species_id FROM Species WHERE species_name = ?', (species_name,))
        known_species = cursor.fetchone()
        if not known_species:
            cursor.execute('INSERT INTO Species (species_name) VALUES (?)', (species_name,))
            species_id = cursor.lastrowid
        else:
            species_id = known_species[0]

        # One Mutation row per alignment column and record.
        cursor.execute('''
            INSERT INTO Mutation (gene_id, species_id, position, ancestral_residue, variant_residue)
            VALUES (?, ?, ?, ?, ?)
        ''', (gene_id, species_id, position, ancestral_residue, residue))
        mutation_id = cursor.lastrowid

        # Matching MSA row, scores to be filled in later.
        cursor.execute('''
            INSERT INTO MSA (mutation_id, shannon_entropy, jsd, phylop, phastcons, gerp, percentage_identity, ci)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (mutation_id,) + empty_scores)
    conn.commit()

# Function to insert DMS data into the Mutation and DMS tables
def insert_dms_data(gene, species, file_path, col_position, col_ancestral, col_variant, col_score):
    """Load DMS scores from a CSV file into the Mutation and DMS tables.

    Each CSV row is matched to an existing Mutation row of the same gene with
    the same position, ancestral residue and variant residue. The match is
    restricted to ``gene`` so that scores of one gene can never attach to a
    mutation of another gene stored in the same database; it is not
    restricted to ``species``. When no match exists, a new Mutation row is
    created for ``gene`` and ``species``. One DMS row is then written per CSV
    row and everything is committed at the end.

    The gene and the species are created when they are not in the database yet.

    Args:
        gene: Gene name the scores belong to.
        species: Species name used for Mutation rows created here.
        file_path: Path of the CSV file (with a header row).
        col_position: CSV column holding the integer position.
        col_ancestral: CSV column holding the ancestral residue.
        col_variant: CSV column holding the variant residue.
        col_score: CSV column holding the score; rows whose score cannot be
            parsed as a float are skipped.

    Raises:
        ValueError: If a position value cannot be parsed as an integer.
    """
    # gene and species are constant for the whole file: resolve their ids once
    # instead of once per CSV row.
    cursor.execute('SELECT gene_id FROM Gene WHERE gene_name = ?', (gene,))
    gene_result = cursor.fetchone()
    if gene_result:
        gene_id = gene_result[0]
    else:
        cursor.execute('INSERT INTO Gene (gene_name) VALUES (?)', (gene,))
        gene_id = cursor.lastrowid

    cursor.execute('SELECT species_id FROM Species WHERE species_name = ?', (species,))
    species_result = cursor.fetchone()
    if species_result:
        species_id = species_result[0]
    else:
        cursor.execute('INSERT INTO Species (species_name) VALUES (?)', (species,))
        species_id = cursor.lastrowid

    dms_data = []
    # Open CSV file containing DMS data
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            position = int(row[col_position])
            ancestral_residue = row[col_ancestral]
            variant_residue = row[col_variant]
            try:
                score = float(row[col_score])
            except ValueError:
                continue  # skip invalid scores

            # Reuse the mutation if this gene already has it.
            cursor.execute('''
                SELECT mutation_id FROM Mutation
                WHERE gene_id = ? AND position = ? AND ancestral_residue = ? AND variant_residue = ?
            ''', (gene_id, position, ancestral_residue, variant_residue))
            result = cursor.fetchone()
            if result:
                mutation_id = result[0]
            else:
                # Insert the mutation into the Mutation table if it doesn't exist
                cursor.execute('''
                    INSERT INTO Mutation (gene_id, species_id, position, ancestral_residue, variant_residue)
                    VALUES (?, ?, ?, ?, ?)
                ''', (gene_id, species_id, position, ancestral_residue, variant_residue))
                mutation_id = cursor.lastrowid
            dms_data.append((mutation_id, score))

    # Insert DMS data into the DMS table
    cursor.executemany('''
        INSERT INTO DMS (mutation_id, score)
        VALUES (?, ?)
    ''', dms_data)
    conn.commit()

# Calculate the Shannon entropy of a sequence
def calculate_shannon_entropy(sequence):
    """Return the Shannon entropy (in bits) of the symbols in ``sequence``.

    Each distinct symbol contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in the sequence.
    """
    length = len(sequence)
    frequencies = [occurrences / length for occurrences in Counter(sequence).values()]
    return -sum(p * math.log2(p) for p in frequencies)

# Calculate Percentage Identity
def calculate_percentage_identity(residues, total_sequences):
    """Return the count of the most common residue divided by ``total_sequences``.

    The result is a plain ratio; it is not multiplied by 100.
    """
    top_entry = Counter(residues).most_common(1)
    return top_entry[0][1] / total_sequences

# Calculate variant_percentage_residue - % of each variant amino acid within the same position in MSA
def calculate_variant_percentage_residue(gene_id, position, variant_residue):
    """Return how often ``variant_residue`` occurs at one position of a gene.

    The value is the number of Mutation rows with this gene, position and
    variant residue, divided by the number of distinct species that have any
    Mutation row at that gene position. Returns 0 when no species has a row
    there.
    """
    variant_rows = cursor.execute('''
        SELECT COUNT(*)
        FROM Mutation
        WHERE gene_id = ? AND position = ? AND variant_residue = ?
    ''', (gene_id, position, variant_residue)).fetchone()[0]

    species_at_position = cursor.execute('''
        SELECT COUNT(DISTINCT species_id)
        FROM Mutation
        WHERE gene_id = ? AND position = ?
    ''', (gene_id, position)).fetchone()[0]

    return variant_rows / species_at_position if species_at_position else 0

# Fill variant_percentage_residue in MSA table
def fill_variant_percentage_residue_in_msa():
    """Compute and store ``variant_percentage_residue`` for every MSA row.

    Reads every Mutation row that has an MSA row, computes the value with
    ``calculate_variant_percentage_residue`` and writes it back to the MSA
    rows of that mutation, then commits.
    """
    cursor.execute('''
        SELECT Mutation.mutation_id, Mutation.gene_id, Mutation.position, Mutation.variant_residue
        FROM Mutation
        JOIN MSA ON Mutation.mutation_id = MSA.mutation_id
    ''')
    msa_mutations = cursor.fetchall()

    # The computation only reads Mutation and the update only writes MSA, so
    # all values can be computed first and written in one batch.
    pending_updates = [
        (calculate_variant_percentage_residue(gene_id, position, variant_residue), mutation_id)
        for mutation_id, gene_id, position, variant_residue in msa_mutations
    ]
    cursor.executemany('''
            UPDATE MSA
            SET variant_percentage_residue = ?
            WHERE mutation_id = ?
        ''', pending_updates)

    conn.commit()

def update_consevation_scores():
    """Fill ``shannon_entropy`` and ``percentage_identity`` in the MSA table.

    Mutation rows that have an MSA row are grouped by (gene_id, position).
    For each group the Shannon entropy of the variant residues and the
    percentage identity (most common residue count over the number of
    distinct species in the group) are computed and written to every MSA row
    whose mutation has that gene and position. Commits once at the end.

    The misspelled name is kept because it is the name callers use.
    """
    cursor.execute('''
        SELECT Mutation.gene_id, Mutation.position, Mutation.ancestral_residue, Mutation.variant_residue, Species.species_name
        FROM Mutation
        JOIN MSA ON Mutation.mutation_id = MSA.mutation_id
        JOIN Species ON Mutation.species_id = Species.species_id
    ''')

    # Collect, per alignment site, the observed residues and the species seen.
    residues_by_site = defaultdict(list)
    species_by_site = defaultdict(set)
    for gene_id, position, _ancestral, variant_residue, species_name in cursor.fetchall():
        site = (gene_id, position)
        residues_by_site[site].append(variant_residue)
        species_by_site[site].add(species_name)

    # Score each site and store the result on all of its MSA rows.
    for site, site_residues in residues_by_site.items():
        entropy = calculate_shannon_entropy(site_residues)
        percentage_identity = calculate_percentage_identity(site_residues, len(species_by_site[site]))

        cursor.execute('''
            UPDATE MSA
            SET shannon_entropy = ?, percentage_identity = ?
            WHERE mutation_id IN (
                SELECT mutation_id
                FROM Mutation
                WHERE gene_id = ? AND position = ?
            )
        ''', (entropy, percentage_identity) + site)
    conn.commit()

def fill_integrated_data():
    """Copy joined MSA and DMS values into IntegratedData and commit.

    One row is produced per (MSA row, DMS row) pair that shares a
    ``mutation_id``. Pairs whose mutation_id and DMS score are already present
    in IntegratedData are skipped, so calling the function again (for example
    once per gene loaded into the same database) does not duplicate rows.

    Rows that already exist are not refreshed: MSA values changed after a row
    was copied stay as they were in IntegratedData.
    """
    cursor.execute('''
        INSERT INTO IntegratedData (mutation_id, dms_score, shannon_entropy, jsd, phylop, phastcons, gerp, percentage_identity, ci, variant_percentage_residue)
        SELECT MSA.mutation_id, DMS.score, MSA.shannon_entropy, MSA.jsd, MSA.phylop, MSA.phastcons, MSA.gerp, MSA.percentage_identity, MSA.ci, MSA.variant_percentage_residue
        FROM MSA
        JOIN DMS ON MSA.mutation_id = DMS.mutation_id
        WHERE NOT EXISTS (
            SELECT 1
            FROM IntegratedData
            WHERE IntegratedData.mutation_id = MSA.mutation_id
              AND IntegratedData.dms_score IS DMS.score
        )
    ''')
    conn.commit()

In [45]:
# CREATE tables
# Every statement uses CREATE TABLE IF NOT EXISTS, so existing tables are kept.
create_tables()

In [157]:
# SET PTEN variables
# These globals are shared with the other gene cells; whichever cell ran last
# determines what the pipeline cells below operate on.
gene_name = 'PTEN'
dms_species_name = 'Human'
# Gapped ('-') sequence; not referenced by any function in the visible part of this file.
ancestral_sequence = "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTC-----------------------NPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV-"

# MSA data
aln_file_path = 'data/Human_PTEN_ortholog_prot_alignment_zebrafish.aln'

# DMS data: file path and the CSV column names to read
dms_file_path = 'data/dms-PTEN-013-a-1_1letter.csv'
col_position, col_ancestral, col_variant, col_score = (
    'position',
    'ancestral_residue',
    'variant_residue',
    'score',
)

In [6]:
# SET PAX6 variables
# These globals are shared with the other gene cells; whichever cell ran last
# determines what the pipeline cells below operate on.
gene_name = 'PAX6 Homeobox domain'
dms_species_name = 'Human'
# Not referenced by any function in the visible part of this file.
ancestral_sequence = "NRTSFTQEQIEALEKEFERTHYPDVFARERLAAKIDLPEARIQVWFSNRRAKWRR"

# MSA data
aln_file_path = 'data/Human_PAX6_Homeobox_domain_orthologues.aln'

# DMS data: file path and the CSV column names to read
dms_file_path = 'data/dms-PAX6-949-a-1-1letter.csv'
col_position, col_ancestral, col_variant, col_score = (
    'position',
    'ancestral_residue',
    'variant_residue',
    'score',
)

In [10]:
# SET CYP2C19 variables
# These globals are shared with the other gene cells; whichever cell ran last
# determines what the pipeline cells below operate on.
gene_name = 'CYP2C19'
dms_species_name = 'Human'
# Not referenced by any function in the visible part of this file.
ancestral_sequence = "MDPFVVLVLCLSCLLLLSIWRQSSGRGKLPPGPTPLPVIGNILQIDIKDVSKSLTNLSKIYGPVFTLYFGLERMVVLHGYEVVKEALIDLGEEFSGRGHFPLAERANRGFGIVFSNGKRWKEIRRFSLMTLRNFGMGKRSIEDRVQEEARCLVEELRKTKASPCDPTFILGCAPCNVICSIIFQKRFDYKDQQFLNLMEKLNENIRIVSTPWIQICNNFPTIIDYFPGTHNKLLKNLAFMESDILEKVKEHQESMDINNPRDFIDCFLIKMEKEKQNQQSEFTIENLVITAADLLGAGTETTSTTLRYALLLLLKHPEVTAKVQEEIERVIGRNRSPCMQDRGHMPYTDAVVHEVQRYIDLIPTSLPHAVTCDVKFRNYLIPKGTTILTSLTSVLHDNKEFPNPEMFDPRHFLDEGGNFKKSNYFMPFSAGKRICVGEGLARMELFLFLTFILQNFNLKSLIDPKDLDTTPVVNGFASVPPFYQLCFIPV"

# MSA data
aln_file_path = 'data/Human_CYP2C19_orthologues.aln'

# DMS data: file path and the CSV column names to read
dms_file_path = 'data/dms-CYP2C19-1199-a-1.csv'
col_position, col_ancestral, col_variant, col_score = (
    'position',
    'ancestral_residue',
    'variant_residue',
    'score',
)

In [11]:
# FILL Mutation and MSA tables from the alignment file
# parse_clustalw returns one tuple per (alignment record, alignment column);
# insert_alignment_data stores each tuple as a Mutation row plus an MSA row.
alignment_data = parse_clustalw(gene_name, aln_file_path)
insert_alignment_data(alignment_data)

In [12]:
# CALCULATE conservation scores and INSERT in MSA
# Sets MSA.shannon_entropy and MSA.percentage_identity per (gene_id, position).
update_consevation_scores()

In [13]:
# FILL DMS data into the database
# Uses the gene, species, file path and column names set by the gene cell that ran last.
insert_dms_data(gene_name, dms_species_name, dms_file_path, col_position, col_ancestral, col_variant, col_score)

In [14]:
# FILL IntegratedData table
# Copies the MSA columns as they are stored at call time.
# NOTE(review): MSA.variant_percentage_residue is only filled by
# fill_variant_percentage_residue_in_msa(), which appears in a later cell, so
# it is presumably still NULL here unless that cell was run first -- confirm
# the intended execution order.
fill_integrated_data()

In [8]:
# FILL MSA.variant_percentage_residue for every mutation that has an MSA row
fill_variant_percentage_residue_in_msa()

In [8]:
# Edit_distance calculation
# Maps each one-letter amino-acid code (plus '*' for stop) to the DNA codons
# that encode it.
# NOTE(review): looks like the standard genetic code -- confirm if organisms
# or organelles with a different code are ever analysed.
codon_table = {
    'F': ['TTT', 'TTC'],
    'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'I': ['ATT', 'ATC', 'ATA'],
    'M': ['ATG'],
    'V': ['GTT', 'GTC', 'GTA', 'GTG'],
    'S': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'P': ['CCT', 'CCC', 'CCA', 'CCG'],
    'T': ['ACT', 'ACC', 'ACA', 'ACG'],
    'A': ['GCT', 'GCC', 'GCA', 'GCG'],
    'Y': ['TAT', 'TAC'],
    'H': ['CAT', 'CAC'],
    'Q': ['CAA', 'CAG'],
    'N': ['AAT', 'AAC'],
    'K': ['AAA', 'AAG'],
    'D': ['GAT', 'GAC'],
    'E': ['GAA', 'GAG'],
    'C': ['TGT', 'TGC'],
    'W': ['TGG'],
    'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'G': ['GGT', 'GGC', 'GGA', 'GGG'],
    '*': ['TAA', 'TAG', 'TGA']
}

def one_nucleotide_away(codon1, codon2):
    """Return True when the codons differ at exactly one compared position.

    Characters are compared pairwise; when the strings differ in length, only
    the positions of the shorter one are compared.
    """
    mismatches = [left for left, right in zip(codon1, codon2) if left != right]
    return len(mismatches) == 1

# For every amino acid (and stop, '*'), list the other amino acids that can be
# reached by changing a single nucleotide in at least one of its codons.
amino_acid_mapping = defaultdict(list)

for aa1, codons1 in codon_table.items():
    for aa2, codons2 in codon_table.items():
        if aa1 == aa2:
            continue
        # Each neighbour is recorded once per amino acid (in codon_table
        # order), not once per source codon, so the lists hold no duplicates.
        if any(one_nucleotide_away(codon1, codon2) for codon1 in codons1 for codon2 in codons2):
            amino_acid_mapping[aa1].append(aa2)

# Plain dict: later lookups of unknown residues must not create empty entries.
amino_acid_mapping = dict(amino_acid_mapping)

# Fetch all mutations
cursor.execute('SELECT mutation_id, ancestral_residue, variant_residue FROM Mutation')
mutations = cursor.fetchall()

# Update edit_distance for each mutation:
#   1    -> the variant residue is reachable from the ancestral residue by a
#           single nucleotide substitution (per amino_acid_mapping)
#   NULL -> anything else (identical residues, gaps, unknown symbols, or more
#           than one substitution); no distance is computed for these.
# NULL is stored instead of the text 'NA' because edit_distance is declared
# INTEGER: a text value in that column compares greater than every number in
# SQLite and breaks numeric filters and aggregates.
edit_distance_updates = []
for mutation_id, ancestral_residue, variant_residue in mutations:
    if variant_residue in amino_acid_mapping.get(ancestral_residue, []):
        edit_distance = 1
    else:
        edit_distance = None
    edit_distance_updates.append((edit_distance, mutation_id))

cursor.executemany('''
    UPDATE Mutation
    SET edit_distance = ?
    WHERE mutation_id = ?
''', edit_distance_updates)

conn.commit()


In [9]:
# temporary
def update_variant_percentage_residue_in_integrated_data():
    """Copy ``variant_percentage_residue`` from MSA into IntegratedData.

    Only IntegratedData rows whose ``mutation_id`` has an MSA row are touched;
    all other rows keep their current value. Commits after the update.
    """
    sync_statement = '''
        UPDATE IntegratedData
        SET variant_percentage_residue = (
            SELECT MSA.variant_percentage_residue
            FROM MSA
            WHERE MSA.mutation_id = IntegratedData.mutation_id
        )
        WHERE EXISTS (
            SELECT 1
            FROM MSA
            WHERE MSA.mutation_id = IntegratedData.mutation_id
        )
    '''
    cursor.execute(sync_statement)
    conn.commit()

# Run the function to update the IntegratedData table
# (copies MSA.variant_percentage_residue into the matching IntegratedData rows)
update_variant_percentage_residue_in_integrated_data()

#cursor.execute('''    
#    UPDATE Gene
#    SET target_seq = "MDPFVVLVLCLSCLLLLSIWRQSSGRGKLPPGPTPLPVIGNILQIDIKDVSKSLTNLSKIYGPVFTLYFGLERMVVLHGYEVVKEALIDLGEEFSGRGHFPLAERANRGFGIVFSNGKRWKEIRRFSLMTLRNFGMGKRSIEDRVQEEARCLVEELRKTKASPCDPTFILGCAPCNVICSIIFQKRFDYKDQQFLNLMEKLNENIRIVSTPWIQICNNFPTIIDYFPGTHNKLLKNLAFMESDILEKVKEHQESMDINNPRDFIDCFLIKMEKEKQNQQSEFTIENLVITAADLLGAGTETTSTTLRYALLLLLKHPEVTAKVQEEIERVIGRNRSPCMQDRGHMPYTDAVVHEVQRYIDLIPTSLPHAVTCDVKFRNYLIPKGTTILTSLTSVLHDNKEFPNPEMFDPRHFLDEGGNFKKSNYFMPFSAGKRICVGEGLARMELFLFLTFILQNFNLKSLIDPKDLDTTPVVNGFASVPPFYQLCFIPV"
#    WHERE gene_name = 'CYP2C19';
#    ''')

# Update edit_distance for each mutation
#cursor.execute('SELECT mutation_id, ancestral_residue, variant_residue FROM Mutation')
#mutations = cursor.fetchall()
#for mutation_id, ancestral_residue, variant_residue in mutations:
#    if variant_residue in amino_acid_mapping.get(ancestral_residue, []):
#        edit_distance = 1
#    else:
#        edit_distance = 'NA'
#    
#    cursor.execute('''
#        UPDATE Mutation
#        SET edit_distance = ?
#        WHERE mutation_id = ?
#    ''', (edit_distance, mutation_id))


# Commit any pending changes. The function called above already commits, so
# this only matters when one of the commented-out snippets is re-enabled.
conn.commit()

In [None]:
# Close the connection
# After this, the module-level ``conn`` and ``cursor`` can no longer be used
# by the functions above until a new connection is opened.
conn.close()