#### Comparative Genome analysis PTEN
- DMS scores used: https://www.mavedb.org/score-sets/urn:mavedb:00000013-a-1
- Alignment used: https://www.ensembl.org/Homo_sapiens/Gene/Compara_Ortholog/Alignment?db=core;g=ENSG00000171862;g1=ENSDARG00000056623;hom_id=202046236;r=10:87862638-87971930

In [4]:
# CREATE db with tables
import sqlite3

conn = sqlite3.connect('compar_gen_data_initial.db')

def create_tables():
    """Create the four analysis tables unless they already exist.

    The statements run on a cursor taken from the module-level ``conn`` and
    are committed together once all of them have been executed.
    """
    db_cursor = conn.cursor()

    schema_statements = (
        # MSA table: residue columns only.
        '''
        CREATE TABLE IF NOT EXISTS MSA (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            gene TEXT,
            species TEXT,
            position INTEGER,
            ancestral_residue TEXT,
            residue TEXT
        )
    ''',
        # DMS table: same columns as MSA plus a score.
        '''
        CREATE TABLE IF NOT EXISTS DMS (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            gene TEXT,
            species TEXT,
            position INTEGER,
            ancestral_residue TEXT,
            residue TEXT,
            score REAL
        )
    ''',
        # IntegratedData table: residue columns plus DMS and conservation scores.
        '''
        CREATE TABLE IF NOT EXISTS IntegratedData (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            gene TEXT,
            species TEXT,
            position INTEGER,
            ancestral_residue TEXT,
            residue TEXT,
            dms_score REAL,
            conservation_score REAL
        )
    ''',
        # SummaryStatistics table: gene-level mean and entropy columns.
        '''
        CREATE TABLE IF NOT EXISTS SummaryStatistics (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            gene TEXT,
            mean_conservation_score REAL,
            mean_discordance REAL,
            shannon_entropy REAL
        )
    ''',
    )

    for statement in schema_statements:
        db_cursor.execute(statement)

    conn.commit()

create_tables()


In [6]:
ancestral_sequence = (
    "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTC-----------------------NPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV-"
)

In [7]:
# FILL MSA table
import sqlite3
from Bio import AlignIO
ancestral_sequence = (
    "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTC-----------------------NPQFVVCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV-"
)
conn = sqlite3.connect('compar_gen_data_initial.db')
cursor = conn.cursor()

# function - parse the CLUSTAL W file
def parse_clustalw(file_path):
    """Read the alignment stored at ``file_path`` using the "clustal" format.

    Returns whatever ``AlignIO.read`` produces for that file.
    """
    return AlignIO.read(file_path, "clustal")

# function - insert MSA data into the database
def insert_msa_data(alignment, gene, ancestral_sequence):
    """Store every residue of every aligned record in the MSA table.

    For each record in ``alignment`` the full ``record.id`` is used as the
    species label and each character of its sequence becomes one row with a
    1-based position. The ancestral residue is the character of
    ``ancestral_sequence`` at the same position, or '-' once the record is
    longer than ``ancestral_sequence``. Rows are written through the
    module-level ``cursor`` and committed on ``conn`` after the last record.
    """
    for record in alignment:
        species = record.id
        for index, residue in enumerate(str(record.seq)):
            # Positions are stored 1-based; ``index`` is the 0-based offset.
            position = index + 1
            ancestral_residue = (
                ancestral_sequence[index]
                if index < len(ancestral_sequence)
                else '-'
            )
            cursor.execute('''
                INSERT INTO MSA (gene, species, position, ancestral_residue, residue)
                VALUES (?, ?, ?, ?, ?)
            ''', (gene, species, position, ancestral_residue, residue))
    conn.commit()

# Path to the CLUSTAL W alignment file that is parsed below
file_path = 'data/Human_PTEN_ortholog_prot_alignment_zebrafish.aln'
alignment = parse_clustalw(file_path)

# Insert every residue of the parsed alignment into the MSA table,
# labelled with this gene name and paired with ancestral_sequence
gene_name = 'PTEN'
insert_msa_data(alignment, gene_name, ancestral_sequence)

# insert_msa_data has already committed; release the connection opened above
conn.close()


In [8]:
three_to_one_letter = {
    'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E', 'Phe': 'F', 'Gly': 'G', 
    'His': 'H', 'Ile': 'I', 'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N', 
    'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S', 'Thr': 'T', 'Val': 'V', 
    'Trp': 'W', 'Tyr': 'Y', 'Ter': '*'
}

In [9]:
#FILL DMS table
import csv
import sqlite3
import re

three_to_one_letter = {
    'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E', 'Phe': 'F', 'Gly': 'G', 
    'His': 'H', 'Ile': 'I', 'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N', 
    'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S', 'Thr': 'T', 'Val': 'V', 
    'Trp': 'W', 'Tyr': 'Y', 'Ter': '*'
}

conn = sqlite3.connect('compar_gen_data_initial.db')
cursor = conn.cursor()

# function - convert three-letter amino acid to one-letter code
def three_to_one(three_letter):
    """Translate a three-letter amino-acid code into its one-letter code.

    Codes that are missing from ``three_to_one_letter`` map to '?'.
    """
    try:
        return three_to_one_letter[three_letter]
    except KeyError:
        return '?'

# function - parse HGVS notation
def parse_hgvs(hgvs):
    """Parse a single-residue protein HGVS string such as ``p.Ala12Gly``.

    Args:
        hgvs: String expected to start with ``p.``, a three-letter reference
            residue, a position, and then either a three-letter residue,
            ``-`` or ``=``.

    Returns:
        A tuple ``(ancestral_residue, position, residue)``. Three-letter codes
        are converted with ``three_to_one``; ``-`` and ``=`` are passed
        through unchanged. Returns ``None`` (after printing a message) when
        the string does not start with the expected pattern.
    """
    # NOTE: re.match anchors only the start, so any text after the matched
    # variant is ignored rather than rejected.
    match = re.match(r'p\.([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2}|-|=)', hgvs)
    if match is None:
        print(f"Invalid HGVS format: {hgvs}")
        return None

    reference, position, variant = match.groups()
    # Group 1 always holds exactly three letters, so it can never be '-';
    # the former special case for it was unreachable and has been removed.
    ancestral_residue = three_to_one(reference)
    residue = variant if variant in ('-', '=') else three_to_one(variant)
    return (ancestral_residue, int(position), residue)

def load_csv_to_dms_table(csv_file, gene="PTEN", species="HomoSapiens"):
    """Load variant scores from a CSV file into the DMS table.

    Every row must provide the columns ``hgvs_pro`` and ``score``. The variant
    is parsed with ``parse_hgvs``; parseable rows are inserted through the
    module-level ``cursor`` and unparseable ones are reported and skipped.
    The inserts are committed on ``conn`` once the whole file has been read.

    Args:
        csv_file: Path of the CSV file to read.
        gene: Value stored in the ``gene`` column. Defaults to "PTEN", the
            value that used to be hard-coded.
        species: Value stored in the ``species`` column. Defaults to
            "HomoSapiens", the value that used to be hard-coded.

    Raises:
        KeyError: If a row lacks the ``hgvs_pro`` or ``score`` column.
        ValueError: If a ``score`` value cannot be converted to ``float``.
    """
    with open(csv_file, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            hgvs_pro = row['hgvs_pro']
            # Converted before parsing, so a bad score raises even when the
            # variant string itself would have been skipped.
            score = float(row['score'])
            parsed_hgvs = parse_hgvs(hgvs_pro)
            if not parsed_hgvs:
                print(f"Skipping invalid HGVS format: {hgvs_pro}")
                continue

            ancestral_residue, position, residue = parsed_hgvs
            cursor.execute('''
                INSERT INTO DMS (gene, species, position, ancestral_residue, residue, score)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (gene, species, position, ancestral_residue, residue, score))

    conn.commit()

# Load the CSV file into the DMS table.
# The hgvs_pro values in this file were adjusted manually so that they coincide
# with the positions in the CLUSTAL W .aln file; the preprocessed table is here:
# https://docs.google.com/spreadsheets/d/1FuAuMWKCc54ZMKgO_mOCTM2xF0c2T3fwaFRONm1DP4E/edit#gid=1882680443
# The .aln file also contains gap positions; rows for them are added manually
# by the loop below.
# TODO: generalize this step so that it works for another gene.
load_csv_to_dms_table('data/urn_mavedb_00000013-a-1_scores_manip_nogaps.csv')

# Manually insert 23 gap rows into the DMS table: positions 212-234
# (range() excludes 235), with '-' for both residues and a score of 0.0.
# NOTE(review): the next cell runs this same loop again - check the DMS table
# for duplicated gap rows.
for position in range(212, 235):
    cursor.execute('''
        INSERT INTO DMS (gene, species, position, ancestral_residue, residue, score)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', ("PTEN", "HomoSapiens", position, '-', '-', 0.0))

conn.commit()

conn.close()


In [10]:
# FILL DMS table (part 2)
import sqlite3

conn = sqlite3.connect('compar_gen_data_initial.db')
cursor = conn.cursor()

try:
    # Manually add the 23 gap rows (positions 212-234; range() excludes 235)
    # to the DMS table, with '-' for both residues and a score of 0.0.
    # The previous cell already runs the same insert loop, so a gap row is
    # only added when no identical row exists yet; this keeps the table free
    # of duplicates no matter how often the cell is executed.
    for position in range(212, 235):
        cursor.execute('''
            INSERT INTO DMS (gene, species, position, ancestral_residue, residue, score)
            SELECT ?, ?, ?, ?, ?, ?
            WHERE NOT EXISTS (
                SELECT 1 FROM DMS
                WHERE gene = ? AND species = ? AND position = ?
                  AND ancestral_residue = ? AND residue = ?
            )
        ''', ("PTEN", "HomoSapiens", position, '-', '-', 0.0,
              "PTEN", "HomoSapiens", position, '-', '-'))

    conn.commit()
finally:
    # Release the database handle even if an insert fails.
    conn.close()

In [11]:
# Calculate and add Shannon entropy as conservation scores in IntegratedData
import sqlite3
import numpy as np

conn = sqlite3.connect('compar_gen_data_initial.db')
cursor = conn.cursor()

# function - calculate Shannon entropy
def shannon_entropy(frequencies):
    """Return the Shannon entropy (natural logarithm) of ``frequencies``.

    Zero frequencies are skipped, so they contribute nothing to the result.
    """
    weighted_log_sum = 0
    for freq in frequencies:
        if freq == 0:
            continue
        weighted_log_sum = weighted_log_sum + freq * np.log(freq)
    return -weighted_log_sum

# Fetch the distinct positions stored in the MSA table
cursor.execute("SELECT DISTINCT position FROM MSA")
positions = cursor.fetchall()

for position in positions:
    position = position[0]  # fetchall() yields 1-tuples; unwrap the position value
    # Count how often each distinct residue value occurs at this position
    # (counted over all MSA rows at the position, whatever their gene or species)
    cursor.execute("SELECT residue, COUNT(*) FROM MSA WHERE position = ? GROUP BY residue", (position,))
    residue_counts = cursor.fetchall()
    
    total_residues = sum(count for _, count in residue_counts) # total number of MSA rows at this position
    frequencies = [count / total_residues for _, count in residue_counts]   # relative frequency of each residue value
    entropy = shannon_entropy(frequencies) # Shannon entropy of the residue distribution at this position
    
    # Copy every MSA row of this position into IntegratedData, storing the
    # entropy as conservation_score and leaving dms_score NULL for now.
    # NOTE(review): the stored value is the entropy itself, so a larger
    # conservation_score means more residue variety at the position.
    # NOTE(review): this is a plain INSERT, so running the cell again appends
    # the same rows a second time.
    cursor.execute('''
        INSERT INTO IntegratedData (gene, species, position, ancestral_residue, residue, dms_score, conservation_score)
        SELECT gene, species, position, ancestral_residue, residue, NULL, ?
        FROM MSA WHERE position = ?
    ''', (entropy, position))

conn.commit()
conn.close()


In [12]:
# Add DMS scores to IntegratedData
import sqlite3

# Open the analysis database for the update below
conn = sqlite3.connect('compar_gen_data_initial.db')
cursor = conn.cursor()

# Set IntegratedData.dms_score from the DMS row that has the same position and
# the same ancestral residue. The UPDATE has no WHERE clause, so every
# IntegratedData row is rewritten; gene and species are not part of the match.
# NOTE(review): if DMS holds several rows for one position/ancestral_residue
# pair, the scalar subquery supplies only one of them - confirm which score is
# meant to be used.
cursor.execute('''
    UPDATE IntegratedData
    SET dms_score = (
        SELECT score
        FROM DMS
        WHERE DMS.position = IntegratedData.position
        AND DMS.ancestral_residue = IntegratedData.ancestral_residue
    )
''')

conn.commit()
conn.close()

In [13]:
# Correlation coefficient calculations
import sqlite3
import pandas as pd

# Open the analysis database for reading
conn = sqlite3.connect('compar_gen_data_initial.db')

# Select the score pairs of the one species named in the query, keeping only
# rows where both scores are present
query = "SELECT dms_score, conservation_score FROM IntegratedData WHERE species = 'ENSP00000361021_Hsap/1-403' AND dms_score IS NOT NULL AND conservation_score IS NOT NULL"
df = pd.read_sql_query(query, conn)
df = df.dropna() # drop any remaining NaN rows (the query already excludes NULL scores)
# Correlate the two score columns with both methods
pearson_corr = df['dms_score'].corr(df['conservation_score'], method='pearson')
spearman_corr = df['dms_score'].corr(df['conservation_score'], method='spearman')

print("Pearson correlation coefficient:", pearson_corr)
print("Spearman correlation coefficient:", spearman_corr)

conn.close()


Pearson correlation coefficient: -0.1372678345797904
Spearman correlation coefficient: -0.07174764712820665


## Results
- Pearson correlation coefficient: -0.1372678345797904
- Spearman correlation coefficient: -0.07174764712820665

*Pearson correlation coefficient* measures the linear relationship between two variables. It ranges from -1 to 1, where:
- 1 indicates a perfect positive linear relationship,
- 0 indicates no linear relationship, and
- -1 indicates a perfect negative linear relationship.
  
*Spearman correlation coefficient* measures the monotonic relationship between two variables. It ranges from -1 to 1. Spearman correlation is less sensitive to outliers and does not assume a linear relationship.

#### Analysed data
- DMS scores used: https://www.mavedb.org/score-sets/urn:mavedb:00000013-a-1
- Alignment used: https://www.ensembl.org/Homo_sapiens/Gene/Compara_Ortholog/Alignment?db=core;g=ENSG00000171862;g1=ENSDARG00000056623;hom_id=202046236;r=10:87862638-87971930
  
**For the 'PTEN' pairwise alignment of *Homo sapiens* and zebrafish, with DMS scores from the VAMP-seq experiment and Shannon entropy as the conservation score**:
- a **Pearson coefficient of -0.137 indicates a weak negative linear relationship between the dms_score and conservation_score** variables for the specified species. This means that as one variable increases, the other tends to decrease slightly, but the relationship is not very strong. Note that conservation_score here is the Shannon entropy of the alignment position, so a higher value means a *less* conserved position.
- **Spearman coefficient of -0.0717 indicates a weak negative monotonic relationship between the dms_score and conservation_score** variables for the specified species. This suggests that as one variable increases, the other tends to decrease slightly, but the relationship is not very strong.

In [3]:
import sqlite3

def clean_table(cursor, conn, table_name):
    """Delete every row of ``table_name``, commit, and report it on stdout.

    ``table_name`` is placed directly into the SQL text, so it has to come
    from trusted code rather than from user input.
    """
    delete_sql = f'DELETE FROM {table_name}'
    cursor.execute(delete_sql)
    conn.commit()
    done_message = f'Table {table_name} has been cleaned.'
    print(done_message)

def fetch_and_print(cursor, table_name, order_by=None):
    """Print a header line followed by every row of ``table_name``.

    When ``order_by`` is truthy it is appended as an ORDER BY clause. Both
    names are placed directly into the SQL text, so they have to come from
    trusted code.
    """
    sql_parts = [f'SELECT * FROM {table_name}']
    if order_by:
        sql_parts.append(f' ORDER BY {order_by}')
    cursor.execute(''.join(sql_parts))

    # Fetch everything first so the header is printed only after a
    # successful query.
    fetched = cursor.fetchall()
    print(f"\n{table_name} Table:")
    for record in fetched:
        print(record)

def main():
    """Print the content of the selected tables of the analysis database.

    Opens 'compar_gen_data_initial.db', prints every table listed in
    ``table_names`` (ordered by the column configured in ``order_by_column``
    when there is one) and closes the connection again, also when a query
    raises.
    """
    # All available tables: ['MSA', 'DMS', 'IntegratedData', 'SummaryStatistics']
    table_names = ['IntegratedData']
    order_by_column = {'MSA': 'position', 'DMS': 'position'}

    conn = sqlite3.connect('compar_gen_data_initial.db')
    try:
        cursor = conn.cursor()

        # To empty the selected tables before printing, enable:
        # for table_name in table_names:
        #     clean_table(cursor, conn, table_name)

        # Print table content
        for table_name in table_names:
            order_by = order_by_column.get(table_name)
            fetch_and_print(cursor, table_name, order_by)
    finally:
        # Release the database handle even if printing a table fails.
        conn.close()

if __name__ == "__main__":
    main()



IntegratedData Table:
(1, 'PTEN', 'ENSP00000361021_Hsap/1-403', 1, 'M', 'M', 1.065143375, 0.0)
(2, 'PTEN', 'ENSDARP00000073594_Drer/1-422', 1, 'M', 'M', 1.065143375, 0.0)
(3, 'PTEN', 'ENSP00000361021_Hsap/1-403', 2, 'T', 'T', 0.8243587146, 0.6931471805599453)
(4, 'PTEN', 'ENSDARP00000073594_Drer/1-422', 2, 'T', 'A', 0.8243587146, 0.6931471805599453)
(5, 'PTEN', 'ENSP00000361021_Hsap/1-403', 3, 'A', 'A', 0.9555524032, 0.0)
(6, 'PTEN', 'ENSDARP00000073594_Drer/1-422', 3, 'A', 'A', 0.9555524032, 0.0)
(7, 'PTEN', 'ENSP00000361021_Hsap/1-403', 4, 'I', 'I', 1.326547924, 0.0)
(8, 'PTEN', 'ENSDARP00000073594_Drer/1-422', 4, 'I', 'I', 1.326547924, 0.0)
(9, 'PTEN', 'ENSP00000361021_Hsap/1-403', 5, 'I', 'I', 1.12736553, 0.0)
(10, 'PTEN', 'ENSDARP00000073594_Drer/1-422', 5, 'I', 'I', 1.12736553, 0.0)
(11, 'PTEN', 'ENSP00000361021_Hsap/1-403', 6, 'K', 'K', 1.224574614, 0.0)
(12, 'PTEN', 'ENSDARP00000073594_Drer/1-422', 6, 'K', 'K', 1.224574614, 0.0)
(13, 'PTEN', 'ENSP00000361021_Hsap/1-403', 7, 'E