<img src="https://raw.githubusercontent.com/RuneROe/git_color_by_similarity/master/logo.png" height="200" align="right" style="height:240px">

## SIMalign

**How to:**
1.   Install dependencies by running the first cell (takes a few minutes). The code will restart the kernel, so wait for this to happen before proceeding.
2.   Run the "Importing files" cell.
3.   A button saying "Choose Files" will appear. Press it and choose all the structures that you want to analyse (at least 3 structures).
4.   Type which structure you want to be your reference structure.
5.   Run the remaining cells.

In [None]:
#@title Install dependencies

import os
def get_script(script):
    """Download one helper module from the SIMalign GitHub repository.

    The file ``{script}.py`` is fetched from the repository's master branch
    with ``wget`` and stored as ``/content/{script}.py``.

    Args:
        script: Module name without the ``.py`` extension.

    Returns:
        None. A warning is printed if the download did not succeed; no
        exception is raised, so the remaining setup steps still run.
    """
    import subprocess

    raw_script = f"https://raw.githubusercontent.com/RuneROe/git_color_by_similarity/master/{script}.py"
    local_script_path = f"/content/{script}.py"
    try:
        # Argument list instead of a shell string: nothing in the name is
        # interpreted by a shell.
        result = subprocess.run(["wget", raw_script, "-O", local_script_path])
        failed = result.returncode != 0
    except OSError:
        # wget itself could not be started (e.g. it is not installed).
        failed = True
    if failed:
        # "wget -O" creates the target file even when the download fails, so
        # without this message a broken download would go unnoticed until the
        # module is imported.
        print(f"WARNING: could not download {raw_script}")

# Names (without ".py") of the helper modules fetched by get_script.
scripts = {"colors","foldseek","hotspot_finder","SIMalign","visualize_pdb"}

# The marker file "SIMalign_READY" is created as the last setup step, so its
# presence means a previous run of this cell completed.
if os.path.isfile("SIMalign_READY"):
    print("Dependendies already installed.")
else:
    # condacolab provides the mamba command used for the packages below.
    # Per the notebook instructions, the kernel restarts during this setup.
    print("installing condacolab...")
    os.system("pip install -q condacolab")
    import condacolab
    condacolab.install()

    # Packages installed from the conda-forge channel.
    for label, package in (("rdkit", "rdkit"), ("pymol", "pymol-open-source")):
        print(f"installing {label}...")
        os.system(f"mamba install -c conda-forge {package}")

    for shell_command in (
        "wget https://raw.githubusercontent.com/rdkit/rdkit/master/Docs/Book/data/cdk",
        "which pymol",
    ):
        os.system(shell_command)

    # Start pymol as a separate process; the handle is kept in `cmd`.
    import subprocess
    cmd = subprocess.Popen(["pymol", "-cKRQ"])

    for shell_command in (
        "ps aux | grep pymol",
        "pip install py3Dmol",
        "git clone https://github.com/RuneROe/git_color_by_similarity.git",
    ):
        os.system(shell_command)

    # Fetch every helper module, then write the marker file.
    for script_name in scripts:
        get_script(script_name)
    os.system("touch SIMalign_READY")
    print("Done")

In [None]:
#@title Importing files

#@markdown Import at least 3 files.
#@markdown Be aware that overfitting can occur if the amino acid sequences are too similar. It is recommended that sequences of more than 80% similarity are removed before importing to the program.


from google.colab import files
import sys
import os



# Removing old uploads
# Entries of the working directory that must survive the clean-up.
OK_files = {".config", "condacolab_install.log", "__pycache__", "SIMalign_READY", "thermoDB", "foldseek_output", "sample_data"}.union(scripts)
# get_script stores the helper modules as "<name>.py", so those file names
# have to be protected as well; otherwise the clean-up deletes the modules
# that the later cells import.
OK_files = OK_files.union(f"{name}.py" for name in scripts)
for file in os.listdir():
    # Only regular files are deleted: os.remove raises on directories
    # (for example the cloned repository), which would abort the cell.
    if file in OK_files or not os.path.isfile(file):
        continue
    os.remove(file)

# os.remove is synchronous, so no waiting is needed here. The former polling
# loop would never terminate if any entry outside OK_files was left behind.


# Upload the structure files and validate them.
# files.upload() returns a mapping keyed by the uploaded file names.
infiles = files.upload()
infilenames = list(infiles)

# The analysis requires at least three structures.
if len(infilenames) < 3:
    print("ERROR: Import at least 3 files.")
    sys.exit(1)

# File names containing a space are rejected.
if any(" " in uploaded_name for uploaded_name in infilenames):
    print("ERROR: No spaces in the names of the uploaded files are allowed.")
    sys.exit(1)

# Start with the first uploaded file as the reference structure.
ref_structure = infilenames[0]


print("Choose a reference structure:")

# Prompt the user to choose a file.
# The answer may be a complete file name or an unambiguous prefix of one.
infile_set = set(infilenames)
while True:
    choice = input("Reference: ").strip()
    if choice in infile_set:
        # An exact file name always wins, even when it is also the prefix of
        # another uploaded file (e.g. "a.pdb" next to "a.pdb.bak").
        matches = [choice]
    elif choice:
        matches = [name for name in infilenames if name.startswith(choice)]
    else:
        # Empty input would be a prefix of every file; treat it as invalid.
        matches = []

    if len(matches) == 1:
        ref_structure = matches[0]
        print(f'Selected reference structure: {ref_structure}')
        break
    if matches:
        print("Not unique choice. Please enter full file name or remove files of similar names.")
    else:
        print("Invalid choice. Please enter a name of a file.")



In [None]:
#@title Run prediction
import SIMalign

max_iterations = 3 #@param {type:"integer"}
#@markdown `max_iterations` is the maximum number of alignments. A high number can lead to slow runtime.
min_aligned_aa = 100 #@param {type:"integer"}
#@markdown `min_aligned_aa` is the minimum number of amino acids that should be used for the alignment. A low number can lead to overfitting.
max_dist = 6 #@param {type:"integer"}
#@markdown `max_dist` is the maximum distance between two amino acids before it is considered a gap in the alignment. A too low number can lead to false gaps and a too high number can lead to false positives.
# `remove_chain_duplicate` has to be defined because it is passed to
# SIMalign.run below; leaving it commented out raised a NameError.
# NOTE(review): this parameter was previously disabled with the remark
# "For now it only takes chain A" - confirm that SIMalign.run honours it.
remove_chain_duplicate = True #@param {type:"boolean"}
#@markdown If `remove_chain_duplicate` is true then chain duplicates are removed from the structure.
download_pymol = True #@param {type: "boolean"}
outfile_name = "outfile" #@param {type:"string"}
#@markdown Choose name of the outfile.

# Things that need to be changed:
# The string dropdowns below are plain (not {type:"raw"}) fields: a raw
# dropdown inserts the selected option as an unquoted Python expression,
# which breaks the assignment as soon as another option is picked.
color_mode = "similarity" #@param ["similarity", "hotspot", "none"]
#@markdown `color_mode` selects how the structures in the resulting pymol file are colored: `similarity` uses the scores from the SIMalign algorithm, `hotspot` requires `find_hotspots` to be enabled, and `none` skips the coloring.
foldseek = False #@param {type:"boolean"}
foldseek_database = "Alphafold/UniProt" #@param ["Alphafold/UniProt","Alphafold/UniProt50-minimal","Alphafold/UniProt50","Alphafold/Proteome","Alphafold/Swiss-Prot","ESMAtlas30","PDB","Thermophilic_DB"]
foldseek_variable_tresshold = "evalue" #@param ["evalue","pident","fident","nident","alnlen","bits","mismatch","qcov","tcov","lddt","qtmscore","ttmscore","alntmscore","rmsd","prob"]
foldseek_value_tresshold = 0.0001  #@param {type:"number"}
foldseek_search_against = "ref_structure" #@param ["ref_structure","all_structures"]

find_hotspots = True #@param {type: "boolean"}


# Replace spaces in the output name by underscores and add the .pse extension
outfile_name = outfile_name.replace(" ", "_") + ".pse"

# Optional Foldseek search, run before the alignment.
if foldseek:
    # NOTE(review): the install cell downloads a script named "foldseek",
    # not "foldseek_search" - verify that this module is importable.
    import foldseek_search
    foldseek_search.run(foldseek_database,foldseek_variable_tresshold,foldseek_value_tresshold,foldseek_search_against,ref_structure,infilenames)

# Align the uploaded structures against the chosen reference structure.
len_ref_structure, score_list, structure_list = SIMalign.run(ref_structure, infilenames, max_iterations, min_aligned_aa, max_dist, remove_chain_duplicate, outfile_name)

# Hotspot detection works on the scores and structures returned by SIMalign.
if find_hotspots:
    import hotspot_finder
    hotspot_list = hotspot_finder.run(score_list,structure_list,minmax=[0.3,0.7],max_dist=10)
else:
    hotspot_list = None

if color_mode != "none":
    if color_mode == "hotspot" and not find_hotspots:
        # Coloring by hotspot needs hotspot_list, which is None in this case.
        print("ERROR: unable to color hotspot without finding them!")
    else:
        import colors
        colors.run(color_mode,hotspot_list,score_list,outfile_name,structure_list)

# Offer the resulting file for download through the browser if requested.
if download_pymol:
    files.download(outfile_name)
else:
    print("Done")

In [None]:
#@title Display reference structure
#@markdown Reference structure needs to be a pdb file in order to visualize.

# visualize_pdb is one of the helper modules downloaded by the install cell.
import visualize_pdb
# run() is called without arguments.
# NOTE(review): how it locates the reference structure is not visible here -
# presumably via the files in the working directory; confirm in visualize_pdb.
visualize_pdb.run()