# Example of using mmtfPyspark to find water interactions


## Imports

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.interactions import InteractionFilter, GroupInteractionExtractor, ExcludedLigandSets
from mmtfPyspark.filters import ContainsLProteinChain, Resolution
import matplotlib.pyplot as plt
import pandas as pd
import py3Dmol
import time                                

#### Configure Spark 

In [2]:
# Obtain the SparkSession for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("WaterInteractionsExample")
    .getOrCreate()
)

## Define Variables

In [3]:
# input parameters

# directory handed to mmtfReader.read_sequence_file
path = "../../resources/mmtf_full_sample/"

# structure-level selection (corresponds to the Resolution filter's upper bound)
resolution = 2.0

# interaction criteria (correspond to the InteractionFilter settings)
distanceCutoff = 3.0
bFactorCutoff = 1.645   # normalized B-factor cutoff
minInteractions = 2
maxInteractions = 4

# when False, "HOH" is added to the prohibited target groups
includeWaters = True

## Read PDB and filter by resolution and only include proteins

In [4]:
# Read the MMTF sequence files under `path`, then keep only structures that
# pass the Resolution filter (0.0 up to the `resolution` input parameter) and
# the ContainsLProteinChain(exclusive=True) filter.
pdb = mmtfReader.read_sequence_file(path)
# Use the `resolution` parameter rather than a duplicated literal so that
# editing the parameter cell actually changes the selection.
pdb = pdb.filter(Resolution(minResolution=0.0, maxResolution=resolution))\
         .filter(ContainsLProteinChain(exclusive=True))

## Setup criteria for water interactions

In [5]:
# Build the interaction criteria from the input parameters defined above
# (instead of repeating their values as literals, which would silently
# ignore any change made in the parameter cell).
interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_normalized_b_factor_cutoff(bFactorCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
# Query side: water groups, restricted to their oxygen atom.
interactions_filter.set_query_groups(True, ["HOH"])
interactions_filter.set_query_elements(True, "O")    # Only use water oxygen
# Target side: only O, N and S atoms are considered.
interactions_filter.set_target_elements(True, ["O", "N", "S"])

## Exclude "uninteresting" ligands 

In [6]:
# Work on a copy: adding "HOH" directly to ExcludedLigandSets.ALL_GROUPS would
# mutate the shared library-level collection for every later user of it in
# this Python process.
prohibitedGroups = set(ExcludedLigandSets.ALL_GROUPS)
if not includeWaters:
    # Also exclude water as an interaction target.
    prohibitedGroups.add("HOH")
interactions_filter.set_prohibited_target_groups(prohibitedGroups)

## Calculate interactions

In [7]:
# Extract the interactions that satisfy `interactions_filter` from the
# filtered structures.
data = GroupInteractionExtractor().get_interactions(
    structures=pdb,
    interactionFilter=interactions_filter,
)

## Define a function to filter bridging water interactions

In [8]:
def filter_bridging_water_interactions(data, maxInteractions):
    """Keep rows that contain both an "LGO" and a "PRO" interaction.

    A row is kept only when at least one of its ``type1`` ..
    ``type<maxInteractions>`` columns equals ``"LGO"`` and at least one of
    them equals ``"PRO"`` (per the notebook text: an organic ligand and a
    protein interaction, respectively).

    Args:
        data: Spark DataFrame exposing the columns ``type1`` ..
            ``type<maxInteractions>``.
        maxInteractions: Number of ``type<i>`` columns to inspect.
            Must be 2, 3 or 4.

    Returns:
        The filtered DataFrame.

    Raises:
        ValueError: If ``maxInteractions`` is not 2, 3 or 4.
    """
    if maxInteractions not in (2, 3, 4):
        # Report the offending value: the check also rejects values below 2
        # and non-numeric input, not only values above 4.
        raise ValueError(
            f"maxInteractions must be 2, 3 or 4, got {maxInteractions!r} "
            "(maxInteractions > 4 are not supported yet)"
        )

    type_columns = [
        col(f"type{index}") for index in range(1, int(maxInteractions) + 1)
    ]

    def any_type_equals(label):
        # OR together `typeN == label` over every inspected column.
        condition = type_columns[0] == label
        for column in type_columns[1:]:
            condition = condition | (column == label)
        return condition

    # Two successive filters: both conditions must hold for a row to survive.
    return data.filter(any_type_equals("LGO")).filter(any_type_equals("PRO"))

## Keep only interactions with at least one organic ligand and one protein interaction

In [9]:
# Pass the `maxInteractions` input parameter (not a duplicated literal) so the
# inspected type columns match the limit given to the InteractionFilter.
# The result is cached because it is evaluated twice below: by count() and
# by toPandas().
data = filter_bridging_water_interactions(data, maxInteractions=maxInteractions).cache()

print(f"Hits(all): {data.count()}")
# From here on `data` is a pandas DataFrame, no longer a Spark DataFrame.
data = data.toPandas()
data.head(50)

Hits(all): 1944


Unnamed: 0,pdbId,polyChains,q3,q4,q5,q6,atom0,element0,group0,groupNum0,...,type4,chain4,nbFactor4,distance4,angle1-2,angle1-3,angle1-4,angle2-3,angle2-4,angle3-4
0,1LH0,2,0.771533,0.824166,,,O,O,HOH,3001,...,LGO,B,-0.671472,2.844147,1.568575,2.198814,1.692013,2.333368,2.023762,1.575645
1,1LH0,3,0.683459,0.763064,,,O,O,HOH,3020,...,PRO,B,-0.510482,2.643836,1.617314,2.432483,1.373324,1.996287,1.714797,2.149621
2,1LO7,1,0.970131,0.857381,,,O,O,HOH,275,...,LGO,A,-0.433365,2.485262,1.937859,2.248852,2.025202,1.891371,1.907959,1.372001
3,1LO7,1,,,,,O,O,HOH,293,...,,,,0.0,2.052202,,,,,
4,1LRI,1,,,,,O,O,HOH,105,...,,,,0.0,1.914519,,,,,
5,4XJ5,1,0.750859,,,,O,O,HOH,1184,...,,,,0.0,1.477401,2.15869,2.473017,0.0,,
6,4XJ5,1,0.804155,0.852276,,,O,O,HOH,1207,...,PRO,A,-1.116207,2.941605,1.903873,1.518106,1.532091,2.196531,2.141196,1.899779
7,4XJ5,1,0.905107,,,,O,O,HOH,1279,...,,,,0.0,2.22085,1.727812,1.883765,0.0,,
8,4XJ5,2,0.684056,,,,O,O,HOH,1286,...,,,,0.0,1.903365,1.931156,1.361409,0.0,,
9,4XP7,2,0.854735,,,,O,O,HOH,586,...,,,,0.0,1.740899,1.731324,1.906382,0.0,,


## Terminate Spark

In [10]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()