# Advanced Zinc Interaction Analysis Example

<img src="./figures/zinc_interaction.png" style="width: 300px;"/>

## Imports

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import *
from mmtfPyspark.datasets import groupInteractionExtractor
from mmtfPyspark.io import MmtfReader
from mmtfPyspark.webfilters import blastCluster

## Configure Spark

In [2]:
# Run Spark in local mode; "local[*]" uses all available cores.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("advancedZincInteractionDemo")
)

# getOrCreate() reuses an already-running context, so re-running this cell in
# a live kernel does not fail with "Cannot run multiple SparkContexts at once".
# The configuration is only applied when a new context is actually created.
sc = SparkContext.getOrCreate(conf)

## Read PDB in MMTF format

In [3]:
# Relative path to the directory holding the sample structures.
path = "../../resources/mmtf_full_sample/"

# Load the structures using the SparkContext `sc`.
# NOTE(review): presumably reads a Hadoop sequence file of MMTF records into
# an RDD of structures — confirm against the mmtfPyspark documentation.
pdb = MmtfReader.readSequenceFile(path, sc)

## Use only representative structures

In [4]:
# Sequence identity level handed to the blastCluster web filter.
sequenceIdentity = 40

# Build the filter first, then apply it to the structures.
# NOTE(review): presumably keeps only cluster representatives at the given
# sequence identity — confirm against the mmtfPyspark documentation.
representativeFilter = blastCluster(sequenceIdentity)
pdb = pdb.filter(representativeFilter)

## Extract proteins with Zn interactions

In [5]:
# Extractor for interactions involving the zinc group ("ZN").
# NOTE(review): the second argument (3) looks like a distance cutoff — the
# max(distance) values in the aggregate table of this notebook stay below 3;
# confirm the unit and meaning against the mmtfPyspark documentation.
finder = groupInteractionExtractor("ZN", 3)

# Build the dataset, then cache it because several cells below query it.
interactions = finder.getDataset(pdb)
interactions = interactions.cache()

## Show the schema, the first 20 interactions, and the total number of interactions

In [12]:
# Print the column names and types of the interactions dataset.
interactions.printSchema()

# Display the first 20 rows.
interactions.show(20)

# Total number of interaction rows. `n` is reused by later cells as the
# denominator of their "frequency" columns.
n = interactions.count()

print(f"Number of interactions: {n}")

root
 |-- structureId: string (nullable = false)
 |-- residue1: string (nullable = false)
 |-- atom1: string (nullable = false)
 |-- element1: string (nullable = false)
 |-- index1: integer (nullable = false)
 |-- residue2: string (nullable = false)
 |-- atom2: string (nullable = false)
 |-- element2: string (nullable = false)
 |-- index2: integer (nullable = false)
 |-- distance: float (nullable = false)

+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|structureId|residue1|atom1|element1|index1|residue2|atom2|element2|index2| distance|
+-----------+--------+-----+--------+------+--------+-----+--------+------+---------+
|       5B4E|      ZN|   ZN|      Zn|   303|     CYS|   SG|       S|     1|2.1902573|
|       5B4E|      ZN|   ZN|      Zn|   303|     CYS|   SG|       S|     4| 2.206233|
|       5B4E|      ZN|   ZN|      Zn|   303|     CYS|   SG|       S|    20|2.2135682|
|       5B4E|      ZN|   ZN|      Zn|   303|     HIS|  ND1|       N|    23

## Show the top 10 interacting group/atom types

#### Exclude Carbon Interactions

In [10]:
# Number of interactions per (residue2, atom2) pair, skipping rows whose
# partner element is carbon.
topGroupsAndAtoms = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("residue2", "atom2")
    .count()
)

#### Add column with frequency of occurrence
#### Filter out occurrences < 1% 
#### Sort descending

In [13]:
# "frequency" is each pair's share of ALL interactions: `n` was counted before
# the carbon filter, so the shares shown here do not add up to 1.
(
    topGroupsAndAtoms
    .withColumn("frequency", topGroupsAndAtoms["count"] / n)
    .filter("frequency > 0.01")          # keep pairs above 1%
    .sort("frequency", ascending=False)  # most frequent first
    .show(20)
)

+--------+-----+-----+--------------------+
|residue2|atom2|count|           frequency|
+--------+-----+-----+--------------------+
|     CYS|   SG|58943|   0.275788981167388|
|     HIS|  NE2|28150| 0.13171131126447538|
|     HOH|    O|14677| 0.06867235934027371|
|     HIS|  ND1|12242| 0.05727921394315125|
|     ASP|  OD2| 9515| 0.04451982688033688|
|     ASP|  OD1| 7722|0.036130541583810974|
|     GLU|  OE2| 6642| 0.03107731898467657|
|     GLU|  OE1| 6210| 0.02905602994502281|
+--------+-----+-----+--------------------+



## Print the top interacting elements

#### Exclude carbon interactions and group by element 2

In [14]:
# Number of interactions per partner element, skipping carbon.
topElements = (
    interactions
    .filter("element2 != 'C'")
    .groupBy("element2")
    .count()
)

#### Add column with frequency of occurrence
#### Filter out occurrences < 1%
#### Sort descending

In [15]:
# "frequency" is each element's share of ALL interactions: `n` was counted
# before the carbon filter, so the shares shown here do not add up to 1.
(
    topElements
    .withColumn("frequency", topElements["count"] / n)
    .filter("frequency > 0.01")          # keep elements above 1%
    .sort("frequency", ascending=False)  # most frequent first
    .show(10)
)

+--------+-----+--------------------+
|element2|count|           frequency|
+--------+-----+--------------------+
|       S|59534|  0.2785542168674699|
|       O|56347| 0.26364253129020937|
|       N|44259|  0.2070838694584162|
|       H| 4488|0.020998947245291846|
+--------+-----+--------------------+



In [18]:
# Average interaction distance per partner element, shortest first.
# The sort key "avg(distance)" is the column name Spark generates for the
# aggregate.
# NOTE(review): several metals average well below 1 — presumably other metal
# ions modelled at (nearly) the same site as the zinc rather than real
# contacts; verify before interpreting these rows.
(
    interactions
    .groupBy("element2")
    .avg("distance")
    .sort("avg(distance)")
    .show(10)
)

+--------+-------------------+
|element2|      avg(distance)|
+--------+-------------------+
|      Co|0.06645968649536371|
|      Mn| 0.2096691057085991|
|      Ni| 0.8498532980680465|
|      Cu| 0.8818441693050166|
|      Mg| 1.0736593306064606|
|      Fe| 1.3130454624382157|
|      Na| 1.8720550934473674|
|       N|  2.156085003504156|
|       O|  2.276866027285148|
|      Cl| 2.3067678366861646|
+--------+-------------------+
only showing top 10 rows



## Aggregate multiple statistics

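### NOTE: requires `from pyspark.sql.functions import *`, which provides `count`, `avg`, `min`, `max` and `kurtosis` (`min` and `max` then shadow the Python built-ins)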

In [20]:
# Several distance statistics per partner element in a single pass.
# count/avg/min/max/kurtosis are the Spark SQL column functions brought in by
# the star import at the top of the notebook, not the Python built-ins.
(
    interactions
    .groupBy("element2")
    .agg(
        count("distance"),
        avg("distance"),
        min("distance"),
        max("distance"),
        kurtosis("distance"),
    )
    .show(10)
)

+--------+---------------+------------------+-------------+-------------+-------------------+
|element2|count(distance)|     avg(distance)|min(distance)|max(distance)| kurtosis(distance)|
+--------+---------------+------------------+-------------+-------------+-------------------+
|       K|              3|2.9527830282847085|    2.8704226|    2.9968925|-1.4999999999999991|
|      Ca|              1|2.8027758598327637|    2.8027759|    2.8027759|                NaN|
|      Fe|             12|1.3130454624382157|  0.013892444|    2.9888651|-1.8221410926403632|
|       F|             45|2.6406656159294974|    1.5429283|    2.9912868| 0.9162421515636483|
|      Ni|              5|0.8498532980680465|   0.14934523|     2.833429| 0.1082159795970754|
|       B|             22| 2.802124402739785|    2.5698645|    2.9966402|-0.8277527697958345|
|      Al|              2| 2.697709083557129|     2.542408|    2.8530102|-1.9999999999999993|
|      As|             26|2.8578427754915676|    2.5419223| 

## Terminate Spark

In [None]:
# Shut down the SparkContext `sc` now that the analysis is finished.
sc.stop()