# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.express as px
from biopandas.pdb import PandasPdb as ppdb

In [None]:
# All competition inputs live under one Kaggle dataset directory.
_COMPETITION_DIR = "/kaggle/input/novozymes-enzyme-stability-prediction"

# Tabular inputs.
TRAIN_PATH = f"{_COMPETITION_DIR}/train.csv"
TEST_PATH = f"{_COMPETITION_DIR}/test.csv"
SUB_PATH = f"{_COMPETITION_DIR}/sample_submission.csv"

# Structure file for the wildtype protein.
PDB_PATH = f"{_COMPETITION_DIR}/wildtype_structure_prediction_af2.pdb"


In [None]:
# Load the competition tables from the dataset paths declared in
# TRAIN_PATH / TEST_PATH / SUB_PATH. The previous bare relative names
# (including the 'test_csv' typo) ignored those constants.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sub_df = pd.read_csv(SUB_PATH)

In [None]:
# Parse the wildtype structure from PDB_PATH (the same file ProDy reads
# later) instead of a bare relative filename.
pdb_df = ppdb().read_pdb(PDB_PATH)

In [None]:
# Show the type of the container that holds the parsed PDB record tables.
type(pdb_df.df)

In [None]:
# List the record sections available in the parsed structure
# (the notebook later reads "ATOM", "HETATM", "ANISOU" and "OTHERS").
pdb_df.df.keys()

In [None]:
# Show every record section together with its table.
pdb_df.df.items()

# DF - ATOM

In [None]:
# Pull out the ATOM records; most of the plots below use this table.
atom_df = pdb_df.df["ATOM"]
atom_df.head()

In [None]:
# Column dtypes and non-null counts of the ATOM table.
atom_df.info()

In [None]:
# Summary statistics, transposed so that each column becomes a row.
atom_df.describe().T

In [None]:
# Overlay the densities of the three coordinate columns on one axis.
plt.title("The Distribution of Coordinates")

coordinate_columns = ["x_coord", "y_coord", "z_coord"]
for coordinate in coordinate_columns:
    sns.kdeplot(atom_df[coordinate], fill=True)

plt.xlabel("Coordinates")
# Legend entries follow the plotting order above.
plt.legend(coordinate_columns)

plt.show()

In [None]:
# Density of the b_factor column of the ATOM table.
b_factor_values = atom_df["b_factor"]
sns.kdeplot(b_factor_values, fill=True)
plt.show()

In [None]:
# Density of the atom_number column of the ATOM table.
atom_numbers = atom_df["atom_number"]
sns.kdeplot(atom_numbers, fill=True)
plt.show()

In [None]:
# Count of atoms per element symbol.
# Pass the vector as `x=`: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer
# counted per category.
sns.countplot(x=atom_df.element_symbol)
plt.show()

In [None]:
# Count of atoms per atom name, on a tall figure so the rotated
# tick labels stay readable.
plt.figure(figsize=(15, 18))

# Pass the vector as `x=`: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer
# counted per category.
sns.countplot(x=atom_df.atom_name)

plt.xlabel("Atom Name", fontsize=16)
plt.xticks(rotation=45, fontsize=14)
plt.show()

In [None]:
# Interactive distribution plot for each numeric ATOM column of interest.
# A single loop replaces four copy-pasted cells that differed only in the
# column name; titles and the final values of hist_data / group_labels /
# fig are the same as before.
for column in ("x_coord", "y_coord", "z_coord", "b_factor"):
    hist_data = [atom_df[column].values]
    group_labels = [column]

    fig = ff.create_distplot(hist_data, group_labels)
    fig.update_layout(title_text=f"{column} Distribution Plot")
    fig.show()

In [None]:
# Box plot of b_factor grouped by element symbol, drawing every point.
fig = px.box(
    atom_df,
    x="element_symbol",
    y="b_factor",
    points="all",
).update_layout(title_text="b-factor wise element-symbol Spread")
fig.show()

In [None]:
# Box plot of b_factor grouped by atom name, drawing every point.
fig = px.box(
    atom_df,
    x="atom_name",
    y="b_factor",
    points="all",
).update_layout(title_text="b-factor wise atom_name Spread")
fig.show()

In [None]:
# Examine pairwise correlation between the numeric ATOM columns.
sns.set(font_scale=1.4)

cmap = sns.diverging_palette(2, 165, s=80, l=55, n=9)
# Restrict to numeric columns first: the table also holds string columns
# (e.g. element_symbol, atom_name), and DataFrame.corr() raises on those
# in pandas >= 2.0 instead of silently dropping them.
corrmat = atom_df.select_dtypes(include="number").corr()
plt.subplots(figsize=(20, 20))
sns.heatmap(corrmat, cmap=cmap, annot=True, square=True)

# DF - HETATM

In [None]:
# Pull out the HETATM records and preview the first rows.
heatm_df = pdb_df.df["HETATM"]
heatm_df.head()

# DF - ANISOU

In [None]:
# Pull out the ANISOU records and preview the first rows.
anisou_df = pdb_df.df["ANISOU"]
anisou_df.head()

# DF - OTHERS

In [None]:
# Pull out the remaining ("OTHERS") records and preview the first rows.
others_df = pdb_df.df["OTHERS"]
others_df.head()

# 3D Visualization - Proteins

In [None]:
%%capture

!pip install prody

In [None]:
from prody import parsePDB, showProtein, showContactMap, calcPhi

In [None]:
# Parse the same wildtype structure file with ProDy and display the result.
pdbs = parsePDB(PDB_PATH)
pdbs

In [None]:
# Print an upper-cased heading, then display the sequence of the structure.
print("Protein Sequence: \n".upper())
pdbs.getSequence()

In [None]:
# Render the parsed structure with ProDy's showProtein.
showProtein(pdbs)

In [None]:
# Contact map for the `pdbs.ca` selection
# (presumably the C-alpha atoms only — confirm against ProDy docs).
showContactMap(pdbs.ca)

In [None]:
# Density of the per-atom masses returned by ProDy.
# NOTE(review): confirm that getMasses() is actually populated for a
# structure parsed from a PDB file — verify it does not return None.
sns.kdeplot(pdbs.getMasses(), fill=True)
plt.xlabel("Masses");

In [None]:
# Density of the residue numbers returned by ProDy; the trailing
# semicolon suppresses the text repr of the last expression.
sns.kdeplot(pdbs.getResnums(), fill=True)
plt.xlabel("Res Nums");

In [None]:
# Count of atoms per residue name.
plt.figure(figsize=(12, 12))
# Pass the vector as `x=`: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional array is no longer
# counted per category.
sns.countplot(x=pdbs.getResnames());

In [None]:
# calcPhi returns the φ (phi) angle of a residue in degrees.
# It checks the distance between the Cα atoms of two residues and
# raises an exception if the residues are disconnected.
# NOTE(review): pdbs['A', 10] presumably selects chain A, residue 10 — confirm.

calcPhi(pdbs['A', 10])

In [None]:
# Density of the beta values ProDy read from the structure
# (presumably the same quantity as the b_factor column above — confirm).
sns.kdeplot(pdbs.getBetas(), fill=True)