In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Levenshtein package; it is imported below and used for editops().
%pip install Levenshtein

In [None]:
# Extract the zipped test-feature archive from Drive into the current working directory.
!unzip /content/drive/MyDrive/work/projects/Novozymes/datasets/termonet_test_features.npy.zip

In [None]:
import pandas as pd
from lightgbm import LGBMRegressor
from scipy.stats import spearmanr
import numpy as np
from matplotlib import pyplot as plt
from lightgbm import plot_importance, plot_tree
import Levenshtein
from scipy.stats import rankdata


In [None]:
class CFG:
  # Input file locations used by the rest of the notebook.
  # Pickled training table; read with pd.read_pickle.
  TRAIN_FEATURES = '/content/drive/MyDrive/work/projects/Novozymes/datasets/termonet_train_features.pkl'
  # Test-set CSV; read with pd.read_csv and passed to gen_mutations.
  TEST_CSV = '/content/drive/MyDrive/work/projects/Novozymes/data/test.csv'
  # NumPy array of test features; read with np.load.
  TEST_FEATURES = '/content/nesp_features.npy'


In [None]:
# Load the training table and keep only the rows that have both a ddG target
# and a pH value.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
train_df = pd.read_pickle(CFG.TRAIN_FEATURES).dropna(subset=['ddG', 'pH'])
len(train_df)


In [None]:
# Column labels: '1'..'14' (presumably one per entry of each row's `features`
# vector - TODO confirm the length), followed by pH and the two residue columns.
feature_names = [str(n) for n in range(1, 15)]
feature_names += ['pH', 'wildtype', 'mutant']

# One model input row per training example: the stored feature values, then pH,
# then the code points of the single-character wild-type and mutant labels.
X = []
Y = []
for row_label, row in train_df.iterrows():
  X.append([*row.features, row.pH, ord(row.wildtype), ord(row.mutant)])
  Y.append(row.ddG)

print(len(X), len(Y))


In [None]:
# Baseline: an LGBMRegressor with default hyper-parameters (only the importance
# type is set to 'gain'), fitted on the full training set.
lgbm_model = LGBMRegressor(importance_type = 'gain')
lgbm_model.fit(X,Y,feature_name=feature_names)
# Scored on the same data it was fitted on, so this is an in-sample figure.
lgbm_model.score(X,Y)

In [None]:
# In-sample predictions: X is the same data the model was fitted on.
P = lgbm_model.predict(X)
# Spearman rank correlation between predictions and targets.
print(spearmanr(P,Y))
# Scatter plot with predictions on the x-axis and targets on the y-axis.
plt.scatter(P,Y)

In [None]:
# Plot the feature importances of the fitted model.
plot_importance(lgbm_model)


In [None]:
# Draw one tree of the fitted model (whichever plot_tree selects by default)
# on a 40x40 figure.
plot_tree(lgbm_model, figsize=(40,40))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, Y, train_size=0.80, test_size=0.20, random_state=1
)

# learning_rate and n_estimators are raised from the LightGBM defaults
# (0.1 and 100); the remaining values restate the defaults explicitly.
lgbm_params = dict(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.2,
    n_estimators=300,
    objective='regression',
)
lgbm_model = LGBMRegressor(**lgbm_params)

# Both splits are passed as evaluation sets; the two residue columns are
# declared categorical.
lgbm_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid), (X_train, y_train)],
    feature_name=feature_names,
    categorical_feature=['wildtype', 'mutant'],
)

train_score = lgbm_model.score(X_train, y_train)
valid_score = lgbm_model.score(X_valid, y_valid)
print('train score:', train_score)
print('valid score:', valid_score)

# Rank correlation on the held-out rows.
P = lgbm_model.predict(X_valid)
print(spearmanr(P, y_valid))

In [None]:
# Predict on the held-out validation rows and report the Spearman rank
# correlation with their targets.
P = lgbm_model.predict(X_valid)
print(spearmanr(P,y_valid))

In [None]:
def gen_mutations(name, df,
                  wild=("VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQ"
                        "RVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGT"
                        "NAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKAL"
                        "GSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK")):
    """Annotate every sequence in ``df`` with its single edit relative to ``wild``.

    Args:
        name: Value written to the ``name`` column of every returned row.
        df: DataFrame with a ``protein_sequence`` column. Its index is kept
            as-is in the result.
        wild: Reference sequence each ``protein_sequence`` is compared against.

    Returns:
        A new DataFrame holding the columns of ``df`` plus:
        ``op`` ('replace', 'delete' or 'same'), ``idx`` (1-based edit position,
        0 for 'same'), ``wild`` (reference residue at that position, '' for
        'same'), ``mutant`` (new residue, '-' for a deletion, '' for 'same'),
        ``mut`` (``wild`` + ``idx`` + ``mutant`` joined as text) and ``name``.

    Raises:
        AssertionError: if a sequence differs from ``wild`` by more than one
            edit operation, or by an operation other than replace/delete.
    """
    result = []
    for _, r in df.iterrows():
        ops = Levenshtein.editops(wild, r.protein_sequence)
        # Raised explicitly (instead of `assert`) so the checks still run
        # when Python is started with -O.
        if len(ops) > 1:
            raise AssertionError(
                f"expected at most one edit relative to the wild type, got {len(ops)}")
        if not ops:
            result.append(['same', 0, '', ''])
            continue

        op, idx = ops[0][0], ops[0][1]
        if op == 'replace':
            result.append([op, idx + 1, wild[idx], r.protein_sequence[idx]])
        elif op == 'delete':
            result.append(['delete', idx + 1, wild[idx], '-'])
        else:
            raise AssertionError(f"Ups: unsupported edit operation {op!r}")

    # Reuse df's index: with a fresh RangeIndex, concat(axis=1) would align on
    # labels and scatter the annotations into NaN-padded rows whenever df does
    # not carry the default 0..n-1 index.
    annotations = pd.DataFrame(data=result,
                               columns=['op', 'idx', 'wild', 'mutant'],
                               index=df.index)
    df = pd.concat([df, annotations], axis=1)
    df['mut'] = annotations['wild'] + annotations['idx'].astype(str) + annotations['mutant']
    df['name'] = name
    return df

# Read the test CSV and annotate each sequence with its edit relative to the
# default wild-type sequence of gen_mutations.
df_test = gen_mutations('wildtypeA', pd.read_csv(CFG.TEST_CSV))

# Display the annotated table.
df_test

In [None]:
# Only substitution rows get model features.
replace_mask = df_test.op == 'replace'
df_test_repl = df_test.loc[replace_mask]

# Hand-built columns in the same order as the tail of the training rows:
# pH (a constant 8.0 for every test row), wild-type code point, mutant code point.
X_T = [
    [8.0, ord(row.wild), ord(row.mutant)]
    for _, row in df_test_repl.iterrows()
]

print(len(X_T))

In [None]:
# Load the stored test features, average them over axes 2, 3 and 4, and append
# the hand-built pH / residue columns on the right.
test_f = np.concatenate(
    [np.load(CFG.TEST_FEATURES).mean(axis=(2, 3, 4)), X_T],
    axis=1,
)
test_f.shape

In [None]:
# Peek at the first three assembled test rows.
test_f[:3]

In [None]:
# Predict with the most recently fitted lgbm_model on the assembled test rows.
test_y = lgbm_model.predict(test_f)

In [None]:
# replacement mutations
df_test.loc[df_test.op == 'replace', 'ddg'] = test_y
# deletion mutations
df_test.loc[df_test['op'] == "delete", 'ddg'] = df_test[df_test["op"]=="replace"]["ddg"].quantile(q=0.25)
# no mutations
df_test.loc[df_test['op'] == "same", 'ddg'] = 0.  

df_test.rename(columns={'ddg': 'tm'})[['seq_id', 'tm']].to_csv('submission.csv', index=False)
!head submission.csv

In [None]:
import Bio.PDB

In [None]:
!pip install BioPython 

In [None]:
pdbparser = Bio.PDB.PDBParser(QUIET=True)   # suppress PDBConstructionWarning
struct = pdbparser.get_structure('wildtypeA', '/content/wildtypeA/wildtypeA_relaxed_F164L_relaxed.pdb')

In [None]:
struct

In [None]:
!unzip /content/drive/MyDrive/work/projects/Novozymes/data/termonet/wildtypeA.zip 

In [None]:
!head '/content/wildtypeA/wildtypeA_relaxed_F164L_relaxed.pdb'

In [None]:
#BEGIN_POSE_ENERGIES_TABLE
from io import StringIO
import pandas as pd

def parse_score(pdb_file):
  """Read the pose-energies table embedded in a PDB file into a DataFrame.

  The table is the run of lines after the first line starting with
  '#BEGIN_POSE_ENERGIES_TABLE', up to (not including) the next line starting
  with '#END_POSE_ENERGIES_TABLE'. If no end marker follows, the last two
  lines of the file are dropped instead. The table is parsed as
  single-space-separated text whose first line is the header.

  Args:
    pdb_file: Path of the PDB file to read.

  Returns:
    pandas.DataFrame with one row per table line after the header.

  Raises:
    ValueError: if the file has no '#BEGIN_POSE_ENERGIES_TABLE' line, or the
      table is empty (pandas' EmptyDataError is a ValueError subclass).
  """
  begin_marker = '#BEGIN_POSE_ENERGIES_TABLE'
  end_marker = '#END_POSE_ENERGIES_TABLE'

  # Context manager so the file handle is closed even if reading fails.
  with open(pdb_file, 'r') as f:
    lines = f.readlines()

  begin = next((i for i, line in enumerate(lines) if line.startswith(begin_marker)), None)
  if begin is None:
    raise ValueError(f"{begin_marker} not found in {pdb_file!r}")

  end = next((i for i in range(begin + 1, len(lines)) if lines[i].startswith(end_marker)), None)
  if end is None:
    # No explicit end marker: fall back to trimming the last two lines.
    table_lines = lines[begin + 1:-2]
  else:
    table_lines = lines[begin + 1:end]

  return pd.read_csv(StringIO(''.join(table_lines)), sep=' ')

# Parse the energy table of one extracted PDB file (the F164L file under /content/wildtypeA).
df = parse_score('/content/wildtypeA/wildtypeA_relaxed_F164L_relaxed.pdb')
  

In [None]:
# Value of the 'total' column at row label 1 (presumably the whole-pose
# summary row of the energy table - TODO confirm against the file).
df.loc[1,'total']