# Load Packages

In [None]:
import numpy as np                                      # linear algebra
import pandas as pd                                     # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
import os
# Print the full path of every file found anywhere under /kaggle so the
# available inputs are visible in the notebook output.
for folder, _, names in os.walk('/kaggle'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)


In [None]:
# Load the competition files: the training set, the test set to predict on,
# the sample submission template, and the updates file that is later passed
# to clean_train_df together with `train`.
train = pd.read_csv("/kaggle/input/novozymes-enzyme-stability-prediction/train.csv")
test = pd.read_csv("/kaggle/input/novozymes-enzyme-stability-prediction/test.csv")
sample = pd.read_csv("/kaggle/input/novozymes-enzyme-stability-prediction/sample_submission.csv")
updates = pd.read_csv("/kaggle/input/novozymes-enzyme-stability-prediction/train_updates_20220929.csv")

In [None]:
# Report the (rows, columns) shape of each loaded frame.
for label, frame in (("train:", train), ("test:", test), ("updates:", updates)):
    print(label, frame.shape)

In [None]:
# Store the character length of every protein sequence in a new column
# (visualization_train_df and clean_train_df both read `length_str`),
# then display summary statistics of the numeric columns.
train["length_str"] = train['protein_sequence'].str.len()
train.describe()

#!# Not all protein sequences have length 221 or 220

# Functions

## Visualization functions

In [None]:
def visualization_train_df(dataframe):
    """Show the exploratory plots for a training frame.

    Four figures are drawn and shown one after another, each preceded by a
    printed banner:

    1. a horizontal boxplot of ``tm`` stacked above a histogram of ``tm``
       (the two axes share the x axis),
    2. a 50-bin histogram of ``pH``,
    3. a scatter plot with ``tm`` on x and ``pH`` on y,
    4. a 50-bin histogram of ``length_str``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``tm``, ``pH`` and ``length_str``.

    Returns
    -------
    None
    """
    #sns.set(style="darkgrid")
    # Figure with two stacked axes: a thin one for the boxplot and a tall one
    # for the histogram, sharing the x axis so both use the same tm scale.
    f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
    print("-"*10,"Boxplot TM and Histogram TM","-"*10)
    # Pass the series as `x=` explicitly: a positional argument is deprecated
    # in seaborn 0.11 and interpreted as `data` from 0.12 on, which can flip
    # the box orientation and break the shared x axis.
    sns.boxplot(x=dataframe["tm"], ax=ax_box)
    sns.histplot(data=dataframe, x="tm", ax=ax_hist)

    # Remove x axis name for the boxplot
    ax_box.set(xlabel='')
    plt.show()

    # Histogram of pH
    print("\n"+"-"*10,"Histograma PH","-"*10)
    dataframe.pH.plot(kind="hist", bins= 50)
    plt.show()

    # Scatter plot of tm (x) against pH (y)
    print("\n"+"-"*10,"SCATTER ph vs tm","-"*10)
    dataframe.plot.scatter(y="pH",x="tm")
    plt.show()

    # Histogram of protein sequence length
    print("\n"+"-"*10,"Histograma LEN protein sequence","-"*10)
    dataframe.length_str.plot(kind='hist',bins=50)
    plt.show()

## Update functions

In [None]:
def clean_train_df(train_df, train_updates):
    """Apply the corrections listed in ``train_updates`` to ``train_df``.

    Each row of ``train_updates`` is matched to ``train_df`` on ``seq_id``:

    * if its ``tm`` is null, the matching training row is removed;
    * otherwise the matching training row gets its ``pH`` and ``tm`` values
      swapped.

    Training rows with no match in ``train_updates`` are returned unchanged.
    Neither input frame is modified.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain ``seq_id``, ``pH`` and ``tm``.
    train_updates : pandas.DataFrame
        Update records; must contain ``seq_id`` and ``tm``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order, as
        ``train_df``.
    """
    # Build the flag on a private copy so the caller's frame does not gain
    # an "update_flag" column as a side effect.
    flags = train_updates[["seq_id", "tm"]].copy()
    flags["update_flag"] = np.where(flags["tm"].isnull(), "drop", "swap")

    # Left merge: rows without an update get NaN as their flag.
    merged = train_df.merge(flags[["seq_id", "update_flag"]], how="left", on="seq_id")

    # NaN != "drop" is True, so un-flagged rows survive this filter.
    # .copy() makes the result an independent frame, so the assignments
    # below do not write into a slice of `merged`.
    merged = merged[merged["update_flag"] != "drop"].copy()

    # Compute both swapped columns before assigning either, so the second
    # one still reads the original values.
    swap_mask = merged["update_flag"] == "swap"
    swapped_tm = np.where(swap_mask, merged["pH"], merged["tm"])
    swapped_ph = np.where(swap_mask, merged["tm"], merged["pH"])
    merged["tm"] = swapped_tm
    merged["pH"] = swapped_ph

    # Drop the helper column and restore the caller's column order.
    return merged[list(train_df.columns)]

# EDA

In [None]:
# Plot the training data as loaded, before clean_train_df is applied.
visualization_train_df(train)

In [None]:
# Apply the updates file to the training data (row removals and pH/tm swaps),
# then redraw the same plots on the corrected frame for comparison.
train = clean_train_df(train,updates)
visualization_train_df(train)

In [None]:
# Compare the number of distinct protein sequences with the number of rows;
# the difference is how many rows repeat a sequence already seen.
uniques = len(train.protein_sequence.unique())
total   = len(train)
print(f"Unicos: {uniques} de un Total: {total}")
print(f"Diferencia: {total-uniques}")

In [None]:
# 1606 protein sequences are duplicated, but only 64 of them share the same pH and the same tm
# Summary statistics of the rows that repeat an earlier
# (protein_sequence, tm, pH) combination.
train[train[['protein_sequence','tm','pH']].duplicated()].describe()

# Split Data

In [None]:
def split_data(df, test_size=0.25, random_state=123):
    """Split ``df`` into train and hold-out parts, with ``df.tm`` as target.

    The whole frame is passed as the feature set, so the returned feature
    frames still contain the ``tm`` column; the caller has to drop it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tm`` column.
    test_size : float, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int, default 123
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, in the order that
        ``train_test_split`` produces them.
    """
    # train_test_split returns both feature splits first, then both target
    # splits; the local names follow that order.
    X_train, X_test, y_train, y_test = train_test_split(
        df, df.tm, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# split_data returns its tuple in train_test_split order:
# features-train, features-test, target-train, target-test.
X_train, X_test , y_train, y_test = split_data(train)
# The feature frames are splits of the full frame, so remove the target
# column from them.
X_train.drop('tm', axis=1, inplace=True)
X_test.drop('tm', axis=1, inplace=True)

# Feature Extraction

In [None]:
def feature_extraction(df, amino_acids=("A", "M", "E", "N", "S", "V")):
    """Return a copy of ``df`` with sequence-derived feature columns added.

    Added columns:

    * ``len_protein`` - number of characters in ``protein_sequence``;
    * ``unique_protein`` - number of distinct characters in the sequence;
    * ``num_<X>`` - occurrences of residue ``X`` in the sequence, one column
      per entry of ``amino_acids``.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``protein_sequence``.
    amino_acids : iterable of str, default ("A", "M", "E", "N", "S", "V")
        Residue codes to count. Each one is matched literally.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the feature columns.
    """
    import re  # local import: only needed to escape the residue codes

    sequences = df["protein_sequence"]
    # Work on a copy so the caller's frame (possibly a slice of a larger
    # frame) is not modified.
    features = df.copy()

    features["len_protein"] = sequences.str.len()
    features["unique_protein"] = sequences.apply(lambda seq: len(set(seq)))

    # Series.str.count treats its argument as a regex, so escape the code to
    # count it as a literal string.
    for residue in amino_acids:
        features[f"num_{residue}"] = sequences.str.count(re.escape(residue))

    return features

# Add the sequence-derived feature columns to both splits.
X_train = feature_extraction(X_train)
X_test = feature_extraction(X_test)

In [None]:
#letra = "AAABC"
# Display the training features to inspect the newly added columns.
X_train

In [None]:
# Drop the columns that are not passed to the model: the data source, the
# raw sequence, and length_str (feature_extraction already added the
# sequence length as len_protein).
X_train = X_train.drop(["data_source", "protein_sequence","length_str"], axis =1)
X_test = X_test.drop(["data_source", "protein_sequence","length_str"], axis =1)

# Show the first row to check the remaining feature columns.
X_train.head(1)

In [None]:
#from sklearn import preprocessing
#lbl = preprocessing.OrdinalEncoder()
#train['protein_sequence'] = lbl.fit_transform(train['protein_sequence'].astype(str))
#test['protein_sequence'] = lbl.fit_transform(test['protein_sequence'].astype(str))

In [None]:
# Wrap the training features and target in a DMatrix; the hold-out features
# get their own DMatrix without labels.
dtrain = xgb.DMatrix(X_train, 
                     label = y_train, )
dtest = xgb.DMatrix(X_test)
# specify parameters via map
# NOTE(review): eta=1 with max_depth=15 and only 2 boosting rounds looks like
# a smoke-test configuration rather than tuned values - confirm.
param = {'max_depth':15, 'eta':1, 'objective' :'reg:squarederror' }
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
# NOTE(review): these hold-out predictions are never compared with y_test in
# the visible code, and `preds` is overwritten by the test-set predictions
# later on.
preds = bst.predict(dtest)

In [None]:
# Build the same sequence features for the competition test set and drop the
# columns that were also removed from the training features.
# NOTE(review): length_str was only added to `train`, so presumably `test`
# has no such column to drop - confirm the resulting columns match X_train.
test = feature_extraction(test)
test = test.drop(["protein_sequence", "data_source"], axis =1)
test.head()

In [None]:
# Predict on the competition test set with the trained booster; this
# replaces the earlier hold-out predictions stored in `preds`.
last_test = xgb.DMatrix(test)
preds = bst.predict(last_test)

In [None]:
# Write the predictions into the submission template.
# NOTE(review): assumes the rows of `sample` are in the same order as the
# rows of `test` - confirm.
sample["tm"] = preds

In [None]:
# Histogram (100 bins) of the predicted tm values as a quick sanity check.
sample.tm.plot(kind ="hist", bins =100)

In [None]:
# Save the submission file in the working directory, without the index column.
sample.to_csv("submission.csv", index = False)

In [None]:
# List every file under /kaggle again, e.g. to check that submission.csv
# was written.
for folder, _, names in os.walk('/kaggle'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)