# **1. Data loading**

## Imports

In [1]:
from os import path, sep, pardir, getcwd, listdir
from typing import AnyStr, List, Dict, Set, Any

from pandas import DataFrame, read_csv, concat

## Data loading class

*(Completely unnecessary and a bit chaotic)*

In [2]:
class LoadedData:
    """Locate the train / test / train-update CSV files and load them.

    The files are searched relative to the parent of the current working
    directory, following the layout ``<parent>/<*docs*>/<*data*>/<*tag*>``,
    where every level is matched case-insensitively by substring.
    """

    @staticmethod
    def _get_loading_paths(searched_tags: Set[AnyStr] = ('train', 'test', 'train_updates')) -> Dict[AnyStr, AnyStr]:
        """Return the path of the data file found for every searched tag.

        Args:
            searched_tags: substrings looked for in the (lower-cased) names of
                the entries of the data directory.

        Returns:
            A dict mapping ``'<tag>_df'`` to the path of the matching entry.
            Tags without any match are left out of the dict.

        Raises:
            FileExistsError: if the parent of the working directory does not
                contain exactly one entry whose name contains ``'docs'``, or
                that entry does not contain exactly one entry whose name
                contains ``'data'``.
        """

        def _find_single_entry(directory: str, keyword: str, error_message: str) -> str:
            """Return the path of the only entry of ``directory`` containing ``keyword``."""
            matches = [entry for entry in listdir(directory) if keyword in entry.lower()]
            if len(matches) != 1:
                raise FileExistsError(error_message)
            return path.join(directory, matches[0])

        # Paths are assembled with os.path.join so that the lookup works with
        # the separator of the running platform, not only with '\\'.
        parent_dir = path.normpath(path.join(getcwd(), pardir))
        docs_path = _find_single_entry(
            parent_dir, 'docs', 'No or to much of docs directory in the current working directory')
        data_path = _find_single_entry(
            docs_path, 'data', 'No or to much of data directory in the docs')

        data_entries = listdir(data_path)
        tag_dict = {}

        for tag in searched_tags:
            candidates = [entry for entry in data_entries if tag in entry.lower()]

            if candidates:
                # A tag such as 'train' is also a substring of the
                # 'train_updates' file name. Picking the shortest (then
                # alphabetically first) name makes the choice deterministic
                # instead of depending on the order returned by listdir().
                best_match = min(candidates, key=lambda entry: (len(entry), entry))
                tag_dict[f'{tag}_df'] = path.join(data_path, best_match)

        return tag_dict

    @staticmethod
    def _get_corrected_train_data(train_df: DataFrame, update_df: DataFrame) -> DataFrame:
        """Return the train data with the rows of the update file applied.

        Every row of ``update_df`` replaces the train row with the same index
        label. Before the replacement the update rows receive the
        ``data_source`` of the train row they replace, and update rows with a
        missing ``protein_sequence`` receive the sequence of that train row.

        Neither input frame is modified.

        Args:
            train_df: original train data.
            update_df: update rows; its index labels must exist in ``train_df``.

        Returns:
            The train rows that were not updated followed by the update rows.
        """
        corrected_updates = update_df.copy()
        corrected_updates['data_source'] = train_df.loc[corrected_updates.index, 'data_source']

        missing_sequence = corrected_updates.index[corrected_updates['protein_sequence'].isna()]
        corrected_updates.loc[missing_sequence, 'protein_sequence'] = \
            train_df.loc[missing_sequence, 'protein_sequence']

        untouched_train = train_df.drop(index=corrected_updates.index)
        return concat([untouched_train, corrected_updates])

    def __init__(self, initialzation: bool = True) -> None:
        """Resolve the data paths and read the CSV files.

        Args:
            initialzation: when false nothing is located or read and no
                attribute is set on the instance.
        """
        if initialzation:
            self.loading_paths_dict = self._get_loading_paths()
            self.test_df = read_csv(self.loading_paths_dict['test_df'], index_col="seq_id")
            self.train_df = self._get_corrected_train_data(
                read_csv(self.loading_paths_dict['train_df'], index_col="seq_id"),
                read_csv(self.loading_paths_dict['train_updates_df'], index_col="seq_id")
            )

## Class calling

In [3]:
# Instantiating LoadedData resolves the data paths and reads the CSV files right away.
Loaded_data = LoadedData()

## Verification of loading

### Train dataframe

In [4]:
Loaded_data.train_df.head()

Unnamed: 0_level_0,protein_sequence,pH,data_source,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5


### Test dataframe

In [5]:
Loaded_data.test_df.head()

Unnamed: 0_level_0,protein_sequence,pH,data_source
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes
31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes
31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes


# **Data cleaning**

## Data cleaning class

In [6]:
class CleanedData(LoadedData):
    """Load the data and keep only the columns and rows usable for training.

    After construction the instance exposes ``cleaned_train_data`` and
    ``cleaned_test_data``; the raw ``train_df`` / ``test_df`` attributes are
    deleted.
    """

    @staticmethod
    def _remove_empty_rows(df: DataFrame, col_name: str) -> None:
        """Drop, in place, the rows of ``df`` whose ``col_name`` value is missing."""
        df.drop(df[df[col_name].isna()].index, inplace=True)

    def __init__(self, target_column: str = 'tm') -> None:
        """Load the data, drop the irrelevant columns and the unlabeled rows.

        A test column (other than ``protein_sequence``) is considered
        irrelevant when none of the train values of that column occurs among
        the test values of the same column.

        Args:
            target_column: train column that must not be missing; train rows
                without a value in it are removed.
        """
        super().__init__()

        # Collect every irrelevant column first and drop them in one go. The
        # cleaned frames are therefore always created (even when nothing has
        # to be dropped) and dropping one column never undoes an earlier drop.
        irrelevant_columns = [
            col_name for col_name in self.test_df.columns
            if col_name != 'protein_sequence'
            and not self.train_df[col_name].isin(self.test_df[col_name].unique()).any()
        ]

        self.cleaned_train_data = self.train_df.drop(columns=irrelevant_columns)
        self.cleaned_test_data = self.test_df.drop(columns=irrelevant_columns)

        self._remove_empty_rows(self.cleaned_train_data, target_column)

        del self.test_df, self.train_df

## Class initialization

In [7]:
# CleanedData() calls LoadedData.__init__ itself, so the CSV files are read again here.
Cleaned_data = CleanedData()

## Verification of cleaning

### Train data

In [8]:
Cleaned_data.cleaned_train_data.head()

Unnamed: 0_level_0,protein_sequence,pH,tm
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,75.7
1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,50.5
2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,40.5
3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,47.2
4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,49.5


In [9]:
Cleaned_data.cleaned_train_data.shape

(28981, 3)

### Test data

In [10]:
Cleaned_data.cleaned_test_data.head()

Unnamed: 0_level_0,protein_sequence,pH
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1
31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8
31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8
31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8
31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8
31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8


In [11]:
Cleaned_data.cleaned_test_data.shape

(2413, 2)

# **Feature engineering**

## Necessary imports

In [37]:
from protlearn.features import aac, entropy, aaindex1, atc
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, Normalizer, LabelEncoder
from typing import List, AnyStr, Any
from numpy import ndarray, array, concatenate, expand_dims, nanmean, where, isnan, take, unique, argsort, split

## Feature engineering class

In [15]:
class FeatureData(CleanedData):
    """Build the feature matrices of the cleaned train and test data.

    Every callable attribute whose name starts with ``_ft`` is a feature
    step. ``__init__`` discovers the steps with ``dir()``, i.e. in
    alphabetical order, so the ``_ftc*`` steps run before the ``_ftn*`` steps
    and ``_ftn1`` (filling missing values) runs before ``_ftn2`` (exporting
    the numeric columns). The results are concatenated column-wise in that
    same order.

    After construction ``data_dict`` holds ``original_train_indices``,
    ``original_test_indices``, ``train_y``, ``train_X`` and ``test_X``.
    """

    __ami_symbols = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                     'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    __seq_col = 'protein_sequence'
    __target_column = 'tm'

    @staticmethod
    def _ftn1_fill_na_mean(df: DataFrame, *args: Any, col_name: AnyStr = 'pH') -> None:
        """Replace the missing values of ``col_name`` by the column mean, in place.

        Returns ``None``, so the step contributes no feature column.
        """
        # The column is assigned back instead of calling fillna(inplace=True)
        # on ``df[col_name]``: the chained form works on an intermediate
        # object and does not update ``df`` under pandas Copy-on-Write.
        df[col_name] = df[col_name].fillna(float(df[col_name].mean()))

    @staticmethod
    def _ftn2_convert_num_columns(df: DataFrame, col_names: List[AnyStr], *args: Any) -> ndarray:
        """Return the columns ``col_names`` of ``df`` as an array.

        Returns ``None`` when ``col_names`` is empty.
        """
        if len(col_names) > 0:
            return df[col_names].to_numpy()

    @staticmethod
    def _ftc1_get_seq_length(df: DataFrame, *args: Any, col_name: AnyStr = __seq_col) -> ndarray:
        """Return the length of every sequence as a single-column array."""
        return expand_dims(df[col_name].map(len).to_numpy(), 1)

    @staticmethod
    def _ftc2_get_count_symb(df: DataFrame, *args: Any, col_name: AnyStr = __seq_col,
                             amino_res_list: List[AnyStr] = __ami_symbols) -> ndarray:
        """Return, per sequence, the number of occurrences of every symbol.

        The result has one row per sequence and one column per entry of
        ``amino_res_list``.
        """
        counts_per_symbol = [df[col_name].map(lambda seq: seq.count(amino_symbol)).tolist()
                             for amino_symbol in amino_res_list]
        return array(counts_per_symbol).T

    @staticmethod
    def _ftc3_get_aac(df: DataFrame, *args: Any, col_name: AnyStr = __seq_col) -> ndarray:
        """Return the first output of protlearn's ``aac`` (``method='relative'``).

        Raises:
            ValueError: if the second output of ``aac`` is empty.
        """
        aac_arr, aac_base = aac(df[col_name].to_list(), method='relative')
        if len(aac_base) == 0:
            # The error is raised instead of returned, otherwise the exception
            # object would be handed to the concatenation as if it were data.
            raise ValueError('Protein sequance base is zero')
        return aac_arr

    @staticmethod
    def _ftc4_get_atc_atom_arr(df: DataFrame,  *args: Any, col_name: AnyStr = __seq_col) -> ndarray:
        """Return the first output of protlearn's ``atc`` (``method='relative'``).

        Raises:
            ValueError: if that output is ``None``.
        """
        atc_arr, _ = atc(df[col_name].to_list(), method='relative')
        if atc_arr is None:
            raise ValueError('Protein sequance base is zero')
        return atc_arr

    @staticmethod
    def _ftc4_get_atc_bonds_arr(df: DataFrame,  *args: Any, col_name: AnyStr = __seq_col) -> ndarray:
        """Return the second output of protlearn's ``atc`` (``method='relative'``).

        Raises:
            ValueError: if that output is ``None``.
        """
        _, bonds_arr = atc(df[col_name].to_list(), method='relative')
        if bonds_arr is None:
            raise ValueError('Protein sequance base is zero')
        return bonds_arr

    @staticmethod
    def _ftc5_get_aaindex(df: DataFrame,  *args: Any, col_name: AnyStr = __seq_col) -> ndarray:
        """Return the first output of protlearn's ``aaindex1``.

        Raises:
            ValueError: if the second output of ``aaindex1`` is ``None``.
        """
        aaindex_arr, inds = aaindex1(df[col_name].to_list())
        if inds is None:
            raise ValueError('Protein sequance base is zero')
        return aaindex_arr

    @staticmethod
    def _ftc6_get_entropy(df: DataFrame, *args: Any, col_name: AnyStr = __seq_col) -> ndarray:
        """Return the output of protlearn's ``entropy`` (``standardize='none'``)."""
        return entropy(df[col_name].to_list(), standardize='none')

    def _join_arr(self, func_list: List[AnyStr], df: DataFrame, *args: Any) -> ndarray:
        """Run the named feature steps on ``df`` and stack their results.

        Every step is called as ``step(df, self.num_cols_list)``. Steps that
        return ``None`` are skipped; the other results are concatenated along
        axis 1 in the order of ``func_list``.
        """
        func_val_list = []

        for function in func_list:
            funct_value = getattr(self, function)(df, self.num_cols_list)
            if funct_value is not None:
                func_val_list.append(funct_value)

        return concatenate(func_val_list, axis=1)

    def __init__(self):
        """Load and clean the data, then compute ``train_X``, ``test_X`` and ``train_y``."""
        super().__init__()

        self.data_dict = {'original_train_indices': self.cleaned_train_data.index.tolist(),
                          'original_test_indices': self.cleaned_test_data.index.tolist()}
        self.cleaned_train_data.reset_index(inplace=True)
        self.cleaned_test_data.reset_index(inplace=True)
        self.data_dict['train_y'] = self.cleaned_train_data[self.__target_column]
        self.cleaned_train_data.drop(self.__target_column, axis=1, inplace=True)

        # Train rows come first in the stacked frame, test rows follow.
        train_row_count = len(self.cleaned_train_data)
        combined_data = concat([self.cleaned_train_data, self.cleaned_test_data], ignore_index=True)

        self.cat_col_list = [col_name for col_name in self.cleaned_train_data.columns
                             if self.cleaned_train_data[col_name].nunique() < 10
                             and self.cleaned_train_data[col_name].dtype == "object"]

        # Numeric columns, without the identifier column added by reset_index.
        self.num_cols_list = [col_name for col_name in self.cleaned_train_data.columns
                              if self.cleaned_train_data[col_name].dtype in ['int64', 'float64']
                              and 'id' not in col_name]

        self.__static_method_list = [method for method in dir(self)
                                     if method.startswith('_ft') and callable(getattr(self, method))]

        featured_data = self._join_arr(self.__static_method_list, combined_data)

        # Split by position in the stacked matrix. Selecting the test part
        # with the test frame's own positions 0..n_test-1 would return the
        # first train rows instead of the test rows.
        self.data_dict['train_X'] = featured_data[:train_row_count, :]
        self.data_dict['test_X'] = featured_data[train_row_count:, :]

## Class initialization

In [16]:
# FeatureData() runs the inherited loading and cleaning again before computing the features.
featured_data = FeatureData()

## Verification of feature addition

In [20]:
featured_data.data_dict['train_X'][:10]

array([[341.        ,  45.        ,   1.        , ...,   3.13219941,
          3.88601425,   7.        ],
       [286.        ,  28.        ,   0.        , ...,   2.85120629,
          3.83666584,   7.        ],
       [497.        ,  50.        ,   9.        , ...,   2.9844165 ,
          4.07601297,   7.        ],
       ...,
       [301.        ,  15.        ,  16.        , ...,   3.55916611,
          4.13747432,   7.        ],
       [287.        ,  41.        ,   1.        , ...,   2.93788153,
          3.93796834,   7.        ],
       [163.        ,  14.        ,   2.        , ...,   3.55817178,
          4.06176195,   7.        ]])

In [21]:
featured_data.data_dict['train_X'].shape

(28981, 604)

In [25]:
x = featured_data.data_dict['train_X'][:, -1]

In [26]:
x

array([7. , 7. , 7. , ..., 2.7, 2.7, 7. ])

In [30]:
u_x = unique(x)

In [32]:
# Reorder the rows of train_X by ascending value of their last column.
a = featured_data.data_dict['train_X'][featured_data.data_dict['train_X'][:, -1].argsort()]

In [35]:
a[:10]

array([[164.        ,  15.        ,   2.        , ...,   3.00312195,
          4.10894552,   1.99      ],
       [164.        ,  14.        ,   2.        , ...,   3.04633537,
          4.10354264,   1.99      ],
       [164.        ,  15.        ,   2.        , ...,   2.99373171,
          4.09901965,   2.        ],
       ...,
       [164.        ,  15.        ,   2.        , ...,   2.98427439,
          4.1016811 ,   2.        ],
       [164.        ,  15.        ,   2.        , ...,   2.99126829,
          4.09982048,   2.        ],
       [164.        ,  15.        ,   3.        , ...,   3.00975   ,
          4.11250084,   2.        ]])

In [38]:
# Group column 0 of `a` by the value of the last column: unique(..., return_index=True)[1]
# holds the first row of every distinct value, and dropping its first entry leaves the
# cut points for split(). This relies on `a` being sorted by its last column.
split(a[:, 0], unique(a[:, -1], return_index=True)[1][1:])

[array([164., 164.]),
 array([ 164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,
         164.,  164.,  164.,  104.,  100.,  164.,  164.,  164.,  164.,
         164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,
         164.,  164.,  164., 2477.,  164.,  164.,  164.,  164.,  164.,
         164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,
         164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,
         164.,  164.,  164.,  164., 2477.,  164.,  164.,  164.,  164.,
         100.,  100.,  164.,  105.,  164.,  164.,  164.,  164.,  164.,
         164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,  164.,
         164.,  164.,  100.,  100.,  100.,  164.,  164.,  164.,  100.,
         164.,  164.,  164.,  164.,  100.,  164.,  164.,  164.,  100.,
         164.]),
 array([164., 164.]),
 array([164., 164., 164.]),
 array([164., 164., 164., 109., 109., 109., 109., 109.]),
 array([148., 148., 148., 148., 164., 148., 164., 148.]),
 array([148., 

# Data preprocessing

## Necessary imports

## Data preprocessing class

In [22]:
class PreprocessedData(FeatureData):
    """Placeholder for the preprocessing stage.

    No preprocessing step is implemented yet: constructing an instance only
    runs the inherited ``FeatureData`` initialisation.
    """

    def __init__(self):
        super().__init__()

## Class initialization

In [23]:
# PreprocessedData adds nothing to FeatureData.__init__, so this re-runs the whole inherited pipeline.
preprocessed_data = PreprocessedData()

## Verification of preprocessing

In [24]:
preprocessed_data.data_dict['train_X'][:10]

array([[341.        ,  45.        ,   1.        , ...,   3.13219941,
          3.88601425,   7.        ],
       [286.        ,  28.        ,   0.        , ...,   2.85120629,
          3.83666584,   7.        ],
       [497.        ,  50.        ,   9.        , ...,   2.9844165 ,
          4.07601297,   7.        ],
       ...,
       [301.        ,  15.        ,  16.        , ...,   3.55916611,
          4.13747432,   7.        ],
       [287.        ,  41.        ,   1.        , ...,   2.93788153,
          3.93796834,   7.        ],
       [163.        ,  14.        ,   2.        , ...,   3.55817178,
          4.06176195,   7.        ]])

# Model training

## Necessary imports

## Model training class

## Class initialization