In [1]:
import pandas as pd

In [18]:
def make_mutation(sequence, mutation):
    """Apply a single-residue substitution to a sequence.

    Parameters
    ----------
    sequence : str or iterable of residues
        Sequence to mutate, one element per residue.
    mutation : str
        Substitution written as ``<wild residue><1-based position><new residue>``,
        for example ``"M226K"``.

    Returns
    -------
    tuple
        ``(True, mutated_sequence)`` with ``mutated_sequence`` as a ``str`` when
        the residue found at ``position`` equals the wild residue.
        ``(False, [])`` when the residue does not match, when the position lies
        outside ``1..len(sequence)``, or when ``mutation`` cannot be parsed.
    """
    # A valid code needs a wild residue, at least one digit and a new residue.
    if len(mutation) < 3:
        return False, []

    wild_residue = mutation[0]
    mutation_residue = mutation[-1]
    try:
        position = int(mutation[1:-1])
    except ValueError:
        # Non-numeric position: report it as a failed mutation, not a crash.
        return False, []

    residues = list(sequence)

    # Positions are 1-based. Without this check, position 0 (or a negative one)
    # would wrap around via negative indexing and silently mutate a residue
    # near the end, while a position past the end would raise IndexError.
    if not 1 <= position <= len(residues):
        return False, []

    target = position - 1
    if residues[target] != wild_residue:
        return False, []

    residues[target] = mutation_residue
    return True, "".join(map(str, residues))

In [22]:
def produce_df_sequences(df_mutations, df_sequences):
    """Pair every mutation with the sequence of its protein.

    Parameters
    ----------
    df_mutations : pandas.DataFrame
        Must provide the columns ``"variation"``, ``"gi"`` and ``"solubility"``.
    df_sequences : pandas.DataFrame
        Must provide the columns ``"gi"`` and ``"seq"``. When several rows share
        a ``gi``, the first one is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_process_sequences, df_process_sequences_error)``. The first frame
        has the columns ``gi, variation, sequence, response`` for mutations that
        ``make_mutation`` accepted. The second has ``gi, variation, type_error``
        for the ones rejected, with ``type_error`` being ``"Not Found gi"`` or
        ``"Not Found mutation"``.
    """
    matrix_data = []
    matrix_data_error = []

    sequence_gis = df_sequences["gi"]

    # Iterate by position, not by index label: a label lookup such as
    # df["col"][index] returns a Series when labels are duplicated
    # (e.g. after an un-reset concat), which would break the comparisons below.
    mutation_rows = zip(
        df_mutations["variation"],
        df_mutations["gi"],
        df_mutations["solubility"],
    )

    for variation, gi, response in mutation_rows:
        matches = df_sequences.loc[sequence_gis == gi, "seq"]
        if matches.empty:
            matrix_data_error.append([gi, variation, "Not Found gi"])
            continue

        # First matching row, read positionally so the filtered slice is never
        # mutated in place.
        sequence = matches.iloc[0]

        is_valid, _mutated_sequence = make_mutation(sequence, variation)
        if not is_valid:
            matrix_data_error.append([gi, variation, "Not Found mutation"])
            continue

        # NOTE(review): the stored sequence is the unmodified one; the mutated
        # sequence returned by make_mutation is only used as a validity check.
        # Confirm this is the intended export.
        matrix_data.append([gi, variation, sequence, response])

    df_process_sequences = pd.DataFrame(
        data=matrix_data,
        columns=["gi", "variation", "sequence", "response"],
    )
    df_process_sequences_error = pd.DataFrame(
        data=matrix_data_error,
        columns=["gi", "variation", "type_error"],
    )

    return df_process_sequences, df_process_sequences_error


In [2]:
# Load the reference protein sequences. produce_df_sequences reads the "gi"
# and "seq" columns of this frame.
df_sequences = pd.read_csv("../../raw_data/PON-Sol2/all_protein_sequences.csv")
df_sequences.head(5)

Unnamed: 0,num,gi,name,seq
0,0,109731493,>AAI14500.1 Tryptophan hydroxylase 2 [Homo sap...,MQPAMMMFSSKYWARRGFSLDSAVPEEHQLLGSSTLNKPNSGKNDD...
1,1,11133565,>sp|Q9NPJ1.1|MKKS_HUMAN RecName: Full=McKusick...,MSRLEAKKPSLCKSEPLTTERVRTTLSVLKRIVTSCYGPSGRLKQL...
2,2,116242717,>sp|Q13563.3|PKD2_HUMAN RecName: Full=Polycyst...,MVNSSRVQPQQPGDAKRPPAPRAPDPGRLMAGCAAVGASLAAPGGL...
3,3,116666949,">pdb|2CY8|A Chain A, D-phenylglycine Aminotran...",MSILNDYKRKTEGSVFWAQRARSVMPDGVTADTRAFDPHGLFISDA...
4,4,117391,>sp|P12271.2|RLBP1_HUMAN RecName: Full=Retinal...,MSEGVGTFRMVPEEEQELRAQLEQLTTKDHGPVFGPCSQLPRHTLQ...


In [3]:
# Load the training split: one mutation per row, with its variation code,
# solubility label and the gi of the protein it belongs to.
df_train = pd.read_csv("../../raw_data/PON-Sol2/train_dataset.csv")
df_train.head(5)

Unnamed: 0,index,variation,solubility,gi
0,0,M226K,-1,117391
1,1,R151Q,-1,117391
2,2,R234W,0,117391
3,3,P193F,-1,117544
4,4,P193G,-1,117544


In [6]:
# (rows, columns) of the training split.
df_train.shape

(5666, 4)

In [4]:
# Load the first test split; same columns as the training split.
df_test1 = pd.read_csv("../../raw_data/PON-Sol2/test1_dataset.csv")
df_test1.head(5)

Unnamed: 0,index,variation,solubility,gi
0,25,K113F,0,145563
1,26,W130F,-1,145563
2,27,W140C,-1,145563
3,28,W140F,1,145563
4,29,W140K,-1,145563


In [8]:
# (rows, columns) of the first test split.
df_test1.shape

(46, 4)

In [5]:
# Load the second test split; same columns as the training split.
df_test2 = pd.read_csv("../../raw_data/PON-Sol2/test2_dataset.csv")
df_test2.head(5)

Unnamed: 0,index,variation,solubility,gi
0,34,L165P,-1,299464
1,40,S302N,1,633678
2,77,K176P,1,1708279
3,82,R49C,-1,1732063
4,85,M63K,0,2506127


In [9]:
# (rows, columns) of the second test split.
df_test2.shape

(662, 4)

In [23]:
# Attach a sequence to every mutation of each split. Each call returns the
# accepted rows and, separately, the rows that could not be processed.
df_process_sequences_train, df_process_sequences_error_train = produce_df_sequences(df_train, df_sequences)
df_process_sequences_test1, df_process_sequences_error_test1 = produce_df_sequences(df_test1, df_sequences)
df_process_sequences_test2, df_process_sequences_error_test2 = produce_df_sequences(df_test2, df_sequences)


In [24]:
# Rows here are training mutations rejected by produce_df_sequences
# ("Not Found gi" or "Not Found mutation"); zero rows means none were rejected.
df_process_sequences_error_train.shape

(0, 3)

In [25]:
# Rejected mutations of the first test split; zero rows means none were rejected.
df_process_sequences_error_test1.shape

(0, 3)

In [26]:
# Rejected mutations of the second test split; zero rows means none were rejected.
df_process_sequences_error_test2.shape

(0, 3)

In [27]:
# Stack the three processed splits row-wise into a single frame.
# NOTE(review): ignore_index is not set, so each split keeps its own index
# labels and df_concat ends up with repeated labels — avoid label-based
# lookups on it.
df_concat = pd.concat([df_process_sequences_train, df_process_sequences_test1, df_process_sequences_test2], axis=0)
df_concat.shape

(6374, 4)

In [28]:
# Preview the combined frame before exporting it.
df_concat.head(5)

Unnamed: 0,gi,variation,sequence,response
0,117391,M226K,MSEGVGTFRMVPEEEQELRAQLEQLTTKDHGPVFGPCSQLPRHTLQ...,-1
1,117391,R151Q,MSEGVGTFRMVPEEEQELRAQLEQLTTKDHGPVFGPCSQLPRHTLQ...,-1
2,117391,R234W,MSEGVGTFRMVPEEEQELRAQLEQLTTKDHGPVFGPCSQLPRHTLQ...,0
3,117544,P193F,MNPYQNKNEYETLNASQKKLNISNNYTRYPIENSPKQLLQSTNYKD...,-1
4,117544,P193G,MNPYQNKNEYETLNASQKKLNISNNYTRYPIENSPKQLLQSTNYKD...,-1


In [29]:
# Write the combined frame to disk; index=False leaves the row index out of the CSV.
# NOTE(review): the target directory is not created here — presumably it already exists.
df_concat.to_csv("../../process_data/mutations_categorical/PON-Sol2_export_data.csv", index=False)