In [1]:
import pandas as pd

In [41]:
def make_mutation(sequence, mutation):
    """Check a point mutation against a sequence and apply (or revert) it.

    ``mutation`` is parsed as ``<residue><position><residue>``: the first
    character is the wild-type residue, the last character is the mutant
    residue, and everything in between is the 1-based position.

    Parameters
    ----------
    sequence : str or sequence of residues
        The sequence to check and edit. It is not modified in place.
    mutation : str
        Mutation code such as ``"L165P"``.

    Returns
    -------
    tuple
        ``(True, new_sequence)`` when the residue at ``position`` equals the
        wild-type residue (it is replaced by the mutant residue) or equals the
        mutant residue (it is replaced by the wild-type residue).
        ``new_sequence`` is always a ``str``.
        ``(False, [])`` when the residue matches neither, when the position is
        not an integer, or when it falls outside ``1..len(sequence)``.
    """
    # Need at least one wild residue, one position digit and one mutant residue.
    if len(mutation) < 3:
        return False, []

    wild_residue = mutation[0]
    mutation_residue = mutation[-1]
    try:
        position = int(mutation[1:-1])
    except ValueError:
        # Non-numeric position: report a failed check instead of raising.
        return False, []

    index = position - 1
    # Explicit bounds check: without it a position of 0 (or a negative one)
    # would silently wrap around to the end of the sequence, and a position
    # past the end would raise IndexError.
    if not 0 <= index < len(sequence):
        return False, []

    residues = list(sequence)
    current_residue = residues[index]
    if current_residue == wild_residue:
        residues[index] = mutation_residue
    elif current_residue == mutation_residue:
        # The sequence already carries the mutant residue: revert it.
        residues[index] = wild_residue
    else:
        return False, []

    return True, "".join(map(str, residues))

In [22]:
def produce_df_sequences(df_mutations, df_sequences):
    """Pair every mutation row with its protein sequence and validate it.

    Parameters
    ----------
    df_mutations : pandas.DataFrame
        Must provide the columns ``"variation"``, ``"gi"`` and
        ``"solubility"``.
    df_sequences : pandas.DataFrame
        Must provide the columns ``"gi"`` and ``"seq"``. When a ``gi`` occurs
        more than once, the first row is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_process_sequences, df_process_sequences_error)``.
        The first frame has the columns ``gi, variation, sequence, response``
        and holds the rows whose mutation passed ``make_mutation``.
        The second frame has the columns ``gi, variation, type_error`` where
        ``type_error`` is ``"Not Found gi"`` or ``"Not Found mutation"``.
    """
    # Build the gi -> sequence lookup once instead of filtering df_sequences
    # for every mutation row; setdefault keeps the first sequence seen per gi.
    sequence_by_gi = {}
    for gi_key, seq in zip(df_sequences["gi"], df_sequences["seq"]):
        sequence_by_gi.setdefault(gi_key, seq)

    matrix_data = []
    matrix_data_error = []

    # Iterate by position rather than by index label so that frames with
    # repeated index labels (e.g. the result of pd.concat) are handled
    # row by row.
    rows = zip(
        df_mutations["variation"],
        df_mutations["gi"],
        df_mutations["solubility"],
    )
    for variation, gi, response in rows:
        if gi not in sequence_by_gi:
            matrix_data_error.append([gi, variation, "Not Found gi"])
            continue

        sequence = sequence_by_gi[gi]
        response_check_seq = make_mutation(sequence, variation)
        if response_check_seq[0]:
            # NOTE(review): the edited sequence returned by make_mutation is
            # discarded; the row stores the unmodified sequence from
            # df_sequences - confirm this is intended.
            matrix_data.append([gi, variation, sequence, response])
        else:
            matrix_data_error.append([gi, variation, "Not Found mutation"])

    df_process_sequences = pd.DataFrame(
        data=matrix_data,
        columns=["gi", "variation", "sequence", "response"],
    )
    df_process_sequences_error = pd.DataFrame(
        data=matrix_data_error,
        columns=["gi", "variation", "type_error"],
    )

    return df_process_sequences, df_process_sequences_error

In [12]:
# Load the protein sequence table; produce_df_sequences reads its "gi" and
# "seq" columns.
df_sequences = pd.read_excel("../../raw_data/DeepMutSol/all_protein_sequences.xlsx")
df_sequences.head(5)

Unnamed: 0,num,gi,name,seq
0,0,109731493,>AAI14500.1 Tryptophan hydroxylase 2 [Homo sap...,MQPAMMMFSSKYWARRGFSLDSAVPEEHQLLGSSTLNKPNSGKNDD...
1,1,11133565,>sp|Q9NPJ1.1|MKKS_HUMAN RecName: Full=McKusick...,MSRLEAKKPSLCKSEPLTTERVRTTLSVLKRIVTSCYGPSGRLKQL...
2,2,116242717,>sp|Q13563.3|PKD2_HUMAN RecName: Full=Polycyst...,MVNSSRVQPQQPGDAKRPPAPRAPDPGRLMAGCAAVGASLAAPGGL...
3,3,116666949,">pdb|2CY8|A Chain A, D-phenylglycine Aminotran...",MSILNDYKRKTEGSVFWAQRARSVMPDGVTADTRAFDPHGLFISDA...
4,4,117391,>sp|P12271.2|RLBP1_HUMAN RecName: Full=Retinal...,MSEGVGTFRMVPEEEQELRAQLEQLTTKDHGPVFGPCSQLPRHTLQ...


In [13]:
# (rows, columns) of the sequence table.
df_sequences.shape

(77, 4)

In [2]:
# Load the DeepMutSol test split of the mutation data and preview it.
df_test = pd.read_excel("../../raw_data/DeepMutSol/test_dataset.xlsx")
df_test.head(5)

Unnamed: 0,index,variation,solubility,gi,mut_residue,Unnamed: 5
0,34,L165P,-1,299464,,pos
1,40,S302N,1,633678,,pos
2,77,K176P,1,1708279,,pos
3,82,R49C,-1,1732063,,pos
4,85,M63K,0,2506127,,pos


In [3]:
# Load the DeepMutSol training split of the mutation data and preview it.
df_train = pd.read_excel("../../raw_data/DeepMutSol/train_dataset.xlsx")
df_train.head(5)

Unnamed: 0,index,variation,solubility,gi,mut_residue,Unnamed: 5
0,6,P194G,0,117544,194,pos
1,35,L250P,-1,299464,250,pos
2,58,H103A,0,1708279,103,pos
3,60,H103D,0,1708279,103,pos
4,66,H103L,1,1708279,103,pos


In [4]:
# Load the DeepMutSol validation split of the mutation data and preview it.
df_val = pd.read_excel("../../raw_data/DeepMutSol/val_dataset.xlsx")
df_val.head(5)

Unnamed: 0,index,variation,solubility,gi,mut_residue,Unnamed: 5
0,1,R151Q,-1,117391,151,pos
1,2,R234W,0,117391,234,pos
2,3,P193F,-1,117544,193,pos
3,10,F602S,1,121069,602,pos
4,13,M51K,0,127455,51,pos


In [5]:
# Stack the train, test and validation splits into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index instead of repeating
# each split's own row labels, so label-based access stays unambiguous.
df_concat = pd.concat([df_train, df_test, df_val], axis=0, ignore_index=True)
df_concat.head(5)

Unnamed: 0,index,variation,solubility,gi,mut_residue,Unnamed: 5
0,6,P194G,0,117544,194.0,pos
1,35,L250P,-1,299464,250.0,pos
2,58,H103A,0,1708279,103.0,pos
3,60,H103D,0,1708279,103.0,pos
4,66,H103L,1,1708279,103.0,pos


In [6]:
# (rows, columns) of the stacked train/test/validation frame.
df_concat.shape

(11414, 6)

In [7]:
# Load the PON-Sol2 export CSV and show its (rows, columns).
df_ponsol2 = pd.read_csv("../../process_data/mutations_categorical/PON-Sol2_export_data.csv")
df_ponsol2.shape

(6374, 4)

In [8]:
# Flag every row whose gi value also occurs in the PON-Sol2 table.
# Series.isin accepts the column directly, so no list conversion is needed.
df_concat["is_in_ponsol2"] = df_concat["gi"].isin(df_ponsol2["gi"])

In [17]:
# Column dtypes of the stacked frame.
# NOTE: the recorded output already lists "is_in_ponsol2_mut", which is only
# assigned in a cell that appears further down (In [10]); this cell (In [17])
# was executed after it, so the notebook's cell order differs from its run order.
df_concat.dtypes

index                  int64
variation             object
solubility             int64
gi                     int64
mut_residue          float64
Unnamed: 5            object
is_in_ponsol2           bool
is_in_ponsol2_mut       bool
dtype: object

In [18]:
# Column dtypes of the sequence table.
df_sequences.dtypes

num      int64
gi       int64
name    object
seq     object
dtype: object

In [9]:
# Count how many rows have a gi that is present / absent in the PON-Sol2 table.
df_concat["is_in_ponsol2"].value_counts()

True    11414
Name: is_in_ponsol2, dtype: int64

In [10]:
# Flag every row whose variation string also occurs in the PON-Sol2 table.
# NOTE(review): this compares the variation code alone, without the gi, so the
# same code on a different protein also counts as a match - confirm intended.
df_concat["is_in_ponsol2_mut"] = df_concat["variation"].isin(df_ponsol2["variation"])

In [11]:
# Count how many rows have a variation string present / absent in PON-Sol2.
df_concat["is_in_ponsol2_mut"].value_counts()

True     6421
False    4993
Name: is_in_ponsol2_mut, dtype: int64

In [42]:
# Run the sequence/mutation check on each split; every call returns the
# accepted rows and the rejected rows as two frames.
# Naming: "test1" is built from df_test, "test2" from df_val (validation split).
df_process_sequences_train, df_process_sequences_error_train = produce_df_sequences(df_train, df_sequences)
df_process_sequences_test1, df_process_sequences_error_test1 = produce_df_sequences(df_test, df_sequences)
df_process_sequences_test2, df_process_sequences_error_test2 = produce_df_sequences(df_val, df_sequences)


In [43]:
# (rows, columns) of the rejected-rows table for the training split.
df_process_sequences_error_train.shape

(0, 3)

In [44]:
# (rows, columns) of the rejected-rows table for the test split.
df_process_sequences_error_test1.shape

(0, 3)

In [45]:
# (rows, columns) of the rejected-rows table for the validation split
# (named "test2" in this notebook).
df_process_sequences_error_test2.shape

(0, 3)

In [46]:
# Stack the accepted rows of the three splits into one frame.
# ignore_index=True renumbers the rows 0..n-1 so index labels are not repeated
# across the splits.
df_concat_2 = pd.concat(
    [df_process_sequences_train, df_process_sequences_test1, df_process_sequences_test2],
    axis=0,
    ignore_index=True,
)
df_concat_2.shape

(11414, 4)

In [47]:
# Export the combined table as CSV (without the index column).
# The target carries a .csv extension: the data is CSV text, and an .ipynb
# suffix would make tools treat the file as a notebook.
df_concat_2.to_csv("../../process_data/mutations_categorical/DeepMutSol_export_data.csv", index=False)