# Import the necessary libraries

In [1]:
import pandas as pd

# Load the Raw dataset into  Pandas Dataframe 

In [2]:
df1 = pd.read_csv("../datasets/raw_dataset/1625Data.txt", delimiter=",")
df2 = pd.read_csv("../datasets/raw_dataset/746Data.txt", delimiter=",")
df3 = pd.read_csv("../datasets/raw_dataset/impensData.txt", delimiter=",")
df4 = pd.read_csv("../datasets/raw_dataset/schillingData.txt", delimiter=",")

# Change the Column Names of Our Files to a more Descriptive Names which will also help us to easily Apend them to each other

In [3]:
df1.head(4)

Unnamed: 0,SLNLRETN,1
0,AECFRIFD,1
1,HLVEALYL,1
2,TQIMFETF,1
3,AEELAEIF,1


In [4]:
df1.rename(columns ={"SLNLRETN": "octamer", "1": "is_cleaved"}, inplace=True)
df1.head(5)

Unnamed: 0,octamer,is_cleaved
0,AECFRIFD,1
1,HLVEALYL,1
2,TQIMFETF,1
3,AEELAEIF,1
4,PFIFEEEP,1


In [5]:
df2.head(4)

Unnamed: 0,AAAKFERQ,-1
0,AAAMKRHG,-1
1,AAAMSSAI,-1
2,AAKFERQH,-1
3,AAKFESNF,-1


In [6]:
df2.rename(columns ={"AAAKFERQ": "octamer", "-1": "is_cleaved"}, inplace=True)
df2.head()

Unnamed: 0,octamer,is_cleaved
0,AAAMKRHG,-1
1,AAAMSSAI,-1
2,AAKFERQH,-1
3,AAKFESNF,-1
4,AAMKRHGL,-1


In [7]:
df3.head(4)

Unnamed: 0,AAAGKSGG,-1
0,AAAVDAGM,-1
1,AAGKSGGG,-1
2,AALALEYG,1
3,AANDGPMP,-1


In [8]:
df3.rename(columns ={"AAAGKSGG": "octamer", "-1": "is_cleaved"}, inplace=True)
df3.head()

Unnamed: 0,octamer,is_cleaved
0,AAAVDAGM,-1
1,AAGKSGGG,-1
2,AALALEYG,1
3,AANDGPMP,-1
4,AASAAAVD,-1


In [9]:
df4.head(4)

Unnamed: 0,AAAAAPAK,-1
0,AAAAPAKV,-1
1,AAAELGAR,-1
2,AAAPAKVE,-1
3,AAAPVAAA,-1


In [10]:
df4.rename(columns ={"AAAAAPAK": "octamer", "-1": "is_cleaved"}, inplace=True)
df4.head()

Unnamed: 0,octamer,is_cleaved
0,AAAAPAKV,-1
1,AAAELGAR,-1
2,AAAPAKVE,-1
3,AAAPVAAA,-1
4,AAAPVVPQ,-1


# Combine the Datasets

In [11]:
combined_data = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [12]:
combined_data.shape

(6586, 2)

# Remove Duplicates

In [13]:
combined_data_no_duplicates = combined_data.drop_duplicates(keep="first")

In [14]:
combined_data_no_duplicates.shape

(5848, 2)

# Split the combined_data_no_duplicates into two based on cleaved and uncleaved octapeptide

In [17]:
cleaved_octapeptide_df = combined_data_no_duplicates[combined_data_no_duplicates.is_cleaved == 1]
uncleaved_octapeptide_df = combined_data_no_duplicates[combined_data_no_duplicates.is_cleaved == -1]

In [18]:
cleaved_octapeptide_df

Unnamed: 0,octamer,is_cleaved
0,AECFRIFD,1
1,HLVEALYL,1
2,TQIMFETF,1
3,AEELAEIF,1
4,PFIFEEEP,1
...,...,...
6534,YDDMLVVP,1
6548,YHEMIESG,1
6560,YNVSLLYD,1
6572,YSAFLVAD,1


In [19]:
cleaved_octapeptide_df.shape

(1001, 2)

In [20]:
uncleaved_octapeptide_df.tail(5)

Unnamed: 0,octamer,is_cleaved
6581,YVPVTTFK,-1
6582,YVQQHGGV,-1
6583,YVSNIDGT,-1
6584,YWPQEAGE,-1
6585,YYTSASGD,-1


In [21]:
uncleaved_octapeptide_df.shape

(4847, 2)

# Dropping the Labels from the split Datasets

In [22]:
cleaved_octapeptide_df_no_label = cleaved_octapeptide_df.drop("is_cleaved", axis=1)
uncleaved_octapeptide_df_no_label = uncleaved_octapeptide_df.drop("is_cleaved", axis=1)

In [23]:
cleaved_octapeptide_df_no_label

Unnamed: 0,octamer
0,AECFRIFD
1,HLVEALYL
2,TQIMFETF
3,AEELAEIF
4,PFIFEEEP
...,...
6534,YDDMLVVP
6548,YHEMIESG
6560,YNVSLLYD
6572,YSAFLVAD


In [24]:
uncleaved_octapeptide_df_no_label

Unnamed: 0,octamer
24,AAKFERQH
25,MDSSTSAA
26,SSNYCNQM
27,TPGSRNLC
28,GSSKYPNC
...,...
6581,YVPVTTFK
6582,YVQQHGGV
6583,YVSNIDGT
6584,YWPQEAGE


# Saving the cleaved_octapeptide_df_no_label and uncleaved_octapeptide_df_no_label in both csv and txt formats

In [25]:
# cleaved_octapeptide_df_no_label.to_csv("../datasets/prepared_dataset_for_feature_extraction_seq_logo_generation/cleaved_octapeptide_df_no_label.csv", index=False)
# uncleaved_octapeptide_df_no_label.to_csv("../datasets/prepared_dataset_for_feature_extraction_seq_logo_generation/uncleaved_octapeptide_df_no_label.csv", index=False)

# cleaved_octapeptide_df_no_label.to_csv("../datasets/prepared_dataset_for_feature_extraction_seq_logo_generation/cleaved_octapeptide_df_no_label.txt", index=False)
# uncleaved_octapeptide_df_no_label.to_csv("../datasets/prepared_dataset_for_feature_extraction_seq_logo_generation/uncleaved_octapeptide_df_no_label.txt", index=False)