In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Directory and path of the raw training CSV
file_dir = os.path.join('train_data')
file_path = os.path.join(file_dir, 'protein_train_data.csv')

# Load the data
df = pd.read_csv(file_path)

# Shuffle the rows. The seed is fixed so that the row order -- and with it
# any seeded split computed from this frame later -- is the same on every
# Restart & Run All; an unseeded shuffle would silently change the split.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Collect the rows that contain at least one missing value
empty_rows = df[df.isna().any(axis=1)]

# Report the result
if not empty_rows.empty:
    print("There are rows with missing values:")
    print(empty_rows)
else:
    print("No missing values found in any row.")

No missing values found in any row.


In [3]:
# Show the shuffled frame as plain text (the recorded output below is truncated with "..." rows)
print(df)

                                                  Smiles  \
0      CONC(=O)c1cc2c(N[C@@H](C)c3ccccc3)nc(-n3cnc4cc...   
1      Cc1ccc(C(=O)Nc2ccc(CN3CCN(C)CC3)c(C(F)(F)F)c2)...   
2      O=c1[nH]c(N2CCOCC2)nc(N[C@@H]2CCCNC2)c1-c1nc2c...   
3      COc1n[nH]c2ncc(NC(=O)c3c(F)ccc(NS(=O)(=O)Cc4cc...   
4      COc1cc(OC)cc(N(CC(O)CO)c2ccc3ncc(-c4cnn(C)c4)n...   
...                                                  ...   
57645  CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...   
57646  CN1CCN(c2ccc(-c3ccc4[nH]c5nccc(C#Cc6ccccc6Cl)c...   
57647  Cc1ccc(F)c(NC(=O)c2ccc(F)c(Oc3ccnc(-c4cc(C(=O)...   
57648  CN1CCN(c2ccc(-c3nc(-c4ccc5c(c4)Cc4cn[nH]c4-5)c...   
57649  COc1ccc(NC(=O)c2ccc(C)c(Nc3ncnc4cnc(N(C)CC5CCC...   

                                         Target Name  Standard Value  
0                       Tyrosine-protein kinase JAK3          18.000  
1                        Tyrosine-protein kinase ABL           0.410  
2         Interleukin-1 receptor-associated kinase 4          19.0

In [4]:
# Hold out 10% of the rows for validation; the seed fixes the permutation
# that train_test_split applies to the incoming frame.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Sanity check: shape and first rows of each split, one item per line
print(
    train_df.shape,
    train_df.head(),
    val_df.shape,
    val_df.head(),
    sep="\n",
)

(51885, 3)
                                                  Smiles  \
2338       Oc1ccc(-c2ccc3ncnc(Nc4cccc5[nH]ncc45)c3c2)cc1   
19077  Cc1ccc(C(=O)Nc2cc(C(C)(C)C)n[nH]2)cc1Nc1ncnc2c...   
17252  N#Cc1c(-c2ccccc2)cc(-c2ccccc2)nc1/N=c1\sc(-c2c...   
25927  C#CCN1CCN(c2ccc(-c3ccc4[nH]c5nccc(-c6ccc(Cc7cc...   
22389  O=C(N1CCN(c2ncnc3[nH]ccc23)CC1)C1(c2ccc(Br)cc2...   

                                 Target Name  Standard Value  
2338   Serine/threonine-protein kinase B-raf           547.3  
19077  Serine/threonine-protein kinase B-raf             5.0  
17252   Serine/threonine-protein kinase Chk1             9.9  
25927           ALK tyrosine kinase receptor          3600.0  
22389    Serine/threonine-protein kinase AKT            27.1  
(5765, 3)
                                                  Smiles  \
56679  C[C@@H]1CNC[C@@H](C(=O)Nc2ncc(SCc3ncc(C(C)(C)C...   
224    CC(C)(O)c1cc2nn(C[C@@H]3CCC(=O)N3)cc2cc1NC(=O)...   
32457  Nc1n[nH]c2cncc(-c3ccc(NC(=O)Nc4cc(C(F)(F)F)ccc...   


In [5]:
# Write both splits next to the source CSV. The paths are built from
# `file_dir` (set in the data-loading cell) instead of repeating the
# directory name, so relocating the data only needs one change.
# index=False drops the shuffled row labels from the output files.
train_df.to_csv(os.path.join(file_dir, 'dacon_train.csv'), index=False)
val_df.to_csv(os.path.join(file_dir, 'dacon_val.csv'), index=False)