In [1]:
# k-mer length: used to enumerate the k-mer column names and to build the
# input/output file names in the cells below.
k = 4

In [2]:
import itertools
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

def generate_combinations(alphabet, k):
    """Return every length-``k`` string that can be built from ``alphabet``.

    Results are ordered with the first position varying slowest and follow
    the order of ``alphabet`` (e.g. ``AA, AC, CA, CC`` for ``['A', 'C']``).

    Args:
        alphabet: Iterable of string symbols to combine.
        k: Number of symbols per combination; must be >= 0.

    Returns:
        list[str]: ``len(alphabet) ** k`` strings, or ``['']`` when ``k == 0``.

    Raises:
        ValueError: If ``k`` is negative (the recursive version recursed
            until ``RecursionError`` in that case).
    """
    if k < 0:
        raise ValueError(f'k must be non-negative, got {k}')

    # itertools.product yields symbol tuples in the same order the nested
    # recursion produced, without rebuilding the suffix list per symbol.
    return [''.join(symbols) for symbols in itertools.product(alphabet, repeat=k)]

In [3]:
# Enumerate every DNA k-mer, then build the column -> dtype map that is passed
# to read_csv so each k-mer column is stored as float16.
set_kmer = generate_combinations(k=k, alphabet=list('ACGT'))
set_type = dict.fromkeys(set_kmer, np.float16)

In [4]:
# Load the k-mer table for the chosen k; set_type maps each k-mer column name
# to np.float16.
dfData = pd.read_csv(f'data/DATA_ITS_genus_{k}mer.csv', dtype=set_type)

In [5]:
# Column/dtype summary; 'deep' also measures the contents of object columns.
dfData.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104500 entries, 0 to 104499
Columns: 258 entries, Genus to TTTT
dtypes: float16(256), object(2)
memory usage: 65.2 MB


In [6]:
# Display the loaded table (pandas truncates the rendering of large frames).
dfData

Unnamed: 0,Genus,Species,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,Absidia,Absidia_spinosa,1.816406,0.237427,-0.946777,2.605469,-0.157227,0.632324,-0.157227,0.237427,...,-0.551758,-0.157227,1.421875,0.237427,0.237427,-0.551758,2.605469,1.026367,1.421875,10.890625
1,Absidia,Absidia_sp,3.476562,-0.510254,-0.067444,3.033203,-0.067444,-0.067444,-0.067444,0.375488,...,-0.510254,1.260742,1.260742,0.375488,1.260742,-0.067444,0.818359,2.589844,1.260742,9.679688
2,Absidia,Absidia_sp,1.476562,0.442383,-0.074707,1.993164,0.442383,-0.074707,0.959473,-0.074707,...,-0.591797,3.027344,-0.591797,-0.074707,0.959473,-0.074707,1.993164,1.993164,0.959473,3.027344
3,Absidia,Absidia_sp,2.394531,-0.622559,-0.622559,4.406250,-0.119812,-0.622559,-0.119812,-0.119812,...,-0.622559,2.896484,0.885742,-0.119812,2.394531,0.383057,0.885742,3.902344,2.394531,4.406250
4,Absidia,Absidia_sp,3.980469,-0.451660,-0.048798,2.771484,-0.048798,1.562500,-0.048798,0.756836,...,-0.451660,1.562500,-0.048798,-0.451660,1.160156,-0.048798,0.756836,3.173828,1.160156,10.023438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104495,Zyzygomyces,Zyzygomyces_bachmannii,3.101562,2.007812,-1.272461,2.007812,0.914062,-0.179443,0.367432,1.460938,...,-1.272461,-0.179443,1.460938,1.460938,1.460938,-0.726074,0.367432,-0.726074,1.460938,-0.179443
104496,Zyzygomyces,Zyzygomyces_bachmannii,3.097656,2.007812,-1.262695,2.007812,0.917969,-0.172485,0.372559,2.007812,...,-1.262695,-0.172485,1.462891,1.462891,1.462891,-0.717773,0.372559,-0.717773,1.462891,-0.172485
104497,Zyzygomyces,Zyzygomyces_bachmannii,3.507812,1.739258,-1.208984,2.328125,1.149414,-0.619629,0.559570,1.739258,...,-1.208984,-0.029938,1.739258,1.739258,1.149414,-0.619629,0.559570,-0.619629,1.739258,-0.029938
104498,Zyzygomyces,Zyzygomyces_bachmannii,3.105469,2.013672,-1.265625,2.013672,0.919922,-0.172852,0.373535,1.466797,...,-1.265625,-0.172852,1.466797,1.466797,1.466797,-0.719238,0.373535,-0.719238,1.466797,-0.172852


In [7]:
# Report dataset size: distinct genera, distinct species, and total rows.
print('# genus  : ', dfData['Genus'].unique().size)
print('# species: ', dfData['Species'].unique().size)
print('# samples: ', len(dfData))

# genus  :  1045
# species:  11954
# samples:  104500


In [8]:
import torch

# Report the installed PyTorch version and whether CUDA is available.
print("PyTorch 버전:", torch.__version__)
print("CUDA 사용 가능 여부:", torch.cuda.is_available())

PyTorch 버전: 2.1.0
CUDA 사용 가능 여부: True


In [9]:
# Feature matrix: every column from position 2 onward, as a NumPy array.
# NOTE(review): positions 0-1 appear to be Genus/Species per the info() output
# above — confirm if the CSV layout changes.
X = dfData.iloc[:, 2:].values

In [10]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the genus labels; the class count is the largest code plus one.
y = dfData['Genus']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = y_encoded.max() + 1

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Stratified cross-validation splitter; shuffling uses a fixed seed
# (random_state=42) so the fold assignment is the same on every run.
n_folds = 10
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

In [12]:
import time
from sklearn.naive_bayes import GaussianNB

In [13]:
from sklearn.metrics import accuracy_score

# Evaluate Gaussian Naive Bayes on the stratified folds in `kf`, writing
# per-fold predictions plus accuracy/timing logs under results/<name>/.

# The tensors are kept (float32 features, int64 labels); scikit-learn is fed
# NumPy views of them so it never has to coerce torch objects itself.
X_tensor = torch.from_numpy(X).float()
y_tensor = torch.from_numpy(y_encoded).long()
X_array = X_tensor.numpy()
y_array = y_tensor.numpy()

# Output location is the same for every fold, so resolve it once up front.
name = f'GAUSSIAN_ITS_{k}mer'
# makedirs also creates a missing 'results/' parent, which os.mkdir would not.
os.makedirs(f'results/{name}', exist_ok=True)

# The logs receive one appended value per fold, so empty them first; otherwise
# re-running this cell would stack a second set of values onto the old ones
# while the Fold_XX.csv files are overwritten.
for log_name in ('test_acc', 'test_time'):
    with open(f'results/{name}/{log_name}', 'w'):
        pass

for n_iter, (train_index, test_index) in enumerate(kf.split(X_array, y_array), start=1):
    X_train, X_test = X_array[train_index], X_array[test_index]
    y_train, y_test = y_array[train_index], y_array[test_index]

    # Timed interval covers fitting, prediction and scoring; perf_counter is
    # the monotonic clock intended for measuring elapsed time.
    time_st = time.perf_counter()
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    time_ed = time.perf_counter()

    # One "true_label,predicted_label" row per test sample.
    with open(f'results/{name}/Fold_{n_iter:02}.csv', 'w') as f_out:
        f_out.writelines(
            f'{int(actual)},{int(predicted)}\n'
            for actual, predicted in zip(y_test, y_pred)
        )

    print(f'Fold {n_iter} test accuracy: {test_accuracy}')
    with open(f'results/{name}/test_acc', 'a') as f:
        f.write(str(test_accuracy) + ',')
    with open(f'results/{name}/test_time', 'a') as f:
        f.write(str(time_ed - time_st) + ',')

Fold 1 test accuracy: 0.879043062200957
Fold 2 test accuracy: 0.8744497607655503
Fold 3 test accuracy: 0.880956937799043
Fold 4 test accuracy: 0.8795215311004785
Fold 5 test accuracy: 0.8845933014354067
Fold 6 test accuracy: 0.8822966507177034
Fold 7 test accuracy: 0.8784688995215311
Fold 8 test accuracy: 0.883444976076555
Fold 9 test accuracy: 0.8816267942583732
Fold 10 test accuracy: 0.875311004784689
