In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import seaborn as sns

import numpy as np

import itertools
from itertools import combinations

import xgboost
from xgboost import XGBClassifier


유전체 데이터 셋 불러오기

In [2]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

trait 열과 각 SNP 열에 나타나는 값(유전자형)의 종류 확인

In [3]:
col_train = train.columns

# Skip the first four columns and the last one; print the distinct values
# found in every column in between.
for column_name in col_train[4:-1]:
    print(train[column_name].unique())

[2 1]
['G G' 'A G' 'A A']
['A G' 'G G' 'A A']
['A A' 'C A' 'C C']
['G A' 'A A' 'G G']
['C A' 'A A' 'C C']
['A A' 'A G' 'G G']
['A A' 'G G' 'G A']
['G G' 'G A' 'A A']
['A A' 'G A' 'G G']
['G G' 'A G' 'A A']
['A G' 'A A' 'G G']
['A A' 'G A' 'G G']
['A A' 'G G' 'A G']
['A A' 'C C' 'C A']
['A A' 'G G' 'G A']


id 열 삭제 및 class(타깃) 열 분리 (값이 비어 있는 열은 이후 단계에서 사용할 열만 선택하면서 제외됨)

In [4]:
def get_x_y(df):
    """Split a frame into model inputs and, when available, labels.

    The 'id' column is always removed from the returned features.

    Args:
        df: DataFrame containing an 'id' column and optionally a 'class' column.

    Returns:
        A (features, labels) tuple when 'class' is present, where labels is
        the 'class' column; otherwise only the features DataFrame.
    """
    # Unlabeled data: there is nothing to split off, just drop the id.
    if 'class' not in df.columns:
        return df.drop(columns=['id'])

    features = df.drop(columns=['id', 'class'])
    labels = df['class']
    return features, labels

In [5]:
# get_x_y returns (features, labels) when a 'class' column is present and
# only the features otherwise; test is assumed to have no 'class' column,
# so a single frame comes back for it.
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

LabelEncoder 과정

In [6]:
class_le = LabelEncoder()
snp_le = LabelEncoder()

# Columns that share one encoder: trait followed by SNP_01 .. SNP_15.
snp_col = ['trait'] + [f'SNP_{number:02d}' for number in range(1, 16)]

# Pool the values of all those columns so a single encoder sees every value.
# NOTE(review): trait appears to be numeric while the SNP columns hold
# strings, so the encoder is fit on mixed values; confirm this is intended.
snp_data = [value for name in snp_col for value in train_x[name].values]

# The class labels get their own encoder so predictions can be decoded later.
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

In [7]:
# Apply the shared encoder to every feature column listed in snp_col,
# in both the training and the test features.
columns_to_encode = [name for name in train_x.columns if name in snp_col]
for name in columns_to_encode:
    train_x[name] = snp_le.transform(train_x[name])
    test_x[name] = snp_le.transform(test_x[name])

snp_info 데이터셋 불러오기

In [8]:
# Load the per-SNP metadata table (it is read below for its 'SNP_id' and
# 'chrom' columns) and display it as the cell output.
info = pd.read_csv('snp_info.csv')
info

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512
5,SNP_06,BovineHD0600017424,6,80.5954,63048481
6,SNP_07,Hapmap49442-BTA-111073,6,80.78,64037334
7,SNP_08,BovineHD0600018638,6,82.6856,67510588
8,SNP_09,ARS-BFGL-NGS-37727,6,86.874,73092782
9,SNP_10,BTB-01558306,7,62.0692,40827112


chrom(염색체)별로 리스트 생성하기

In [9]:
# Group the SNP ids by the chromosome they belong to, one list per chromosome.
# Ids and chromosome numbers are paired row by row with zip(), so the result
# does not depend on the frame's index labels, and no variable shadows the
# builtin id().
snp_by_chrom = {chrom_no: [] for chrom_no in (2, 6, 7, 8, 9, 10)}
for snp_id, chrom_no in zip(info['SNP_id'], info['chrom']):
    if chrom_no not in snp_by_chrom:
        # Fail loudly instead of silently filing the SNP under chromosome 10.
        raise ValueError(f"unexpected chromosome {chrom_no!r} for {snp_id}")
    snp_by_chrom[chrom_no].append(snp_id)

chrom_2 = snp_by_chrom[2]
chrom_6 = snp_by_chrom[6]
chrom_7 = snp_by_chrom[7]
chrom_8 = snp_by_chrom[8]
chrom_9 = snp_by_chrom[9]
chrom_10 = snp_by_chrom[10]
        

SNP 분석에 사용되는 품종 구분은 특정 염색체와 관련이 있을 것이라 추론하고, 염색체를 3개 또는 4개씩 묶은 그룹을 만들어 검증 데이터에서 macro F1 점수가 가장 높은 그룹으로 결정

In [10]:
chrom = [chrom_2, chrom_6, chrom_7, chrom_8, chrom_9, chrom_10]

# Every combination of 3 chromosomes
comb_3_chrom = list(combinations(chrom, 3))
# Every combination of 4 chromosomes
comb_4_chrom = list(combinations(chrom, 4))
# All candidate groups in one list (3-chromosome groups first)
comb_chrom = comb_3_chrom + comb_4_chrom

# The hyper-parameter grid is identical for every group, so build it once.
xgb_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 4, 5],
}

# model_list[k], score_list[k] and group_list[k] all describe the k-th group,
# so the best group can be looked up instead of being read off the printout.
model_list = []
score_list = []
group_list = []
for chrom_combo in comb_chrom:
    # Flatten the per-chromosome SNP lists and always include trait.
    snp_group = list(itertools.chain(*chrom_combo))
    snp_group.append('trait')
    x = train_x[snp_group]
    # The fixed random_state gives every group the same train/hold-out rows.
    x_train, x_test, y_train, y_test = train_test_split(
        x, train_y, test_size=0.2, random_state=42)

    xgb = XGBClassifier()
    xgb_grid = GridSearchCV(xgb, param_grid=xgb_param_grid,
                            scoring="f1_macro", cv=5)
    xgb_grid.fit(x_train, y_train)

    model = xgb_grid.best_estimator_
    pred = model.predict(x_test)
    # f1_score takes (y_true, y_pred); macro F1 is symmetric, so the value
    # equals what the swapped call printed.
    score = metrics.f1_score(y_test, pred, average='macro')
    print(score, snp_group)
    model_list.append(model)
    score_list.append(score)
    group_list.append(snp_group)

0.9124579124579125 ['SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09', 'SNP_10', 'trait']
0.9293478260869565 ['SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09', 'SNP_11', 'trait']
0.9107755662319835 ['SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09', 'SNP_12', 'SNP_13', 'SNP_14', 'trait']
0.9136212624584719 ['SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09', 'SNP_15', 'trait']
0.8885714285714287 ['SNP_01', 'SNP_10', 'SNP_11', 'trait']
0.8750857927247769 ['SNP_01', 'SNP_10', 'SNP_12', 'SNP_13', 'SNP_14', 'trait']
0.8626415094339622 ['SNP_01', 'SNP_10', 'SNP_15', 'trait']
0.8142857142857142 ['SNP_01', 'SNP_11', 'SNP_12', 'SNP_13', 'SNP_14', 'trait']
0.7621393384940184 ['SNP_01', 'SNP_11', 'SNP_15', 'trait']
0.8514285714285714 ['SNP_01', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15', 'trait']
0.9107755662319835 ['SNP_02', 'SNP_03', 'SNP_04', 

In [11]:
# Baseline: use every chromosome (all 15 SNP columns) plus trait.
all_chrom = ['SNP_01', 'SNP_02',
             'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09',
             'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15', 'trait']

x = train_x[all_chrom]
# Same split settings as the per-group search, so the scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    x, train_y, test_size=0.2, random_state=42)

xgb = XGBClassifier()
xgb_param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 4, 5],
}
xgb_grid = GridSearchCV(xgb, param_grid=xgb_param_grid,
                        scoring="f1_macro", cv=5)
xgb_grid.fit(x_train, y_train)

model = xgb_grid.best_estimator_
pred = model.predict(x_test)
# f1_score takes (y_true, y_pred); `metrics` is already imported in the
# notebook's import cell, so no local import is needed here.
print(metrics.f1_score(y_test, pred, average='macro'))
# The baseline model is appended after all of the group models.
model_list.append(model)


0.9638888888888889


In [12]:
## 0.9646739130434782 ['SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15', 'trait']: model_list[15] was confirmed to have the highest macro score
# i.e. chromosomes 6, 9 and 10 were selected as the best group

최적의 그룹으로 선정된 열을 제외한 나머지 염색체 drop

In [13]:
# Columns of the selected feature group; every other column is left out of
# the frame handed to the model.
best_group_columns = [
    'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08',
    'SNP_09', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15', 'trait',
]
test_drop = test_x[best_group_columns]

In [14]:
# Entry 15 of model_list is the model picked as best in the note cell above.
best_model = model_list[15]
pred = best_model.predict(test_drop)

# Decode the integer predictions back to the original class labels and put
# them into the 'class' column of the sample submission table.
sample = pd.read_csv('sample_submission.csv')
decoded_classes = class_le.inverse_transform(pred)
sample['class'] = decoded_classes
sample

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,C
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


파일 제출

In [15]:
# Write the submission file without the DataFrame index column;
# index=False is the documented way to omit it.
sample.to_csv('submission.csv', index=False)