### Import library

In [None]:
import pandas as pd

In [None]:
# Parquet 파일에서 데이터 읽어오기
df = pd.read_parquet('/kaggle/input/dacon-click-log-parquet-pure/train.parquet')

# Click 컬럼이 1인 행만 필터링
click_zero = df[df['Click'] == 0]

# 랜덤으로 5569860 행 선택
click_zero = click_zero.sample(n=5569860, random_state=42)

# Click 컬럼이 1인 행만 필터링
click_one = df[df['Click'] == 1]

In [None]:
# 두 DataFrame을 합치기
total_df = pd.concat([click_one, click_zero])

In [None]:
# 시간의 순서대로 정렬하기
total_df = total_df.sort_index()

In [None]:
total_df.reset_index(drop=True, inplace=True)

In [None]:
import numpy as np
def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() / 1024**2  
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue

        # 수치형 데이터 최적화
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue

    end_mem = df.memory_usage().sum() / 1024**2  
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
df = reduce_mem_usage(total_df)

In [None]:
#피처엔지니어링 통한 중요도 순 하위 10개
df = df.drop(['ID','F03','F01','F27','F35','F38','F12','F05','F22','F23','F30'],axis=1)

In [None]:
df['F04'] = df['F04'].fillna(-1)
df['F11'] = df['F11'].fillna(-1)
df['F18'] = df['F18'].fillna(-1)
df['F19'] = df['F19'].fillna(-1)
df['F24'] = df['F24'].fillna(-1)
df['F29'] = df['F29'].fillna(-1)
df['F32'] = df['F32'].fillna(-1)
df['F33'] = df['F33'].fillna(-1)
df['F36'] = df['F36'].fillna(-1)
df = df.fillna('-1')

object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].astype('category')
df.info()

In [None]:
df.to_parquet('train.parquet')

In [1]:
import pandas as pd
df = pd.read_parquet('/kaggle/working/train.parquet')

In [2]:
!pip install catboost



In [4]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

### Load data

In [5]:
df["Click"].value_counts(normalize=True)

Click
1    0.5
0    0.5
Name: proportion, dtype: float64

In [6]:
df["Click"].value_counts()

Click
1    5569860
0    5569860
Name: count, dtype: int64

In [7]:
def load_data(df):
    df = df.drop('ID',axis=1)
    
    df['F04'] = df['F04'].fillna(-1)
    df['F11'] = df['F11'].fillna(-1)
    df['F18'] = df['F18'].fillna(-1)
    df['F19'] = df['F19'].fillna(-1)
    df['F24'] = df['F24'].fillna(-1)
    df['F27'] = df['F27'].fillna(-1)
    df['F29'] = df['F29'].fillna(-1)
    df['F32'] = df['F32'].fillna(-1)
    df['F33'] = df['F33'].fillna(-1)
    df['F36'] = df['F36'].fillna(-1)
    df['F38'] = df['F38'].fillna(-1)
    df = df.fillna('-1')
    
    object_columns = df.select_dtypes(include=['object']).columns
    df[object_columns] = df[object_columns].astype('category')
    return df

In [8]:
test_df = pd.read_parquet('/kaggle/input/dacon-click-log-parquet-pure/test.parquet')

In [9]:
test_df.head()

Unnamed: 0,ID,F01,F02,F03,F04,F05,F06,F07,F08,F09,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TEST_0000000,,,IAGJDOH,,,-1,WBNHHNS,MIGYEEG,NENNAGQ,...,MFPUCBU,GTISJWW,1144.0,2.0,,IRUDRFB,,VINCJKA,0.0,AWWBMDJ
1,TEST_0000001,SDVLQID,VDDLZCR,DTZFPRW,1.0,KJLEQJC,3,OWLMRBR,MIGYEEG,UKFZEAD,...,NZGEZLW,GTISJWW,44364.0,0.0,ORGHLQN,IRUDRFB,,KSGNDBR,0.0,OJTGSWO
2,TEST_0000002,XXQINZY,AXQDKZD,IAGJDOH,12.0,NFKKMGD,2,ILCWUTM,LPYPUNA,NENNAGQ,...,VHXETCF,GTISJWW,79.0,1.0,WRRYBLO,IRUDRFB,,VINCJKA,0.0,INBFMRT
3,TEST_0000003,JCDXFYU,PILDDJU,IAGJDOH,22.0,LFPUEOV,209,HUHNWCJ,FTPHMPQ,FPCZMEO,...,NZGEZLW,KHZNEZF,44.0,9.0,QMOULXS,IRUDRFB,8.0,RWQBLLR,0.0,OYSGPBR
4,TEST_0000004,DZPQPXK,HKSCOJF,,2.0,FMKTHYA,2,CFQLINN,FTPHMPQ,YOPLVLN,...,FGOVFJM,WCPHYPZ,1.0,2.0,WZAQBGE,IRUDRFB,0.0,XJLBFMB,1.0,OYSGPBR


In [10]:
import numpy as np
def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue

        # 수치형 데이터 최적화
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue

    end_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [11]:
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 1385.05 MB
Memory usage after optimization is: 1073.42 MB
Decreased by 22.5%


In [12]:
test_df = load_data(test_df)

In [13]:
#피처엔지니어링 통한 중요도 순 하위 10개
test_df = test_df.drop(['F03','F01','F27','F35','F38','F12','F05','F22','F23','F30'],axis=1)

In [14]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4538541 entries, 0 to 4538540
Data columns (total 29 columns):
 #   Column  Dtype   
---  ------  -----   
 0   F02     category
 1   F04     float32 
 2   F06     int16   
 3   F07     category
 4   F08     category
 5   F09     category
 6   F10     category
 7   F11     float16 
 8   F13     category
 9   F14     int16   
 10  F15     category
 11  F16     category
 12  F17     category
 13  F18     float16 
 14  F19     float16 
 15  F20     category
 16  F21     category
 17  F24     float32 
 18  F25     category
 19  F26     category
 20  F28     category
 21  F29     float16 
 22  F31     category
 23  F32     float32 
 24  F33     float16 
 25  F34     category
 26  F36     float16 
 27  F37     category
 28  F39     category
dtypes: category(18), float16(6), float32(3), int16(2)
memory usage: 339.3 MB


In [15]:
df = reduce_mem_usage(df)

Memory usage of dataframe is 837.25 MB
Memory usage after optimization is: 837.25 MB
Decreased by 0.0%


In [16]:
X = df.drop('Click',axis=1)
y = df['Click']

In [17]:
X.head()

Unnamed: 0,F02,F04,F06,F07,F08,F09,F10,F11,F13,F14,...,F26,F28,F29,F31,F32,F33,F34,F36,F37,F39
0,AVKQTCL,114.0,1,PQZBVMG,LPYPUNA,IZYJZDA,RANQNXO,66.0,SMRBWMU,4,...,XAUNDQW,MAVCFCM,1.0,GTISJWW,380.0,2.0,AXQFZWC,-1.0,TFJMLCZ,AURZYDY
1,PILDDJU,119.0,0,FFUTIRZ,OFKQGTY,BEZTQIO,YLKUVQA,-1.0,KGJACUM,0,...,WVRHGBC,VNBXRYV,-1.0,KHZNEZF,197.0,0.0,QMOULXS,8.0,ZVSTLNM,MHBRSQK
2,ZYAVJHP,15.0,26,ZDTZNSB,THBWWCD,LTETYBG,GEKHGQZ,50.0,UMIEGWH,20,...,-1,YQCIJMD,1.0,GTISJWW,8640.0,0.0,IZLJUJS,14.0,ZBSRLCQ,GAZBSSZ
3,QPQWGXA,13.0,20,PQZBVMG,MIGYEEG,LJBQPJW,SOKLCDW,789.0,YJFFIGH,42,...,-1,IYZTLFQ,-1.0,WHSRKIM,41774.0,0.0,BHBIZCL,13.0,QHYLSBX,QTATWAY
4,PILDDJU,-1.0,2,DYFKVIS,OFKQGTY,EQAQGGV,YLKUVQA,106.0,PQEGPID,0,...,WVRHGBC,VNBXRYV,1.0,KHZNEZF,7768.0,-1.0,QMOULXS,-1.0,UPVVRGF,KHPZSFA


In [18]:
numerical = []
categorical = []

for col in (X.columns):
    if X[col].dtype == "category" or X[col].dtype == "object" :
        categorical.append(col)
    else:
        numerical.append(col)
print("numerical features = ",numerical)
print("\ncategorical features = ",categorical)

numerical features =  ['F04', 'F06', 'F11', 'F14', 'F18', 'F19', 'F24', 'F29', 'F32', 'F33', 'F36']

categorical features =  ['F02', 'F07', 'F08', 'F09', 'F10', 'F13', 'F15', 'F16', 'F17', 'F20', 'F21', 'F25', 'F26', 'F28', 'F31', 'F34', 'F37', 'F39']


In [19]:
X.head()

Unnamed: 0,F02,F04,F06,F07,F08,F09,F10,F11,F13,F14,...,F26,F28,F29,F31,F32,F33,F34,F36,F37,F39
0,AVKQTCL,114.0,1,PQZBVMG,LPYPUNA,IZYJZDA,RANQNXO,66.0,SMRBWMU,4,...,XAUNDQW,MAVCFCM,1.0,GTISJWW,380.0,2.0,AXQFZWC,-1.0,TFJMLCZ,AURZYDY
1,PILDDJU,119.0,0,FFUTIRZ,OFKQGTY,BEZTQIO,YLKUVQA,-1.0,KGJACUM,0,...,WVRHGBC,VNBXRYV,-1.0,KHZNEZF,197.0,0.0,QMOULXS,8.0,ZVSTLNM,MHBRSQK
2,ZYAVJHP,15.0,26,ZDTZNSB,THBWWCD,LTETYBG,GEKHGQZ,50.0,UMIEGWH,20,...,-1,YQCIJMD,1.0,GTISJWW,8640.0,0.0,IZLJUJS,14.0,ZBSRLCQ,GAZBSSZ
3,QPQWGXA,13.0,20,PQZBVMG,MIGYEEG,LJBQPJW,SOKLCDW,789.0,YJFFIGH,42,...,-1,IYZTLFQ,-1.0,WHSRKIM,41774.0,0.0,BHBIZCL,13.0,QHYLSBX,QTATWAY
4,PILDDJU,-1.0,2,DYFKVIS,OFKQGTY,EQAQGGV,YLKUVQA,106.0,PQEGPID,0,...,WVRHGBC,VNBXRYV,1.0,KHZNEZF,7768.0,-1.0,QMOULXS,-1.0,UPVVRGF,KHPZSFA


In [20]:
from sklearn.preprocessing import RobustScaler

#RobustScaler 적용
scaler = RobustScaler()
X[numerical] = scaler.fit_transform(X[numerical])

In [21]:
test_df[numerical] = scaler.transform(test_df[numerical])

In [22]:
X.head()

Unnamed: 0,F02,F04,F06,F07,F08,F09,F10,F11,F13,F14,...,F26,F28,F29,F31,F32,F33,F34,F36,F37,F39
0,AVKQTCL,9.166667,-0.076923,PQZBVMG,LPYPUNA,IZYJZDA,RANQNXO,-0.18481,SMRBWMU,0.0,...,XAUNDQW,MAVCFCM,0.0,GTISJWW,-0.172917,2.0,AXQFZWC,-0.428571,TFJMLCZ,AURZYDY
1,PILDDJU,9.583333,-0.096154,FFUTIRZ,OFKQGTY,BEZTQIO,YLKUVQA,-0.35443,KGJACUM,-0.307692,...,WVRHGBC,VNBXRYV,-0.5,KHZNEZF,-0.194636,0.0,QMOULXS,0.857143,ZVSTLNM,MHBRSQK
2,ZYAVJHP,0.916667,0.403846,ZDTZNSB,THBWWCD,LTETYBG,GEKHGQZ,-0.225316,UMIEGWH,1.230769,...,-1,YQCIJMD,0.0,GTISJWW,0.807382,0.0,IZLJUJS,1.714286,ZBSRLCQ,GAZBSSZ
3,QPQWGXA,0.75,0.288462,PQZBVMG,MIGYEEG,LJBQPJW,SOKLCDW,1.64557,YJFFIGH,2.923077,...,-1,IYZTLFQ,-0.5,WHSRKIM,4.739734,0.0,BHBIZCL,1.571429,QHYLSBX,QTATWAY
4,PILDDJU,-0.416667,-0.057692,DYFKVIS,OFKQGTY,EQAQGGV,YLKUVQA,-0.083544,PQEGPID,-0.307692,...,WVRHGBC,VNBXRYV,0.0,KHZNEZF,0.703893,-1.0,QMOULXS,-0.428571,UPVVRGF,KHPZSFA


In [25]:
test_df.head()

Unnamed: 0,F02,F04,F06,F07,F08,F09,F10,F11,F13,F14,...,F26,F28,F29,F31,F32,F33,F34,F36,F37,F39
0,-1,-0.416667,-0.115385,WBNHHNS,MIGYEEG,NENNAGQ,-1,-0.344304,WIFNKKO,0.307692,...,WVRHGBC,IYZTLFQ,0.0,GTISJWW,-0.082245,2.0,-1,-0.428571,VINCJKA,AWWBMDJ
1,VDDLZCR,-0.25,-0.038462,OWLMRBR,MIGYEEG,UKFZEAD,PUSHPSV,-0.149367,CNNPQRG,0.0,...,YYQVFBZ,IYZTLFQ,-0.5,GTISJWW,5.047116,0.0,ORGHLQN,-0.428571,KSGNDBR,OJTGSWO
2,AXQDKZD,0.666667,-0.057692,ILCWUTM,LPYPUNA,NENNAGQ,QSZWOEN,0.929114,WIFNKKO,0.692308,...,WVRHGBC,MAVCFCM,0.0,GTISJWW,-0.20864,1.0,WRRYBLO,-0.428571,VINCJKA,INBFMRT
3,PILDDJU,1.5,3.923077,HUHNWCJ,FTPHMPQ,FPCZMEO,YLKUVQA,0.172152,DZGASMB,0.153846,...,WVRHGBC,YYEFTXW,0.0,KHZNEZF,-0.212794,9.0,QMOULXS,0.857143,RWQBLLR,OYSGPBR
4,HKSCOJF,-0.166667,-0.057692,CFQLINN,FTPHMPQ,YOPLVLN,ZQEBKKM,-0.177215,AKOONVA,0.846154,...,-1,YYEFTXW,0.25,WCPHYPZ,-0.217897,2.0,WZAQBGE,-0.285714,XJLBFMB,OYSGPBR


In [26]:
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 408.56 MB
Memory usage after optimization is: 313.34 MB
Decreased by 23.3%


In [27]:
cat_features = categorical #catboost 명시적 지정 필요

In [28]:
cat_features

['F02',
 'F07',
 'F08',
 'F09',
 'F10',
 'F13',
 'F15',
 'F16',
 'F17',
 'F20',
 'F21',
 'F25',
 'F26',
 'F28',
 'F31',
 'F34',
 'F37',
 'F39']

In [29]:
from sklearn import metrics
def model_eval(model, X_test, y_test):
    # 예측 확률 계산
    y_pred_prob = model.predict_proba(X_test)
    y_pred = np.where(y_pred_prob[:, 1] > 0.5, 1, 0)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    roc_auc_score = metrics.roc_auc_score(y_test, y_pred_prob[:, 1])
    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
    return accuracy, cnf_matrix,roc_auc_score

In [30]:
sample_submission = pd.read_parquet('/kaggle/input/dacon-click-log-parquet-pure/sample_submission.parquet')

In [31]:
# GPU 디바이스 설정
devices = ['0', '1']

https://catboost.ai/en/docs/concepts/speed-up-training

In [32]:
#import lightgbm as lgb
import torch
import gc
import catboost as cb
from sklearn.model_selection import TimeSeriesSplit

# 시계열 교차 검증 설정
tscv = TimeSeriesSplit(n_splits=7)

auc_scores = []
accuracy_scores = []
pred_result = []
predYN_result = []
feature_importances = []
count = 0 

# 시계열 교차 검증 루프
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    count = count + 1
    print(f"{count} 교차검증 start")
    
    if count == 4 or count == 5 or count == 6 or count == 7 or count == 3:
        
        #실제 count 3은 "전체데이터" 를 가지고 학습하는 걸로 생각하기로 함
        learning_rate = [0.4,0.4,0.4,0.5,0.5,0.5,0.5,0.5]
        if count == 3:
            X_train = X
            y_train = y
        
        print(learning_rate[count])

        model_cb = cb.CatBoostClassifier(
            bootstrap_type='Poisson',
            subsample=0.7,
            task_type='GPU',
            gpu_ram_part=0.99,
            iterations=1500,
            learning_rate = learning_rate[count],
            random_seed=42,
            loss_function='Logloss',
            verbose=1,
            thread_count=-1,
            devices=devices,
            used_ram_limit='28gb',
            eval_metric='AUC',
            early_stopping_rounds=50,
            metric_period=50
        )
        
        if count == 3:
            model_cb = model_cb.fit(X_train, y_train, eval_set=[(X_train, y_train)], cat_features=cat_features)
        else:
            model_cb = model_cb.fit(X_train, y_train, eval_set=[(X_test, y_test)], cat_features=cat_features)
        
        if count == 4 or count == 5 or count == 6 or count == 7:
            accuracy, cnf_matrix, auc = model_eval(model_cb, X_test, y_test)

        #예측결과 저장
        pred = model_cb.predict_proba(test_df)

        # 클래스 예측 값 (확률 값에서 직접 결정)
        predYN = np.where(pred[:, 1] > 0.5, 1, 0)

        # 피처 중요도 평균 계산
        # 피처 중요도 추출
#         feature_importances.append(model_lgbm.get_feature_importance())
#         avg_feature_importances = np.mean(feature_importances, axis=0)
#         feature_names = X.columns
#         feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': avg_feature_importances})
#         feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

#         print("Feature Importances:")
#         print(feature_importance_df)


        pred_result.append(pred[:,1])
        predYN_result.append(predYN)
        
        #명시적 메모리 삭제
        del model_cb, X_train, X_test, y_train, y_test, pred, predYN
        gc.collect()
        torch.cuda.empty_cache()
        
        if count == 4 or count == 5 or count == 6 or count == 7:
            auc_scores.append(auc)
            accuracy_scores.append(accuracy)
            print(f"Fold ROC AUC Score: {auc}")
            print(f"Fold Accuracy Score: {accuracy}")
            print(cnf_matrix)
            print(f"{count} 교차검증 끝")

            # 전체 교차 검증 결과 출력
            print(f"Mean ROC AUC Score: {np.mean(auc_scores)}")
            print(f"Mean Accuracy Score: {np.mean(accuracy_scores)}")  
        
        average_array = np.mean(pred_result, axis=0)
        sample_submission['Click'] = average_array
        sample_submission.to_csv('catboost_sampling_category_balanced_timesplit_fold_7_half_half_iter_1500_all_no_pre_sampling_7_4_5_6_7_all_average.csv', index=False)

1 교차검증 start
2 교차검증 start
3 교차검증 start
0.5
0:	test: 0.7126778	best: 0.7126778 (0)	total: 12.9s	remaining: 5h 23m 12s
50:	test: 0.8137637	best: 0.8137637 (50)	total: 1m 49s	remaining: 52m 2s
100:	test: 0.8245378	best: 0.8245378 (100)	total: 3m 25s	remaining: 47m 27s
150:	test: 0.8277618	best: 0.8277618 (150)	total: 5m 2s	remaining: 45m 3s
200:	test: 0.8315794	best: 0.8315794 (200)	total: 6m 40s	remaining: 43m 10s
250:	test: 0.8333296	best: 0.8336848 (240)	total: 8m 17s	remaining: 41m 17s
300:	test: 0.8347524	best: 0.8347524 (300)	total: 9m 56s	remaining: 39m 37s
350:	test: 0.8363943	best: 0.8363943 (350)	total: 11m 31s	remaining: 37m 45s
400:	test: 0.8375150	best: 0.8376637 (395)	total: 13m 8s	remaining: 36m
450:	test: 0.8383090	best: 0.8383096 (449)	total: 14m 44s	remaining: 34m 16s
500:	test: 0.8386062	best: 0.8386780 (470)	total: 16m 21s	remaining: 32m 37s
550:	test: 0.8393504	best: 0.8393504 (550)	total: 17m 58s	remaining: 30m 57s
600:	test: 0.8394082	best: 0.8395054 (566)	total: 19

In [34]:
np.bincount(predYN_result[0])

array([2863053, 1675488])

In [35]:
len(predYN_result[0])

4538541

In [36]:
sample_submission.head()

Unnamed: 0,ID,Click
0,TEST_0000000,0.733674
1,TEST_0000001,0.447662
2,TEST_0000002,0.369924
3,TEST_0000003,0.674909
4,TEST_0000004,0.719916


In [41]:
2863053/4538541

0.6308311415496742

In [42]:
1675488/4538541

0.36916885845032577