In [1]:
import pandas as pd
import os
import numpy as np
from scipy.stats import rankdata

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

SEED = 2020

def seed_everything(SEED):
    """Seed the random number generators used by this notebook.

    Seeds Python's built-in ``random`` module and NumPy's global generator,
    and exports ``PYTHONHASHSEED`` to the process environment.

    Parameters
    ----------
    SEED : int
        Seed value applied to every generator.
    """
    import random  # local import keeps this cell self-contained

    random.seed(SEED)
    np.random.seed(SEED)
    # PYTHONHASHSEED is read when an interpreter starts, so setting it here
    # only affects processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(SEED)

seed_everything(SEED)

In [64]:
# Load the training metadata; its image_name and tfrecord columns are joined
# onto the merged out-of-fold predictions further down.
train_df = pd.read_csv('../../TRIPLE S3FOLD/TSKFold_train.csv')
train_df.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height,patient_code
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000,1
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053,2
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053,3
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000,4


In [65]:
FOLDS = 5

# The split is made over the integers 0..14 rather than over rows; a later cell
# matches these ids against the `tfrecord` column to select rows per fold.
skf = KFold(n_splits=FOLDS,shuffle=True,random_state=SEED)

# Preview which ids fall on the training and validation side of each fold.
for fold,(idxT,idxV) in enumerate(skf.split(np.arange(15))):
    print(f"#### FOLD: {fold}")
    print(f"#### Train idx: {idxT}")
    # idxV holds the held-out ids, so label it as validation (was "Train idx").
    print(f"#### Val idx: {idxV}\n")

#### FOLD: 0
#### Train idx: [ 0  3  4  5  6  7  8 10 11 12 13 14]
#### Train idx: [1 2 9]

#### FOLD: 1
#### Train idx: [ 0  1  2  3  5  6  7  8  9 10 12 14]
#### Train idx: [ 4 11 13]

#### FOLD: 2
#### Train idx: [ 0  1  2  3  4  6  8  9 10 11 12 13]
#### Train idx: [ 5  7 14]

#### FOLD: 3
#### Train idx: [ 0  1  2  3  4  5  7  8  9 11 13 14]
#### Train idx: [ 6 10 12]

#### FOLD: 4
#### Train idx: [ 1  2  4  5  6  7  9 10 11 12 13 14]
#### Train idx: [0 3 8]



In [66]:
# Root folder holding the collected submissions; list its sub-folders.
PATH = 'AllSubmissions/'

os.listdir(PATH)

['1024-Series',
 '384-Series',
 '768-Series',
 'B7-Series',
 'Bhautik',
 'BS32 BCE',
 'BS64',
 'Colab SEED 18082020',
 'FocalLoss',
 'New-Seed',
 'Seed 420',
 'Single-Model',
 'Single-Model-2018-2020']

In [67]:
# Walk PATH/<folder>/<model>, print every model path and count them.
count = 0
for folder in os.listdir(PATH):
    folder = folder + '/'
    model_root = PATH + folder
    for model in os.listdir(model_root):
        print(model_root + model)
        count = count + 1
print("\nTotal Submissions: ",count)

AllSubmissions/1024-Series/H2
AllSubmissions/1024-Series/H3
AllSubmissions/1024-Series/H4
AllSubmissions/384-Series/S1
AllSubmissions/384-Series/S2
AllSubmissions/384-Series/S3
AllSubmissions/384-Series/S4
AllSubmissions/384-Series/S5
AllSubmissions/384-Series/S6
AllSubmissions/384-Series/S7
AllSubmissions/768-Series/P1
AllSubmissions/768-Series/P1_1
AllSubmissions/768-Series/P2
AllSubmissions/768-Series/P3
AllSubmissions/768-Series/P4
AllSubmissions/768-Series/P5
AllSubmissions/768-Series/P6
AllSubmissions/768-Series/P7
AllSubmissions/B7-Series/E1
AllSubmissions/B7-Series/E2
AllSubmissions/B7-Series/E3
AllSubmissions/B7-Series/E4
AllSubmissions/B7-Series/E5
AllSubmissions/B7-Series/E6
AllSubmissions/Bhautik/S33_B5_384_0.9135
AllSubmissions/Bhautik/S36_B5_384_0.9151
AllSubmissions/Bhautik/S52_B5_384_0.9119
AllSubmissions/Bhautik/S55_B5_256_0.9029
AllSubmissions/Bhautik/S57_B5_384_0.9220
AllSubmissions/Bhautik/V1-9_B5_384_0.908
AllSubmissions/BS32 BCE/B0 128
AllSubmissions/BS32 BCE/B0 1

In [68]:
# Row count an OOF file must have to be used; files with any other length are
# ignored.
EXPECTED_OOF_ROWS = 32692

# oofs[k] and testmeans[k] must describe the same model: the following cells
# zip the two lists together and merge them column by column.
oofs = []
testmeans = []

for folder in os.listdir(PATH):
    folder += '/'
    if 'Bhautik' in folder:
        # This folder is deliberately left out of the ensemble.
        continue
    for model in os.listdir(PATH+folder):
        model += '/'
        # Collect per model first so a rejected OOF file cannot shift the
        # pairing between the two global lists.
        model_oofs = []
        model_testmeans = []
        for filename in os.listdir(PATH+folder+model):
            if 'oof.csv' in filename or 'oof_' in filename:
                oof = pd.read_csv(PATH+folder+model+filename)
                if oof.shape[0] == EXPECTED_OOF_ROWS:
                    model_oofs.append(oof)
            if 'submission' in filename or 'TESTPREDS_MEAN' in filename:
                testmean = pd.read_csv(PATH+folder+model+filename)
                model_testmeans.append(testmean)
        if len(model_oofs) != len(model_testmeans):
            # Without a one-to-one match the train and test columns of this
            # model cannot be aligned, so drop the model and say so.
            print(f"Skipping {PATH+folder+model}: {len(model_oofs)} usable oof file(s) "
                  f"vs {len(model_testmeans)} test file(s)")
            continue
        oofs.extend(model_oofs)
        testmeans.extend(model_testmeans)

assert len(oofs)==len(testmeans),"Length of oof and testmean is not same"

In [74]:
# Build one wide frame per split: a column per base model, joined on image_name.
# Model k (0-based position in `oofs` / `testmeans`) becomes `pred_{k+1}` in the
# OOF frame and `target_{k+1}` in the test frame. Columns are renamed explicitly
# before merging instead of relying on merge suffixes, which only fire on name
# collisions and leave the last model unsuffixed when the model count is odd.
oof_merged = None
testmean_merged = None
for i,(oof,testmean) in enumerate(zip(oofs,testmeans)):
    model_no = i + 1
    # Only the first OOF frame contributes the ground-truth `target` column.
    oof_columns = ['image_name','pred','target'] if i==0 else ['image_name','pred']
    oof_part = oof[oof_columns].rename(columns={'pred': f'pred_{model_no}'})
    # In the test files `target` is the model's prediction.
    test_part = testmean[['image_name','target']].rename(columns={'target': f'target_{model_no}'})
    if i==0:
        oof_merged = oof_part
        testmean_merged = test_part
    else:
        # Default inner join: an image missing from any model is dropped.
        oof_merged = oof_merged.merge(oof_part,on='image_name')
        testmean_merged = testmean_merged.merge(test_part,on='image_name')

oof_merged.shape,testmean_merged.shape

((32692, 145), (10982, 144))

In [75]:
# Peek at the merged out-of-fold predictions (one pred column per model).
oof_merged.head()

Unnamed: 0,image_name,pred_1,target,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,...,pred_134,pred_135,pred_136,pred_137,pred_138,pred_139,pred_140,pred_141,pred_142,pred
0,ISIC_2637011,0.016478,0,0.00705,0.000806,0.119476,0.052763,0.126852,0.105021,0.058444,...,0.116084,0.163963,0.212014,0.090042,0.120128,0.161684,0.015931,0.238945,0.180453,0.189117
1,ISIC_0077735,0.017801,0,0.006755,0.008982,0.029501,0.027313,0.00726,0.00517,0.024703,...,0.026943,0.041004,0.093105,0.015465,0.037848,0.037847,0.040897,0.10763,0.045257,0.022969
2,ISIC_0082348,0.012165,0,0.002478,0.018119,0.068061,0.064136,0.032429,0.054077,0.089793,...,0.047519,0.03749,0.106432,0.082359,0.046037,0.038296,0.022035,0.201875,0.088861,0.081858
3,ISIC_0099474,0.010688,0,0.003871,0.00231,0.011365,0.020736,0.004257,0.014359,0.022587,...,0.039168,0.043445,0.095313,0.017875,0.044053,0.076389,0.012392,0.14014,0.005804,0.052264
4,ISIC_0076995,0.009003,0,0.003603,0.000539,0.01485,0.010314,0.012151,0.025851,0.011204,...,0.026276,0.068036,0.026732,0.022651,0.048181,0.062737,0.00729,0.032389,0.03638,0.030496


In [76]:
# Peek at the merged test predictions (one target column per model).
testmean_merged.head()

Unnamed: 0,image_name,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,...,target_134,target_135,target_136,target_137,target_138,target_139,target_140,target_141,target_142,target
0,ISIC_0052060,0.011757,0.009953,0.007127,0.024135,0.012282,0.033421,0.022,0.053995,0.027994,...,0.060337,0.029952,0.039425,0.022526,0.048809,0.045656,0.011883,0.044533,0.041965,0.060253
1,ISIC_0052349,0.003973,0.001314,0.002778,0.015675,0.008139,0.016353,0.031278,0.040116,0.021963,...,0.051461,0.048276,0.089119,0.01856,0.041281,0.044261,0.014446,0.053979,0.020867,0.038558
2,ISIC_0058510,0.007838,0.00044,0.004205,0.01057,0.011728,0.004131,0.006701,0.012446,0.008037,...,0.014313,0.014464,0.077029,0.019143,0.03369,0.027848,0.007056,0.053906,0.023969,0.017958
3,ISIC_0073313,0.010549,0.003661,0.015582,0.008411,0.021144,0.010612,0.010632,0.013248,0.011429,...,0.02133,0.012517,0.047223,0.015696,0.017512,0.025923,0.025608,0.043575,0.021752,0.023069
4,ISIC_0073502,0.052886,0.070886,0.078457,0.037389,0.023736,0.051038,0.043812,0.07809,0.087259,...,0.097138,0.129754,0.089629,0.070921,0.09505,0.09534,0.034653,0.044512,0.080858,0.071226


In [77]:
# Attach each image's tfrecord id to the merged OOF predictions (inner join on
# image_name). Not idempotent: re-running on the already-merged frame would
# yield suffixed tfrecord_x / tfrecord_y columns.
oof_merged = pd.merge(
    train_df[['image_name','tfrecord']],
    oof_merged,
    how='inner',
    on='image_name',
)
oof_merged.head()

Unnamed: 0,image_name,tfrecord,pred_1,target,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,...,pred_134,pred_135,pred_136,pred_137,pred_138,pred_139,pred_140,pred_141,pred_142,pred
0,ISIC_2637011,0,0.016478,0,0.00705,0.000806,0.119476,0.052763,0.126852,0.105021,...,0.116084,0.163963,0.212014,0.090042,0.120128,0.161684,0.015931,0.238945,0.180453,0.189117
1,ISIC_0015719,0,0.008521,0,0.001811,0.001913,0.005429,0.007493,0.002952,0.006083,...,0.009411,0.012234,0.054228,0.015042,0.02414,0.013983,0.011877,0.037139,0.00556,0.019959
2,ISIC_0052212,6,0.008936,0,0.004734,0.004446,0.015126,0.022872,0.028269,0.075438,...,0.035733,0.060238,0.056842,0.018822,0.066069,0.049965,0.013019,0.079763,0.052472,0.049973
3,ISIC_0068279,0,0.084264,0,0.109574,0.083558,0.101859,0.027982,0.073806,0.014562,...,0.09333,0.087733,0.144918,0.093726,0.096819,0.082433,0.037024,0.075306,0.07831,0.112341
4,ISIC_0074268,11,0.004107,0,0.000947,0.001187,0.005624,0.004659,0.003935,0.005131,...,0.007273,0.010701,0.056443,0.009375,0.014111,0.05155,0.007981,0.036434,0.00582,0.015967


In [79]:
# The merge loop can leave the last model's column as a bare `pred`. Give it the
# next running index, derived from the number of prediction columns instead of
# a hard-coded 143, and leave the frame untouched when the last column is
# already named (for example when this cell is re-run).
column_names = list(oof_merged.columns)
if column_names[-1] == 'pred':
    n_pred_columns = sum(name.startswith('pred') for name in column_names)
    column_names[-1] = f'pred_{n_pred_columns}'

oof_merged.columns = column_names
oof_merged.head()

Unnamed: 0,image_name,tfrecord,pred_1,target,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,...,pred_134,pred_135,pred_136,pred_137,pred_138,pred_139,pred_140,pred_141,pred_142,pred_143
0,ISIC_2637011,0,0.016478,0,0.00705,0.000806,0.119476,0.052763,0.126852,0.105021,...,0.116084,0.163963,0.212014,0.090042,0.120128,0.161684,0.015931,0.238945,0.180453,0.189117
1,ISIC_0015719,0,0.008521,0,0.001811,0.001913,0.005429,0.007493,0.002952,0.006083,...,0.009411,0.012234,0.054228,0.015042,0.02414,0.013983,0.011877,0.037139,0.00556,0.019959
2,ISIC_0052212,6,0.008936,0,0.004734,0.004446,0.015126,0.022872,0.028269,0.075438,...,0.035733,0.060238,0.056842,0.018822,0.066069,0.049965,0.013019,0.079763,0.052472,0.049973
3,ISIC_0068279,0,0.084264,0,0.109574,0.083558,0.101859,0.027982,0.073806,0.014562,...,0.09333,0.087733,0.144918,0.093726,0.096819,0.082433,0.037024,0.075306,0.07831,0.112341
4,ISIC_0074268,11,0.004107,0,0.000947,0.001187,0.005624,0.004659,0.003935,0.005131,...,0.007273,0.010701,0.056443,0.009375,0.014111,0.05155,0.007981,0.036434,0.00582,0.015967


In [80]:
# The merge loop can leave the last model's column as a bare `target`. Give it
# the next running index, derived from the number of target columns instead of
# a hard-coded 143, and leave the frame untouched when the last column is
# already named (for example when this cell is re-run).
column_names = list(testmean_merged.columns)
if column_names[-1] == 'target':
    n_target_columns = sum(name.startswith('target') for name in column_names)
    column_names[-1] = f'target_{n_target_columns}'

testmean_merged.columns = column_names
testmean_merged.head()

Unnamed: 0,image_name,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,...,target_134,target_135,target_136,target_137,target_138,target_139,target_140,target_141,target_142,target_143
0,ISIC_0052060,0.011757,0.009953,0.007127,0.024135,0.012282,0.033421,0.022,0.053995,0.027994,...,0.060337,0.029952,0.039425,0.022526,0.048809,0.045656,0.011883,0.044533,0.041965,0.060253
1,ISIC_0052349,0.003973,0.001314,0.002778,0.015675,0.008139,0.016353,0.031278,0.040116,0.021963,...,0.051461,0.048276,0.089119,0.01856,0.041281,0.044261,0.014446,0.053979,0.020867,0.038558
2,ISIC_0058510,0.007838,0.00044,0.004205,0.01057,0.011728,0.004131,0.006701,0.012446,0.008037,...,0.014313,0.014464,0.077029,0.019143,0.03369,0.027848,0.007056,0.053906,0.023969,0.017958
3,ISIC_0073313,0.010549,0.003661,0.015582,0.008411,0.021144,0.010612,0.010632,0.013248,0.011429,...,0.02133,0.012517,0.047223,0.015696,0.017512,0.025923,0.025608,0.043575,0.021752,0.023069
4,ISIC_0073502,0.052886,0.070886,0.078457,0.037389,0.023736,0.051038,0.043812,0.07809,0.087259,...,0.097138,0.129754,0.089629,0.070921,0.09505,0.09534,0.034653,0.044512,0.080858,0.071226


In [81]:
# Persist the merged base-model predictions (train OOF and test) without the index.
oof_merged.to_csv('oof_base.csv',index=False)
testmean_merged.to_csv('test_base.csv',index=False)

In [62]:
FOLDS = 5

# Same splitter configuration as the fold preview above: the split is made over
# the integers 0..14, which are then matched against the `tfrecord` column.
skf = KFold(n_splits=FOLDS,shuffle=True,random_state=SEED)

# Stacking features: every column whose name contains 'pred'.
pred_columns = [column for column in oof_merged.columns if 'pred' in column]

cvScore = []        # validation AUC per fold
valpreds = []       # per-fold validation predictions
valimagenames = []  # per-fold image names, in the same row order as valpreds
valtargets = []     # per-fold validation targets

for fold,(idxT,idxV) in enumerate(skf.split(np.arange(15))):
    print(f"#### FOLD: {fold}")
    print(f"#### Train idx: {idxT}")
    # idxV holds the held-out ids, so label it as validation (was "Train idx").
    print(f"#### Val idx: {idxV}\n")

    # Filter the rows once per side instead of repeating the boolean mask.
    train_rows = oof_merged[oof_merged.tfrecord.isin(idxT)]
    val_rows = oof_merged[oof_merged.tfrecord.isin(idxV)]
    X_train, y_train = train_rows[pred_columns], train_rows.target
    X_val, y_val = val_rows[pred_columns], val_rows.target

    # Level-2 model: L2-penalised logistic regression without an intercept.
    clf = LogisticRegression(penalty='l2',
                             dual=False,
                             tol=0.0001,
                             C=1e-6,
                             fit_intercept=False,
                             intercept_scaling=1,
                             class_weight=None,
                             random_state=SEED,
                             solver='lbfgs',
                             max_iter=50,
                             multi_class='auto',
                             verbose=0,
                             warm_start=False,
                             n_jobs=-1,
                             l1_ratio=None)

    clf.fit(X_train,y_train)

    # NOTE(review): the recorded run printed a Train AUC of about 0.05 next to a
    # Val AUC of about 0.95 - confirm after re-running on a fresh kernel.
    print("#### Train AUC: ",roc_auc_score(y_train,clf.predict_proba(X_train)[:,1]))

    pred = clf.predict_proba(X_val)[:,1]
    valpreds.append(pred)
    valimagenames.append(val_rows.image_name.values)
    valtargets.append(y_val)

    valauc = roc_auc_score(y_val,pred)
    cvScore.append(valauc)
    print("#### Val AUC: ",valauc)

    print()

# Pool all folds and score the out-of-fold predictions as a whole.
valtargets = np.concatenate(valtargets)
valpreds = np.concatenate(valpreds)
auc = roc_auc_score(valtargets,valpreds)
print(f"#### CV: {auc}")

#### FOLD: 0
#### Train idx: [ 0  3  4  5  6  7  8 10 11 12 13 14]
#### Train idx: [1 2 9]

#### Train AUC:  0.058075993035109706
#### Val AUC:  0.9494940029985008

#### FOLD: 1
#### Train idx: [ 0  1  2  3  5  6  7  8  9 10 12 14]
#### Train idx: [ 4 11 13]

#### Train AUC:  0.05374302171542575
#### Val AUC:  0.9312213338208487

#### FOLD: 2
#### Train idx: [ 0  1  2  3  4  6  8  9 10 11 12 13]
#### Train idx: [ 5  7 14]

#### Train AUC:  0.05887719985076399
#### Val AUC:  0.9529652048877951

#### FOLD: 3
#### Train idx: [ 0  1  2  3  4  5  7  8  9 11 13 14]
#### Train idx: [ 6 10 12]

#### Train AUC:  0.05417721837001306
#### Val AUC:  0.9338870449707505

#### FOLD: 4
#### Train idx: [ 1  2  4  5  6  7  9 10 11 12 13 14]
#### Train idx: [0 3 8]

#### Train AUC:  0.05783961899417792
#### Val AUC:  0.9484585049653543

#### CV: 0.9434690585705533


In [82]:
# Display the pooled validation targets produced by the stacking cell.
valtargets

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [84]:
# Collect the stacked out-of-fold predictions together with their targets.
# image_name is not included: valimagenames is a per-fold list of arrays, not a
# single flat array like valpreds and valtargets.
oof = pd.DataFrame({
    'pred': valpreds,
    'target': valtargets,
})