In [None]:
import numpy as np
np.random.seed(1337)

import glob
import cv2
import datetime
import pandas as pd
import time
import scipy
import warnings
import shutil
warnings.filterwarnings("ignore")
from scipy import misc
import matplotlib.pyplot as plt
%matplotlib inline

from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from various_utils_general import *

In [None]:
def split_proper_orig(test_size):
    """Split the ids in the global ``train_ids`` into train and validation lists.

    Each id is mapped to a grouping key ``id[:6] + '/' + id.split('_')[1][2:]``
    (the 6-character class prefix plus part of the file name; presumably this
    identifies the source image that several crops were derived from -- confirm
    against the naming scheme of the data). The unique keys are split with a
    stratified ``train_test_split`` on the class prefix, and every id follows
    its key, so ids sharing a key never end up on both sides.

    NOTE: reads the module-level ``train_ids``; it must be defined before the
    function is called.

    Parameters
    ----------
    test_size :
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    (to_train, to_val) : tuple of lists
        Unique ids for each side, in the order they first appear in
        ``train_ids``.
    """
    print('Validation set size:', test_size)

    def origin_of(img_id):
        # Grouping key shared by all ids that belong together.
        return img_id[:6] + '/' + img_id.split('_')[1][2:]

    # sorted(): iteration order of a set of strings changes between interpreter
    # runs (hash randomization), which would make the split differ from run to
    # run even though random_state is fixed.
    img_names = sorted(set(origin_of(img) for img in train_ids))
    classes = [name[:6] for name in img_names]
    train_split, val_split = train_test_split(img_names, test_size = test_size, stratify = classes,
                                              random_state = 111)

    # Sets give constant-time membership tests in the loop below.
    train_keys = set(train_split)
    val_keys = set(val_split)

    to_train = []
    to_val = []
    seen = set()
    for img in train_ids:
        if img in seen:
            # Keep each id once, like the set-based de-duplication it replaces.
            continue
        seen.add(img)
        orig = origin_of(img)
        if orig in train_keys:
            to_train.append(img)
        if orig in val_keys:
            to_val.append(img)

    assert (len(set(to_train).intersection(set(to_val)))) == 0
    print('Number of training set images: {}, validation set images: {}'.format(len(to_train), len(to_val)))
    return to_train, to_val

def save_splits(tr, val, dst, savename):
    """Append the train and validation ids to text files, one id per line.

    Writes ``<dst>/train_<savename>.txt`` and ``<dst>/valid_<savename>.txt``.
    Files are opened in append mode, so calling this twice with the same
    ``savename`` adds the entries again rather than overwriting them.

    Parameters
    ----------
    tr, val : sequences of str
        Ids to write to the train and validation file respectively.
    dst : str
        Existing directory that receives the files.
    savename : str
        Suffix used in both file names.
    """
    targets = (
        ('{}/train_{}.txt'.format(dst, savename), tr),
        ('{}/valid_{}.txt'.format(dst, savename), val),
    )
    for path, names in targets:
        if len(names) == 0:
            # Nothing to write: skip so that no empty file is created.
            continue
        # Open each file once instead of once per written line.
        with open(path, 'a') as out:
            out.writelines(name + '\n' for name in names)
    return

def split_dataset2(trfile, valfile, full_path, train_path, valid_path):
    """Copy the listed images from ``full_path`` into train and validation trees.

    Each entry of ``trfile`` / ``valfile`` is a ``'<class>/<file name>'`` string.
    The file ``<full_path>/<class>/<file name>`` is copied to the same relative
    location under ``train_path`` (for ``trfile``) or ``valid_path`` (for
    ``valfile``).

    Parameters
    ----------
    trfile, valfile : iterables of str
        Entries to copy. Surrounding whitespace (e.g. the newline kept when
        iterating over an open text file) is stripped and blank entries are
        skipped.
    full_path : str
        Root directory holding the source images.
    train_path, valid_path : str
        Destination roots; they and their ``Type_1``/``Type_2``/``Type_3``
        sub-folders are created when missing.
    """
    # Local import: this notebook has no explicit top-level `import os`.
    import os

    flds = ['Type_1', 'Type_2', 'Type_3']
    for root in (train_path, valid_path):
        for fld in flds:
            # Create the class folder INSIDE the destination root (a bare
            # os.mkdir(fld) would create it in the current working directory).
            os.makedirs(os.path.join(root, fld), exist_ok=True)

    def copy_listed(lines, dst_root):
        # Copy every '<class>/<file name>' entry from full_path to dst_root.
        for line in lines:
            line = line.strip()
            if not line:
                continue
            cols = line.split('/')
            src = "{}/{}/{}".format(full_path, cols[0], cols[1])
            dst = "{}/{}/{}".format(dst_root, cols[0], cols[1])
            shutil.copy(src, dst)

    copy_listed(trfile, train_path)
    copy_listed(valfile, valid_path)
    return

def split_proper_skf(train_ids, num_folds):
    """Build stratified K-fold train/validation splits grouped by a per-id key.

    Each id is mapped to a grouping key ``id[:6] + '/' + id.split('_')[1][2:]``
    (the 6-character class prefix plus part of the file name; presumably this
    identifies the source image that several crops were derived from -- confirm
    against the naming scheme of the data). ``StratifiedKFold`` is run over the
    unique keys, stratified on the class prefix, and every id follows its key,
    so ids sharing a key are never split across train and validation.

    Parameters
    ----------
    train_ids : sequence of str
        Image ids to split.
    num_folds : int
        Number of folds, passed to ``StratifiedKFold`` as ``n_splits``.

    Returns
    -------
    folds_train_imgs, folds_val_imgs : list of lists
        Per fold, the unique ids on each side, in order of first appearance in
        ``train_ids``.
    folds_train_inds, folds_val_inds : list of lists of int
        Per fold, ascending positions into ``train_ids`` of the training ids
        and of all remaining ids.
    """
    folds_train_imgs = []
    folds_val_imgs = []
    folds_train_inds = []
    folds_val_inds = []

    train_ids = np.array(train_ids)
    # Grouping key for every id, computed once and reused for all folds.
    origins = [str(img[:6] + '/' + img.split('_')[1][2:]) for img in train_ids]

    # sorted(): iteration order of a set of strings changes between interpreter
    # runs (hash randomization), which would make the folds differ from run to
    # run even though random_state is fixed.
    img_names = np.array(sorted(set(origins)))
    classes = np.array([name[:6] for name in img_names.tolist()])

    skf = StratifiedKFold(n_splits = num_folds, random_state = 111, shuffle = True)
    print('Running {}-Fold data split'.format(num_folds))

    for fold_number, (train_index, test_index) in enumerate(skf.split(img_names, classes), start=1):
        print('Split dataset for fold:', fold_number)
        # Plain-str sets give constant-time, exact-match membership tests.
        train_keys = set(img_names[train_index].tolist())
        val_keys = set(img_names[test_index].tolist())

        # dict.fromkeys de-duplicates while keeping first-appearance order.
        to_train = list(dict.fromkeys(
            img for img, orig in zip(train_ids, origins) if orig in train_keys))
        to_val = list(dict.fromkeys(
            img for img, orig in zip(train_ids, origins) if orig in val_keys))
        assert (len(set(to_train).intersection(set(to_val)))) == 0
        print('Number of training set images: {}, validation set images: {}'.format(len(to_train), len(to_val)))
        folds_train_imgs.append(to_train)
        folds_val_imgs.append(to_val)

        # An index is a training index exactly when its id is in to_train
        # (exact match, not a substring test); validation indices are the rest.
        inds_train = [i for i, orig in enumerate(origins) if orig in train_keys]
        inds_val = [i for i, orig in enumerate(origins) if orig not in train_keys]
        folds_train_inds.append(inds_train)
        folds_val_inds.append(inds_val)

    return folds_train_imgs, folds_val_imgs, folds_train_inds, folds_val_inds

In [None]:
# Root folder of the training data and folder that receives the split lists.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/data/training_data/'
save_dst = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Intel_Cervix/scripts/various/'

# Sub-folders of `src`: the full image set, and the train / validation targets.
full_train, train_path, val_path = (
    '{}{}'.format(src, sub)
    for sub in ('train_crops_vgg_299_oversampled', 'train_set', 'valid_set')
)


In [None]:
# Load the full training set. `load_train` is not defined in this notebook;
# presumably it comes from `from various_utils_general import *` -- confirm.
# `train_ids` is read as a global by split_proper_orig, so this cell must run
# before that function is called.
X, y, train_ids = load_train(full_train)

In [None]:
# Single hold-out split with test_size=0.15 (uses the global train_ids).
tr, val = split_proper_orig(0.15)
# Disabled: would copy the files listed in tr / val from full_train into
# train_path and val_path.
#split_dataset2(tr, val, full_train, train_path, val_path)

In [None]:
# 5-fold split; then count the training ids of the first fold per 6-character
# class prefix.
tr_imgs, val_imgs, tr_inds, val_inds = split_proper_skf(train_ids, 5)
tr1 = pd.Series(tr_imgs[0]).apply(lambda x: x[:6])
tr1.value_counts()

In [None]:
# Same per-class-prefix count for the validation ids of the first fold.
v1 = pd.Series(val_imgs[0]).apply(lambda x: x[:6])
v1.value_counts()