# Split Data into Train and Test Set

The images extracted from two datasets (fer_ckplus_kdef and mma) have been saved in the folder 'dataset'. This notebook randomly selects images for the train and test datasets. The selected images are saved in two folders: train and test.

Datasets:
 - mma dataset: https://www.kaggle.com/mahmoudima/mma-facial-expression
 - fer_ckplus_kdef: https://www.kaggle.com/sudarshanvaidya/corrective-reannotation-of-fer-ck-kdef

In [1]:
import os
import pandas as pd
    
# Collect the class folders inside 'dataset'.
# - os.listdir returns entries in arbitrary order, so the list is sorted to
#   make the row order of the data frame the same on every machine/run.
# - Non-directory entries (e.g. a stray .DS_Store) are skipped; they would
#   otherwise make the inner os.listdir call fail.
folders = sorted(
    entry for entry in os.listdir('dataset')
    if os.path.isdir(os.path.join('dataset', entry))
)

paths = list()
img_names = list()
labels = list()

# get a list of image names in the folder 'dataset'
for folder in folders:
    # keep the trailing '/' - later cells build file names as path + img_name
    path = 'dataset/' + folder + '/'
    file_names = sorted(os.listdir(path))

    for file_name in file_names:
        paths.append(path)
        img_names.append(file_name)
        # the label is the folder name without a source suffix,
        # e.g. both 'anger' and 'anger_mma' give the label 'anger'
        labels.append(folder.split('_')[0])

# create a data frame that stores each image's path, name and label
df = pd.DataFrame({'path': paths, 'img_name': img_names, 'label': labels})
print(df.shape)
df.head()

(58922, 3)


Unnamed: 0,path,img_name,label
0,dataset/anger/,AF01ANS.png,anger
1,dataset/anger/,AF02ANS.png,anger
2,dataset/anger/,AF03ANS.png,anger
3,dataset/anger/,AF04ANS.png,anger
4,dataset/anger/,AF05ANS.png,anger


In [2]:
# number of images found in each source folder, largest first
df['path'].value_counts()

dataset/happiness/         9049
dataset/surprise_mma/      8113
dataset/anger_mma/         6566
dataset/sadness/           5403
dataset/neutrality/        5072
dataset/fear_mma/          4859
dataset/anger/             4725
dataset/disgust_mma/       4542
dataset/surprise/          4226
dataset/fear/              3454
dataset/neutrality_mma/    1988
dataset/disgust/            795
dataset/contempt/           130
Name: path, dtype: int64

In [3]:
# desired number of images per class
n_sample = 5337

# fixed seed so that re-running the notebook selects the same images
SEED = 123

temp_dfs = list()
df2 = None

train_data = None
test_data = None

for path in list(df['path'].unique()):
    # '_mma' folders are never sampled on their own: they are only used to
    # top up a class that is too small. 'contempt' is left out entirely.
    if '_mma' in path or 'contempt' in path:
        continue

    temp = df[df['path'] == path]
    size = temp.shape[0]

    if size > n_sample:
        # under sample classes that have more than n_sample images
        temp = temp.sample(n=n_sample, random_state=SEED)
    elif size < n_sample:
        # for classes that have fewer than n_sample images, randomly draw
        # the missing number of images from the matching '<class>_mma/' folder
        # (DataFrame.sample raises ValueError if that folder holds too few)
        path2 = path[:-1] + '_mma/'
        temp2 = df[df['path'] == path2]
        n_sample2 = n_sample - size

        temp2 = temp2.sample(n=n_sample2, random_state=SEED)
        temp = pd.concat([temp, temp2], axis=0)
    # a class with exactly n_sample images is kept unchanged

    temp_dfs.append(temp)

# concatenate all per-class frames once, with a fresh 0..n-1 index
if temp_dfs:
    df2 = pd.concat(temp_dfs, axis=0, ignore_index=True)

In [4]:
# (rows, columns) of the sampled data frame
df2.shape

(37359, 3)

In [5]:
from sklearn.model_selection import train_test_split

# features: every column except the label; target: the label column
X = df2.drop(columns=['label'])
y = df2['label']

# split data into 70% train and 30% test, keeping the label proportions
# identical in both parts (stratify) and the split repeatable (random_state)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.70,
    test_size=0.30,
    stratify=y,
    random_state=123,
)

In [6]:
# re-attach the labels to the features and renumber the rows from 0
train = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)
test = pd.concat([x_test, y_test], axis=1).reset_index(drop=True)

# the output folder must exist before to_pickle is called, otherwise a
# fresh run fails with FileNotFoundError
os.makedirs('data', exist_ok=True)

# save data frame as pickle file
train.to_pickle("data/train_data.pkl")
test.to_pickle("data/test_data.pkl")

In [8]:
from PIL import Image

paths = ['data/train/', 'data/test/']
data = [train, test]

# Write every selected image as a 224x224 gray-scale PNG into
# data/<train|test>/<label>/<img_name>.
# NOTE(review): the file keeps its original name, so (a) a name with a
# non-PNG extension still receives PNG content, and (b) two images with the
# same name and label coming from different source folders would overwrite
# each other - confirm that file names are unique across source folders.
for path, frame in zip(paths, data):
    os.makedirs(path, exist_ok=True)

    for index, row in frame.iterrows():
        img_file = row['path'] + row['img_name']

        # save image into train/test directory under folder for specified label
        save_path = path + row['label'] + '/'
        os.makedirs(save_path, exist_ok=True)

        # the context manager closes the source file even if processing fails
        with Image.open(img_file) as img:
            # convert image to gray scale
            gray = img.convert('L')

            # resize image to 224x224; LANCZOS is the filter formerly exposed
            # as Image.ANTIALIAS, which was removed in Pillow 10
            resized = gray.resize((224, 224), Image.LANCZOS)

            # save image
            resized.save(save_path + row['img_name'], 'PNG')