# Data

In [1]:
# load the packages
from pyimagesearch import config
from pyimagesearch import top
from pyimagesearch.resnet import ResNet
from imutils import paths
import random
import shutil
import os
import cv2
import re
from itertools import chain

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler
from keras.optimizers import SGD
from keras import optimizers
from keras.applications import VGG16
from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# Plotting setup: seaborn 'white' style, matplotlib figures rendered inline.
sns.set_style('white')
%matplotlib inline
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation/parser warnings that may be worth seeing.
import warnings
warnings.filterwarnings('ignore')
# Let pandas display up to 100 columns before truncating.
pd.set_option('display.max_columns', 100)
# Seed constant, presumably meant for the random operations in this notebook — confirm where it is consumed.
SEED = 42

Using TensorFlow backend.


## Load Item Data

In [2]:
# Load the raw item dump.
# The separator 'delimiter' is presumably a string that never occurs in the file,
# so each line arrives as one string in column 0; it is then split on whitespace.
items = (
    pd.read_csv("data/Taobao Clothes Matching Data/dim_items（new).txt", sep='delimiter', header=None)[0]
    .apply(lambda x: x.split())
)

# First token -> item id, second token -> category id, everything after -> terms.
# NOTE(review): in the displayed rows `terms` looks like a one-element list holding a
# comma-joined string; confirm whether it should be split on ',' as well.
item_df = pd.DataFrame({
    'item_id': items.map(lambda fields: fields[0]),
    'cat_id': items.map(lambda fields: fields[1]),
    'terms': items.map(lambda fields: fields[2:]),
})
item_df.head()

Unnamed: 0,item_id,cat_id,terms
0,29,155,"[123950,53517,106068,59598,7503,171811,25618,1..."
1,49,228,"[73035,33202,116593,48909,92233,181255,127004,..."
2,59,284,"[123950,38910,22837,5026,15459,47776,158346,10..."
3,109,461,"[122071,35420,123950,27207,116593,24893,31897,..."
4,119,368,"[48909,125706,116593,179606,20819,158346,15722..."


In [3]:
# (rows, columns) of the parsed item table
item_df.shape

(499983, 3)

In [4]:
# count the distinct category ids present in the item table
item_df['cat_id'].nunique()

281

In [5]:
# the 15 most frequent categories with their item counts
# (value_counts returns the counts sorted in descending order)
top_cat = item_df['cat_id'].value_counts().head(15)
top_cat

368    59380
52     41859
284    32080
461    28388
111    27604
505    19985
48     19348
155    17720
228    17617
160    15930
137    15735
33     15638
42     15003
516    13285
50     12451
Name: cat_id, dtype: int64

In [6]:
# keep only the id / category columns of items that belong to one of the top categories
in_top_cat = item_df['cat_id'].isin(top_cat.index)
top_item_df = item_df.loc[in_top_cat, ['item_id', 'cat_id']]
top_item_df.shape

(352023, 2)

In [7]:
# plain Python list of the retained item ids
top_item = top_item_df['item_id'].tolist()

## Filter Image

Filter for items in the top 15 categories.

In [None]:
# Paths used by the image-filtering step.

# initialize the path to the *original* input directory of images
ORIG_INPUT_DATASET = 'data/tianchi_fm_img2_1'

# initialize the base path to the directory that will contain the derived image folders
BASE_PATH = 'data'

# directory that receives the filtered images
FILTER_PATH = os.path.join(BASE_PATH, 'filter')

# Create the filter directory if it is missing. The FILTER_PATH defined in this
# cell is used directly so the path shown here is the path that gets created.
# NOTE(review): confirm this matches FILTER_PATH in the pyimagesearch `top` module.
if not os.path.exists(FILTER_PATH):
    print("[INFO] 'creating {}' directory".format(FILTER_PATH))
    os.makedirs(FILTER_PATH)

In [None]:
# Copy every image whose item id belongs to a top category into
# FILTER_PATH/<cat_id>/<filename>.
# Images are listed from ORIG_INPUT_DATASET and written under FILTER_PATH, both
# defined in the configuration cell of this section.
# NOTE(review): confirm these values match the pyimagesearch `top` / `config` modules.
imagePaths = list(paths.list_images(ORIG_INPUT_DATASET))

# item_id -> cat_id lookup, built once so each image costs a dict lookup rather
# than scans over the full item table. drop_duplicates keeps the first row per
# item_id, i.e. the same row a `.values[0]` selection on the frame would pick.
label_by_item = (
    top_item_df.drop_duplicates('item_id')
    .set_index('item_id')['cat_id']
    .to_dict()
)

for inputPath in imagePaths:

    # extract the filename of the input image and the item id it encodes
    # (the part of the filename before the first '.')
    filename = inputPath.split(os.path.sep)[-1]
    file_id = filename.split('.')[0]

    # skip images whose item is not in one of the top categories
    if file_id not in label_by_item:
        continue
    label = label_by_item[file_id]

    # build the path to the label directory
    labelPath = os.path.join(FILTER_PATH, label)

    # if the label output directory does not exist, create it
    if not os.path.exists(labelPath):
        print("[INFO] 'creating {}' directory".format(labelPath))
        os.makedirs(labelPath)

    # construct the path to the destination image and then copy the image itself
    p = os.path.join(labelPath, filename)
    shutil.copy2(inputPath, p)

## Train, Val, Test Split

In [None]:
# --- configuration for the train / validation / test split ---

# path to the *original* input directory of images for this step
ORIG_INPUT_DATASET = 'data/filter'

# base path of the *new* directories that will hold the images after the split
BASE_PATH = 'data'

# fraction of the data used for training
TRAIN_SPLIT = 0.8

# fraction of the *training* data that is set aside for validation
VAL_SPLIT = 0.1

# training, validation and testing directories, each directly under BASE_PATH
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, split_dir)
    for split_dir in ('training', 'validation', 'testing')
)

In [None]:
# grab the paths to all input images in the original input directory and shuffle them.
# The listing is sorted before shuffling: a seeded shuffle is only reproducible when the
# list it permutes starts in the same order on every run, and list_images presumably
# walks the directory tree, whose enumeration order is not guaranteed to be stable.
imagePaths = sorted(paths.list_images(config.ORIG_INPUT_DATASET))

# seed with the notebook-level SEED constant rather than a duplicated literal
random.seed(SEED)
random.shuffle(imagePaths)

In [None]:
# cut the shuffled list once into train(+val) and test ...
n_train_val = int(len(imagePaths) * config.TRAIN_SPLIT)
trainPaths, testPaths = imagePaths[:n_train_val], imagePaths[n_train_val:]

# ... then carve the validation set off the front of the training portion
# (with TRAIN_SPLIT = 0.8 and VAL_SPLIT = 0.1 that is roughly 72% / 8% / 20%
# of the data for train / validation / test)
n_val = int(len(trainPaths) * config.VAL_SPLIT)
valPaths, trainPaths = trainPaths[:n_val], trainPaths[n_val:]

In [None]:
# one (split name, image paths, output directory) tuple per dataset to build
datasets = list(zip(
    ('training', 'validation', 'testing'),
    (trainPaths, valPaths, testPaths),
    (config.TRAIN_PATH, config.VAL_PATH, config.TEST_PATH),
))

In [None]:
# Materialise every split on disk as <baseOutput>/<label>/<filename>.
# The per-split list is bound to `splitPaths` (not `imagePaths`) so this loop does
# not overwrite the notebook-level `imagePaths`; otherwise re-running the split
# cells after this one would operate on the last split's paths only.
for (dType, splitPaths, baseOutput) in datasets:

    # show which data split we are creating
    print("[INFO] building '{}' split".format(dType))

    # if the output base output directory does not exist, create it
    if not os.path.exists(baseOutput):
        print("[INFO] 'creating {}' directory".format(baseOutput))
        os.makedirs(baseOutput)

    # loop over the input image paths of this split
    for inputPath in splitPaths:

        # the last path component is the filename, the one before it is the class label
        pathParts = inputPath.split(os.path.sep)
        filename = pathParts[-1]
        label = pathParts[-2]

        # build the path to the label directory
        labelPath = os.path.join(baseOutput, label)

        # if the label output directory does not exist, create it
        if not os.path.exists(labelPath):
            print("[INFO] 'creating {}' directory".format(labelPath))
            os.makedirs(labelPath)

        # construct the path to the destination image and then copy the image itself
        p = os.path.join(labelPath, filename)
        shutil.copy2(inputPath, p)

In [None]:
# count the images found under the training, validation and testing directories
totalTrain, totalVal, totalTest = (
    sum(1 for _ in paths.list_images(split_dir))
    for split_dir in (config.TRAIN_PATH, config.VAL_PATH, config.TEST_PATH)
)

# one count per line: train, validation, test
print(totalTrain, totalVal, totalTest, sep="\n")