In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [2]:
##################################
# Path Variables
bucket='sagemaker-cifar10-2020'
folder = 'cifar-10-batches-py'

##################################
# Data Related Variables
num_file_train = 5
num_images_per_file = 10000
num_images_train = 50000
num_images_test = 10000
num_image_dim = 3072
##################################

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

def load_class_names():
    key = 'batches.meta'
    data_location = './cifar-10-batches-py/{}'.format(key)
    class_names = unpickle(data_location).get(b'label_names')
    names = [x.decode('utf-8') for x in class_names]
    return names

# Read training data in from S3
def load_training():
    images = np.zeros(shape=[num_images_train, num_image_dim], dtype=float)
    cls = np.zeros(shape=[num_images_train], dtype=int)
    
    ind = 0
    
    for i in range(1, num_file_train + 1):
        data_key = 'data_batch_' + str(i)
        data_location = './cifar-10-batches-py/{}'.format(data_key)
        data = unpickle(data_location)
        num_images = data.get(b'data').shape[0]
        start = ind
        end = ind + num_images
        images[start:end, :] = data.get(b'data') / 255.0 # conver to 0-1 representation
        cls[start:end] = data.get(b'labels')
        
        ind += num_images
        
    return images, cls

# Read testing data in from S3
def load_testing():
    data_key = 'test_batch'
    data_location = './cifar-10-batches-py/{}'.format(data_key)
    data = unpickle(data_location)
    images = data.get(b'data') / 255.0 # conver to 0-1 representation
    cls = np.array(data.get(b'labels'))
    
    return images, cls

# General purpose image plotting
def plot_image(image):
    image_reshape = image.reshape(3, 32, 32).transpose(1,2,0)
    plt.imshow(image_reshape)

# Plot 5 random chosen images
def plot_random_images(images):
    num = 5
    fig, axes1 = plt.subplots(num, num, figsize=(8, 8))
    for j in range(num):
        for k in range(num):
            i = np.random.choice(range(images.shape[0]))
            image = images[i]
            image_reshape = image.reshape(3, 32, 32).transpose(1,2,0)
            axes1[j][k].set_axis_off()
            axes1[j][k].imshow(image_reshape)

In [17]:
train_images, train_class = load_training()
test_images, test_class = load_testing()

# pd.DataFrame(train_images).to_csv('train_images.csv', index=False, header=False)
# pd.DataFrame(train_class).to_csv('train_class.csv', index=False, header=False)

# pd.DataFrame(test_images).to_csv('test_images.csv', index=False, header=False)
# pd.DataFrame(test_class).to_csv('test_class.csv', index=False, header=False)

In [18]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

_, X_train, _, y_train = train_test_split(train_images, train_class, test_size=0.1, random_state=42, stratify=train_class)
del _, train_images, train_class, test_images, test_class


In [5]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import xgboost as xgb


clf_xgb = xgb.XGBClassifier(learning_rate =0.1,
                     n_estimators=1000,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='multi:softmax',
                     num_class=10,
                     seed=27)

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [19]:
pca = PCA(n_components=10)
X = pca.fit_transform(X_train)

X = pd.DataFrame(X)
y = pd.DataFrame(y_train)

        
xgtrain = xgb.DMatrix(X.values, y.values)

In [42]:
params = {
    # Parameters that we are going to tune.
    'max_depth': 3,
    'min_child_weight': 1,
    'eta':.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # Other parameters
    'objective':'multi:softmax',
    'num_class': 10,
    'nthread': -1,
    'seed' : 42,
    'eval_metric': ['merror', 'auc']
}

evallist = [(xgtrain, 'train')]
# Maximum number of levels in tree
max_depth = [3, 5, 7, 10]
# Minimum number of samples required to split a node
min_child_weight = [1, 3, 5]

num_boost_round = 1000

model = xgb.cv(
    params,
    xgtrain,
    num_boost_round=num_boost_round,
    nfold=2,
    seed=42,
    metrics={'merror'},
    early_stopping_rounds=2
)


In [45]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(3,4)
    for min_child_weight in range(1,2)
]

In [46]:
min_merror = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        xgtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_merror = cv_results['test-merror-mean'].min()
    boost_rounds = cv_results['test-merror-mean'].values.argmin()
    print("\tMERROR {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, MERROR: {}".format(best_params[0], best_params[1], min_merror))

CV with max_depth=3, min_child_weight=1
	MERROR 0.6674000000000001 for 19 rounds
Best params: 3, 1, MERROR: 0.6674000000000001


The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.


In [50]:
cv_results.predict(xgtrain)

AttributeError: 'DataFrame' object has no attribute 'predict'