============================================================================================

# Prepare Color Histogram & Sizes Data For KNN


============================================================================================

In [2]:
import warnings

# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [39]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

from subprocess import check_output
# List the contents of the data directory with the standard library rather
# than shelling out to `ls`, so the cell does not depend on an external binary.
# Dot-files are skipped to mirror what plain `ls` shows.
import os

print("\n".join(sorted(name for name in os.listdir("data") if not name.startswith("."))))


athenaeum_authors_preview.csv
athenaeum_painting_filtered.csv
athenaeum_paintings.csv
athenaeum_paintings_sizes.csv
color_hist_kmeans_206552.csv
color_histograms.csv
color_hist_size_206552.csv
complete_data.csv
extra_tree_com.csv
grad_boost_com.csv
images
images_athenaeum
images_sizes_2325.csv
nbc_com.csv
net_predicted.csv
painter_info_clean.csv
painting_info_clean.csv
pca_transformed_test_20.csv
pca_transformed_train_20.csv
resized_200
rf_com.csv
test_author200.csv
test_data.csv
test_hist_author_knn.csv
test_hist_author_rf.csv
train_author200.csv
train_data.csv
train_hist_author_knn.csv
train_hist_author_rf.csv
xgb_com.csv



In [3]:
# Load the author200 train/test splits and the color-histogram table.
train_author200 = pd.read_csv('data/train_author200.csv')
test_author200 = pd.read_csv('data/test_author200.csv')
color_hist = pd.read_csv('data/color_histograms.csv')

# Report the (rows, columns) shape of each frame.
print("[INFO] The size of test data: " + str(test_author200.shape))
print("[INFO] The size of train data: " + str(train_author200.shape))
print("[INFO] The size of color histogram: " + str(color_hist.shape))

# Preview the first three training rows.
train_author200.head(n=3)

[INFO] The size of test data: (12473, 12)
[INFO] The size of train data: (49890, 12)
[INFO] The size of color histogram: (206550, 32)


Unnamed: 0,medium,painting_location,height,article_type,painting_url,painting_dates,painting_title,author_id,painting_id,width,height_px,width_px
0,oil on canvas,Munch-museet (Norway - Oslo),90.0,Painting,http://www.the-athenaeum.org/art/display_image...,1907-1908,Mason and Mechanic,1793,52573,69.5,722,534
1,oil on canvas,"Royal Botanic Gardens, Kew (United Kingdom - ...",107.0,Painting,http://www.the-athenaeum.org/art/display_image...,circa 1880,A South African Sedge,9266,207890,45.0,944,384
2,oil on canvas,Watts Gallery - Compton (Surrey) (United King...,137.0,Painting,http://www.the-athenaeum.org/art/display_image...,1849-1850,Under the Dry Arch,502,214715,101.5,944,677


In [4]:
# Keep only the join key (painting_id) and the pixel dimensions of each split.
test_subset = test_author200[['painting_id', 'height_px', 'width_px']]
train_subset = train_author200[['painting_id', 'height_px', 'width_px']]

train_subset.head(n=2)

Unnamed: 0,painting_id,height_px,width_px
0,52573,722,534
1,207890,944,384


In [5]:
# Right join on painting_id: every row of train_subset is kept and gains the
# histogram columns of the matching painting.
train_hist = pd.merge(color_hist, train_subset, how='right', on='painting_id')

print("[INFO] The size of train histogram for KNN" + str(train_hist.shape))
train_hist.head(n=2)

[INFO] The size of train histogram for KNN(49890, 34)


Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_23,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px
0,444,11653,141479,107241,3576,1031,1010,3886,568,2650,...,144006,99116,46843,25371,63071,67621,69665,480656,742,952
1,444,12097,1429,713,600,581,1362,3304,8280,16148,...,174675,107953,51343,77403,128000,147619,172075,150541,941,718


In [6]:
# Right join on painting_id: every row of test_subset is kept and gains the
# histogram columns of the matching painting.
test_hist = pd.merge(color_hist, test_subset, how='right', on='painting_id')

print("[INFO] The size of test histogram for KNN" + str(test_hist.shape))
test_hist.head(n=2)

[INFO] The size of test histogram for KNN(12473, 34)


Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_23,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px
0,444,12077,7603,1687,587,348,403,538,671,959,...,99139,69539,24143,46874,97427,117568,102332,79799,555,800
1,444,87820,18868,4502,1403,696,672,763,857,1430,...,352276,123167,17879,17396,139326,225701,318966,413491,871,1280


In [9]:
# Add an aspect-ratio feature: pixel height divided by pixel width.
test_hist['height_width_ratio'] = test_hist['height_px'].div(test_hist['width_px'])
train_hist['height_width_ratio'] = train_hist['height_px'].div(train_hist['width_px'])

In [11]:
# Preview the first two test rows to check the new height_width_ratio column.
test_hist.head(2)

Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px,height_width_ratio
0,444,12077,7603,1687,587,348,403,538,671,959,...,69539,24143,46874,97427,117568,102332,79799,555,800,0.69375
1,444,87820,18868,4502,1403,696,672,763,857,1430,...,123167,17879,17396,139326,225701,318966,413491,871,1280,0.680469


In [42]:
# Write the Random Forest color-histogram datasets (row index is not saved).
train_hist.to_csv(path_or_buf="data/train_hist_author_rf.csv", index=False)
test_hist.to_csv(path_or_buf="data/test_hist_author_rf.csv", index=False)

In [36]:
# Scale height_px and width_px for KNN by dividing by the TRAINING maxima.
#
# Columns are addressed by name: once height_width_ratio has been appended,
# the positional iloc[:, -2] / iloc[:, -1] refer to width_px and
# height_width_ratio rather than height_px and width_px.
#
# The maxima are captured before any column is overwritten, so the test set is
# divided by the original training maxima and not by already-scaled values.
max_height_px = train_hist['height_px'].max()
max_width_px = train_hist['width_px'].max()

train_hist['height_px'] = train_hist['height_px'] / max_height_px
train_hist['width_px'] = train_hist['width_px'] / max_width_px

# The test split reuses the training maxima so both splits share one scale.
test_hist['height_px'] = test_hist['height_px'] / max_height_px
test_hist['width_px'] = test_hist['width_px'] / max_width_px


In [12]:
# Write the KNN color-histogram datasets (row index is not saved).
train_hist.to_csv(path_or_buf="data/train_hist_author_knn.csv", index=False)
test_hist.to_csv(path_or_buf="data/test_hist_author_knn.csv", index=False)

============================================================================================

# Prepare Data For Random Forest

============================================================================================

In [29]:
# Reload the author200 train/test splits and the color-histogram table so this
# section starts from the files on disk.
train_author200 = pd.read_csv('data/train_author200.csv')
test_author200 = pd.read_csv('data/test_author200.csv')
color_hist = pd.read_csv('data/color_histograms.csv')

# Report the (rows, columns) shape of each frame.
print("[INFO] The size of test data: " + str(test_author200.shape))
print("[INFO] The size of train data: " + str(train_author200.shape))
print("[INFO] The size of color histogram: " + str(color_hist.shape))

# Preview the first three training rows.
train_author200.head(n=3)

[INFO] The size of test data: (12473, 12)
[INFO] The size of train data: (49890, 12)
[INFO] The size of color histogram: (206550, 32)


Unnamed: 0,medium,painting_location,height,article_type,painting_url,painting_dates,painting_title,author_id,painting_id,width,height_px,width_px
0,oil on canvas,Munch-museet (Norway - Oslo),90.0,Painting,http://www.the-athenaeum.org/art/display_image...,1907-1908,Mason and Mechanic,1793,52573,69.5,722,534
1,oil on canvas,"Royal Botanic Gardens, Kew (United Kingdom - ...",107.0,Painting,http://www.the-athenaeum.org/art/display_image...,circa 1880,A South African Sedge,9266,207890,45.0,944,384
2,oil on canvas,Watts Gallery - Compton (Surrey) (United King...,137.0,Painting,http://www.the-athenaeum.org/art/display_image...,1849-1850,Under the Dry Arch,502,214715,101.5,944,677


In [30]:
# Training split: keep the join key and pixel sizes, then right-join the
# histogram columns onto them by painting_id.
train_subset = train_author200[['painting_id', 'height_px', 'width_px']]
train_hist = pd.merge(color_hist, train_subset, how='right', on='painting_id')

# Test split: same two steps.
test_subset = test_author200[['painting_id', 'height_px', 'width_px']]
test_hist = pd.merge(color_hist, test_subset, how='right', on='painting_id')


In [None]:
# Write this section's (unscaled) frames under the Random Forest file names.
# Writing them to the *_knn.csv paths would overwrite the scaled KNN datasets
# produced in the KNN section.
# NOTE(review): these frames carry no height_width_ratio column, unlike the
# *_rf.csv files written in the KNN section — confirm which variant the
# Random Forest model should read.
train_hist.to_csv("data/train_hist_author_rf.csv", index=False)
test_hist.to_csv("data/test_hist_author_rf.csv", index=False)

In [31]:
# Preview the first row of the rebuilt test frame.
test_hist.head(1)

Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_23,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px
0,444,12077,7603,1687,587,348,403,538,671,959,...,99139,69539,24143,46874,97427,117568,102332,79799,555,800


============================================================================================

# Prepare Data For KMeans

============================================================================================

In [4]:
# Load the image-size table and the color-histogram table.
img_sizes = pd.read_csv('data/athenaeum_paintings_sizes.csv')
color_hist = pd.read_csv('data/color_histograms.csv')

# Report the (rows, columns) shape of each frame.
print("[INFO] The size of color histogram: " + str(color_hist.shape))
print("[INFO] The size of image sizes file: " + str(img_sizes.shape))


[INFO] The size of color histogram: (206550, 32)
[INFO] The size of image sizes file: (207245, 12)


In [14]:
# List the column names of the image-sizes frame.
img_sizes.columns

Index([u'medium', u'painting_location', u'height', u'article_type',
       u'painting_url', u'painting_dates', u'painting_title', u'author_id',
       u'painting_id', u'width', u'height_px', u'width_px'],
      dtype='object')

In [21]:
# Keep only the join key (painting_id) and the pixel dimensions.
sub_sizes = img_sizes[['painting_id', 'height_px', 'width_px']]

sub_sizes.head(n=1)

Unnamed: 0,painting_id,height_px,width_px
0,104839,842.0,1280.0


In [25]:
# Inner join on painting_id: keep only paintings present in both tables.
# NOTE(review): the recorded output shows 206552 rows coming out of a
# 206550-row histogram table, so some painting_id values presumably occur more
# than once in the sizes file — confirm whether those duplicates are intended.
color_hist_size_206552 = pd.merge(color_hist, sub_sizes, how='inner', on='painting_id')

color_hist_size_206552.shape

(206552, 34)

In [28]:
# Add an aspect-ratio feature: pixel height divided by pixel width.
color_hist_size_206552['height_width_ratio'] = (
    color_hist_size_206552['height_px'].div(color_hist_size_206552['width_px'])
)

In [29]:
# Preview the first merged row, including the new height_width_ratio column.
color_hist_size_206552.head(1)

Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px,height_width_ratio
0,444,12077,7603,1687,587,348,403,538,671,959,...,69539,24143,46874,97427,117568,102332,79799,555.0,800.0,0.69375


In [30]:
# Save the merged histogram + size table (row index is not saved).
color_hist_size_206552.to_csv(path_or_buf='data/color_hist_size_206552.csv', index=False)

============================================================================================

# Prepare Data for the 3rd Approach: PCA + KMeans + NN

============================================================================================

In [32]:
# Load the author200 train/test splits and the histogram table that already
# carries a kmeans_labels column.
train_author200 = pd.read_csv('data/train_author200.csv')
test_author200 = pd.read_csv('data/test_author200.csv')
color_hist_kmeans = pd.read_csv('data/color_hist_kmeans_206552.csv')

# Report the (rows, columns) shape of each frame.
print("[INFO] The size of test data: " + str(test_author200.shape))
print("[INFO] The size of train data: " + str(train_author200.shape))
print("[INFO] The size of color histogram: " + str(color_hist_kmeans.shape))

# Preview the first three histogram rows.
color_hist_kmeans.head(n=3)

[INFO] The size of test data: (12473, 12)
[INFO] The size of train data: (49890, 12)
[INFO] The size of color histogram: (206552, 36)


Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px,height_width_ratio,kmeans_labels
0,444,12077,7603,1687,587,348,403,538,671,959,...,24143,46874,97427,117568,102332,79799,555.0,800.0,0.69375,6
1,444,11653,141479,107241,3576,1031,1010,3886,568,2650,...,46843,25371,63071,67621,69665,480656,742.0,952.0,0.779412,0
2,444,12097,1429,713,600,581,1362,3304,8280,16148,...,51343,77403,128000,147619,172075,150541,941.0,718.0,1.310585,6


In [86]:
# One-column frame holding the painting ids of the training split.
train_sub = train_author200['painting_id'].to_frame()

# Inner join: keep the histogram rows whose painting_id is in the training split.
train_hist = pd.merge(color_hist_kmeans, train_sub, how='inner', on='painting_id')

print("[INFO] The size of 3rd train set " + str(train_hist.shape))
train_hist.head(n=2)

[INFO] The size of 3rd train set (49890, 36)


Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px,height_width_ratio,kmeans_labels
0,444,11653,141479,107241,3576,1031,1010,3886,568,2650,...,46843,25371,63071,67621,69665,480656,742.0,952.0,0.779412,0
1,444,12097,1429,713,600,581,1362,3304,8280,16148,...,51343,77403,128000,147619,172075,150541,941.0,718.0,1.310585,6


In [87]:
# One-column frame holding the painting ids of the test split.
test_sub = test_author200['painting_id'].to_frame()

# Inner join: keep the histogram rows whose painting_id is in the test split.
test_hist = pd.merge(color_hist_kmeans, test_sub, how='inner', on='painting_id')

print("[INFO] The size of 3rd test set " + str(test_hist.shape))
test_hist.head(n=2)

[INFO] The size of 3rd test set (12473, 36)


Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_px,width_px,height_width_ratio,kmeans_labels
0,444,12077,7603,1687,587,348,403,538,671,959,...,24143,46874,97427,117568,102332,79799,555.0,800.0,0.69375,6
1,444,87820,18868,4502,1403,696,672,763,857,1430,...,17879,17396,139326,225701,318966,413491,871.0,1280.0,0.680469,0


In [88]:
# Remove the raw pixel sizes and the cluster label from both splits; the id
# columns, histogram bins and height_width_ratio stay.
test_hist = test_hist.drop(labels=['height_px', 'width_px', 'kmeans_labels'], axis=1)
train_hist = train_hist.drop(labels=['height_px', 'width_px', 'kmeans_labels'], axis=1)

test_hist.head(n=1)

Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_22,hist_23,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_width_ratio
0,444,12077,7603,1687,587,348,403,538,671,959,...,83566,99139,69539,24143,46874,97427,117568,102332,79799,0.69375


In [100]:
## Get 20 principal components
# The PCA is fitted on the training split only. Columns from position 2 onward
# are used, which skips the two leading id columns (author_id, painting_id).
pca = PCA(n_components=20).fit(train_hist.iloc[:, 2:])

# Project both splits with the components learned from the training split.
pca_transformed_test = pd.DataFrame(pca.transform(test_hist.iloc[:, 2:]))
pca_transformed_train = pd.DataFrame(pca.transform(train_hist.iloc[:, 2:]))

pca_transformed_test.head(n=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-386277.953668,-45186.81012,-26079.142204,-38646.498522,35962.104708,-7930.620972,-64150.202933,-21410.705819,-21971.626999,-20419.46059,17066.833942,7246.74219,1269.983856,39346.493735,-2047.62402,-588.127104,14072.253562,2401.983214,17298.894825,-1009.90643


In [101]:
# Build one zero-padded column name per principal component: pca_00, pca_01, ...
# The count comes from pca_transformed_train; the previously referenced
# `pca_transformed` is not defined anywhere in this notebook and raises
# NameError on a fresh kernel.
columns_name = ["pca_%02d" % i for i in range(pca_transformed_train.shape[1])]
len(columns_name)

20

In [105]:
# Give both projected frames the same pca_NN column names.
pca_transformed_test.columns = columns_name
pca_transformed_train.columns = columns_name

pca_transformed_test.head(n=1)

Unnamed: 0,pca_00,pca_01,pca_02,pca_03,pca_04,pca_05,pca_06,pca_07,pca_08,pca_09,pca_10,pca_11,pca_12,pca_13,pca_14,pca_15,pca_16,pca_17,pca_18,pca_19
0,-386277.953668,-45186.81012,-26079.142204,-38646.498522,35962.104708,-7930.620972,-64150.202933,-21410.705819,-21971.626999,-20419.46059,17066.833942,7246.74219,1269.983856,39346.493735,-2047.62402,-588.127104,14072.253562,2401.983214,17298.894825,-1009.90643


In [106]:
# Attach each painting's own k-means label to its PCA row.
#
# Assigning color_hist_kmeans['kmeans_labels'] directly aligns on the row
# index, so row i of the train/test frame would receive the label of row i of
# the full table, which is generally a different painting. The labels are
# therefore looked up by painting_id instead.
labels_by_painting = (
    color_hist_kmeans
    .drop_duplicates(subset='painting_id')  # map() needs a unique lookup index
    .set_index('painting_id')['kmeans_labels']
)

# The PCA frames were produced from train_hist / test_hist row by row, so the
# looked-up labels are assigned positionally via .values.
pca_transformed_train['kmeans_labels'] = train_hist['painting_id'].map(labels_by_painting).values
pca_transformed_test['kmeans_labels'] = test_hist['painting_id'].map(labels_by_painting).values
pca_transformed_test.head(1)

Unnamed: 0,pca_00,pca_01,pca_02,pca_03,pca_04,pca_05,pca_06,pca_07,pca_08,pca_09,...,pca_11,pca_12,pca_13,pca_14,pca_15,pca_16,pca_17,pca_18,pca_19,kmeans_labels
0,-386277.953668,-45186.81012,-26079.142204,-38646.498522,35962.104708,-7930.620972,-64150.202933,-21410.705819,-21971.626999,-20419.46059,...,7246.74219,1269.983856,39346.493735,-2047.62402,-588.127104,14072.253562,2401.983214,17298.894825,-1009.90643,6


In [111]:
# Preview the first row of train_hist before it is concatenated with the PCA columns.
train_hist.head(1)

Unnamed: 0,author_id,painting_id,hist_01,hist_02,hist_03,hist_04,hist_05,hist_06,hist_07,hist_08,...,hist_22,hist_23,hist_24,hist_25,hist_26,hist_27,hist_28,hist_29,hist_30,height_width_ratio
0,444,11653,141479,107241,3576,1031,1010,3886,568,2650,...,117472,144006,99116,46843,25371,63071,67621,69665,480656,0.779412


In [114]:
# Place the PCA/label columns next to the training features, column-wise.
# The index of train_hist is reset so it is a fresh 0..n-1 range for the concat.
pca_kmean_train = pd.concat(
    [train_hist.reset_index(drop=True), pca_transformed_train],
    axis=1
)

print(pca_kmean_train.shape)
pca_kmean_train.columns

(49890, 54)


Index([u'author_id', u'painting_id', u'hist_01', u'hist_02', u'hist_03',
       u'hist_04', u'hist_05', u'hist_06', u'hist_07', u'hist_08', u'hist_09',
       u'hist_10', u'hist_11', u'hist_12', u'hist_13', u'hist_14', u'hist_15',
       u'hist_16', u'hist_17', u'hist_18', u'hist_19', u'hist_20', u'hist_21',
       u'hist_22', u'hist_23', u'hist_24', u'hist_25', u'hist_26', u'hist_27',
       u'hist_28', u'hist_29', u'hist_30', u'height_width_ratio', u'pca_00',
       u'pca_01', u'pca_02', u'pca_03', u'pca_04', u'pca_05', u'pca_06',
       u'pca_07', u'pca_08', u'pca_09', u'pca_10', u'pca_11', u'pca_12',
       u'pca_13', u'pca_14', u'pca_15', u'pca_16', u'pca_17', u'pca_18',
       u'pca_19', u'kmeans_labels'],
      dtype='object')

In [115]:
# Place the PCA/label columns next to the test features, column-wise.
# The index of test_hist is reset so it is a fresh 0..n-1 range for the concat.
pca_kmean_test = pd.concat(
    [test_hist.reset_index(drop=True), pca_transformed_test],
    axis=1
)

print(pca_kmean_test.shape)
pca_kmean_test.columns

(12473, 54)


Index([u'author_id', u'painting_id', u'hist_01', u'hist_02', u'hist_03',
       u'hist_04', u'hist_05', u'hist_06', u'hist_07', u'hist_08', u'hist_09',
       u'hist_10', u'hist_11', u'hist_12', u'hist_13', u'hist_14', u'hist_15',
       u'hist_16', u'hist_17', u'hist_18', u'hist_19', u'hist_20', u'hist_21',
       u'hist_22', u'hist_23', u'hist_24', u'hist_25', u'hist_26', u'hist_27',
       u'hist_28', u'hist_29', u'hist_30', u'height_width_ratio', u'pca_00',
       u'pca_01', u'pca_02', u'pca_03', u'pca_04', u'pca_05', u'pca_06',
       u'pca_07', u'pca_08', u'pca_09', u'pca_10', u'pca_11', u'pca_12',
       u'pca_13', u'pca_14', u'pca_15', u'pca_16', u'pca_17', u'pca_18',
       u'pca_19', u'kmeans_labels'],
      dtype='object')

In [116]:
# Save the combined feature tables (row index is not saved).
pca_kmean_train.to_csv('data/pca20_kmeans_train.csv', index=False)
pca_kmean_test.to_csv('data/pca20_kmeans_test.csv', index=False)


In [117]:
# One row per model: [model_name, score with color_hist features, score with pca features].
random_forest = ['random_forest', 0.686449060336, 0.668644906034]
xgboost = ['xgboost', 0.79440870856, 0.71315529179]
naive_bayes = ['naive_bayes', 0.59940652819, 0.623145400593]
extra_tree = ['extra_tree', 0.673590504451, 0.660731948566]
# No pca score is recorded for knn, so that cell is stored as NaN. This keeps
# the row the same length as the others.
knn = ['knn', 0.705838693716, np.nan]

# knn is included so its color_hist score is not silently left out of the table.
model_accuracy = pd.DataFrame(
    [random_forest, xgboost, naive_bayes, extra_tree, knn],
    columns=['model_name', 'color_hist', 'pca']
)

In [118]:
# Save the model-accuracy summary table (row index is not saved).
model_accuracy.to_csv(path_or_buf='data/model_accuracy.csv', index=False)