In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import time

In [2]:
artworks = pd.read_csv('https://media.githubusercontent.com/media/MuseumofModernArt/collection/master/Artworks.csv')
artworks.columns

Index(['Title', 'Artist', 'ConstituentID', 'ArtistBio', 'Nationality',
       'BeginDate', 'EndDate', 'Gender', 'Date', 'Medium', 'Dimensions',
       'CreditLine', 'AccessionNumber', 'Classification', 'Department',
       'DateAcquired', 'Cataloged', 'ObjectID', 'URL', 'ThumbnailURL',
       'Circumference (cm)', 'Depth (cm)', 'Diameter (cm)', 'Height (cm)',
       'Length (cm)', 'Weight (kg)', 'Width (cm)', 'Seat Height (cm)',
       'Duration (sec.)'],
      dtype='object')

In [3]:
# Select Columns.
artworks = artworks[['Artist', 'Nationality', 'Gender', 'Date', 'Department',
                    'DateAcquired', 'URL', 'ThumbnailURL', 'Height (cm)', 'Width (cm)']]

# Convert URL's to booleans.
artworks['URL'] = artworks['URL'].notnull()
artworks['ThumbnailURL'] = artworks['ThumbnailURL'].notnull()

# Drop films and some other tricky rows.
artworks = artworks[artworks['Department']!='Film']
artworks = artworks[artworks['Department']!='Media and Performance Art']
artworks = artworks[artworks['Department']!='Fluxus Collection']

# Drop missing data.
artworks = artworks.dropna()

In [4]:
artworks.head()

Unnamed: 0,Artist,Nationality,Gender,Date,Department,DateAcquired,URL,ThumbnailURL,Height (cm),Width (cm)
0,Otto Wagner,(Austrian),(Male),1896,Architecture & Design,1996-04-09,True,True,48.6,168.9
1,Christian de Portzamparc,(French),(Male),1987,Architecture & Design,1995-01-17,True,True,40.6401,29.8451
2,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,34.3,31.8
3,Bernard Tschumi,(),(Male),1980,Architecture & Design,1995-01-17,True,True,50.8,50.8
4,Emil Hoppe,(Austrian),(Male),1903,Architecture & Design,1997-01-15,True,True,38.4,19.1


In [5]:
artworks.shape

(108413, 10)

In [6]:
artworks.dtypes

Artist           object
Nationality      object
Gender           object
Date             object
Department       object
DateAcquired     object
URL                bool
ThumbnailURL       bool
Height (cm)     float64
Width (cm)      float64
dtype: object

In [7]:
artworks['DateAcquired'] = pd.to_datetime(artworks.DateAcquired)
artworks['YearAcquired'] = artworks.DateAcquired.dt.year
artworks['YearAcquired'].dtype

dtype('int64')

In [8]:
# Remove multiple nationalities, genders, and artists.
artworks.loc[artworks['Gender'].str.contains('\) \('), 'Gender'] = '\(multiple_persons\)'
artworks.loc[artworks['Nationality'].str.contains('\) \('), 'Nationality'] = '\(multiple_nationalities\)'
artworks.loc[artworks['Artist'].str.contains(','), 'Artist'] = 'Multiple_Artists'

# Convert dates to start date, cutting down number of distinct examples.
artworks['Date'] = pd.Series(artworks.Date.str.extract(
    '([0-9]{4})', expand=False))[:-1]

# Final column drops and NA drop.
X = artworks.drop(['Department', 'DateAcquired', 'Artist', 'Nationality', 'Date'], 1)

# Create dummies separately.
artists = pd.get_dummies(artworks.Artist)
nationalities = pd.get_dummies(artworks.Nationality)
dates = pd.get_dummies(artworks.Date)

# Concat with other variables, but artists slows this wayyyyy down so we'll keep it out for now
X = pd.get_dummies(X, sparse=True)
X = pd.concat([X, nationalities, dates], axis=1)

Y = artworks.Department

In [9]:
from sklearn.neural_network import MLPClassifier

## Takes like 20 mins
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [10]:
mlp.score(X, Y)

0.6706114580354755

In [11]:
Y.value_counts() / len(Y)

Drawings & Prints        0.622536
Photography              0.226449
Architecture & Design    0.112957
Painting & Sculpture     0.033686
Media and Performance    0.004372
Name: Department, dtype: float64

In [13]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)



array([0.71298132, 0.75137204, 0.65160041, 0.68785167, 0.56510309])

In [15]:
## I interrupted some of the folds so presumably it just output what it had so far.

## Drill - playing with layers

In [20]:
## Reduce dataset size because it takes way too long

from sklearn.model_selection import train_test_split

x1, x2, y1, y2 = train_test_split(X, Y, test_size=0.9)

In [23]:
x1.shape

(10841, 320)

In [25]:
## Check it is shorter

start_time = time.process_time()

mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(x1, y1)

print(time.process_time() - start_time)

82.24539200000072


In [26]:
mlp.score(x1, y1)

0.6171939857946684

In [36]:
ypred = mlp.predict(x1)
pd.crosstab(ypred, y1)

Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,1039,1964,6,174,552
Drawings & Prints,106,4199,16,32,557
Media and Performance,0,3,21,1,3
Painting & Sculpture,25,159,2,167,28
Photography,34,472,4,12,1265


In [31]:
start_time = time.process_time()

mlp_cv = cross_val_score(mlp, x1, y1, cv=5)

print(time.process_time() - start_time)
print(mlp_cv)

185.88758099999905
[0.67603687 0.43107423 0.69741697 0.45156827 0.54385965]


#### Changing hidden layer structures

In [32]:
start_time = time.process_time()

mlp2 = MLPClassifier(hidden_layer_sizes=(100, 10))
mlp2.fit(x1, y1)

print(time.process_time() - start_time)

38.11434100000042




In [33]:
print(mlp2.score(x1, y1))

0.621160409556314


In [34]:
ypred2 = mlp2.predict(x1)

In [35]:
pd.crosstab(ypred2, y1)

Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,871,935,10,286,447
Drawings & Prints,333,5862,38,100,1958
Media and Performance,0,0,1,0,0


1 layer of 1000 perceptrons seems to be slower but more accurate than 10 layers of 100 perceptrons.

In [37]:
## Try another change

start_time = time.process_time()

mlp3 = MLPClassifier(hidden_layer_sizes=(250, 4))
mlp3.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp3.score(x1, y1))

ypred3 = mlp3.predict(x1)
pd.crosstab(ypred3, y1)

31.66483700000026
0.6269716815791901


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Drawings & Prints,1204,6797,49,386,2405


In [41]:
## Try another change

start_time = time.process_time()

mlp4 = MLPClassifier(hidden_layer_sizes=(500, 2))
mlp4.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp4.score(x1, y1))

ypred4 = mlp4.predict(x1)
pd.crosstab(ypred4, y1)

50.038250000001426
0.6269716815791901


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Drawings & Prints,1204,6797,49,386,2405


In [42]:
## Try another change

start_time = time.process_time()

mlp5 = MLPClassifier(hidden_layer_sizes=(100, 5, 2))
mlp5.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp5.score(x1, y1))

ypred5 = mlp5.predict(x1)
pd.crosstab(ypred5, y1)

17.301148999998986
0.6269716815791901


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Drawings & Prints,1204,6797,49,386,2405


In [43]:
## Try another change

start_time = time.process_time()

mlp6 = MLPClassifier(hidden_layer_sizes=(100, 5, 5))
mlp6.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp6.score(x1, y1))

ypred6 = mlp6.predict(x1)
pd.crosstab(ypred6, y1)

37.614246000001
0.6818559173507979


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,2,0,0,0,0
Drawings & Prints,1176,6230,47,383,1245
Photography,26,567,2,3,1160


In [44]:
## Try another change

start_time = time.process_time()

mlp7 = MLPClassifier(hidden_layer_sizes=(1000, 5, 5))
mlp7.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp7.score(x1, y1))

ypred7 = mlp7.predict(x1)
pd.crosstab(ypred7, y1)

65.74068099999931
0.6269716815791901


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Drawings & Prints,1204,6797,49,386,2405


In [45]:
## Try another change

start_time = time.process_time()

mlp8 = MLPClassifier(hidden_layer_sizes=(500, ))
mlp8.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp8.score(x1, y1))

ypred8 = mlp8.predict(x1)
pd.crosstab(ypred8, y1)

18.59429199999795
0.5460750853242321


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,217,47,0,21,6
Drawings & Prints,237,3390,31,98,192
Media and Performance,0,0,1,0,0
Painting & Sculpture,30,38,2,117,12
Photography,720,3322,15,150,2195


In [46]:
## Try another change

start_time = time.process_time()

mlp9 = MLPClassifier(hidden_layer_sizes=(500, 20))
mlp9.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp9.score(x1, y1))

ypred9 = mlp9.predict(x1)
pd.crosstab(ypred9, y1)

75.0928079999976
0.6483719214094641


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,462,231,5,222,77
Drawings & Prints,742,6566,43,164,2328
Media and Performance,0,0,1,0,0


To conclude, the number of neurons in the first layer appears to improve performance much more than adding another layer.

#### Look at alpha

In [50]:
## Starting place

## Try another change

start_time = time.process_time()

mlp8 = MLPClassifier(random_state=43, hidden_layer_sizes=(500, ))
mlp8.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp8.score(x1, y1))

ypred8 = mlp8.predict(x1)
pd.crosstab(ypred8, y1)

11.946041000002879
0.6801955539156904


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,192,29,0,12,15
Drawings & Prints,980,6737,46,326,1990
Media and Performance,0,0,1,0,0
Painting & Sculpture,12,8,1,47,3
Photography,20,23,1,1,397


In [49]:
## Change alpha from 0.0001 (default) to 0.00001

start_time = time.process_time()

mlp8a = MLPClassifier(random_state=43, hidden_layer_sizes=(500, ), alpha=0.00001)
mlp8a.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp8a.score(x1, y1))

ypred8a = mlp8a.predict(x1)
pd.crosstab(ypred8a, y1)

14.760985999997501
0.45899824739415185


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,630,659,3,49,75
Drawings & Prints,65,1921,23,21,75
Media and Performance,0,0,2,1,1
Painting & Sculpture,80,204,3,215,46
Photography,429,4013,18,100,2208


In [51]:
## Increase alpha

start_time = time.process_time()

mlp8b = MLPClassifier(random_state=43, hidden_layer_sizes=(500, ), alpha=0.001)
mlp8b.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp8b.score(x1, y1))

ypred8b = mlp8b.predict(x1)
pd.crosstab(ypred8b, y1)

16.756271000002016
0.7084217323125173


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,174,18,0,22,5
Drawings & Prints,824,6261,44,284,1206
Media and Performance,0,0,1,0,0
Painting & Sculpture,1,2,0,51,1
Photography,205,516,4,29,1193


In [53]:
## Increase alpha more

start_time = time.process_time()

mlp8c = MLPClassifier(random_state=43, hidden_layer_sizes=(500, ), alpha=0.005)
mlp8c.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp8c.score(x1, y1))

ypred8c = mlp8c.predict(x1)
pd.crosstab(ypred8c, y1)

9.943005999997695
0.6601789502813393


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,269,83,0,12,51
Drawings & Prints,911,6690,45,286,2237
Media and Performance,0,0,1,0,0
Painting & Sculpture,24,22,2,88,8
Photography,0,2,1,0,109


Changing alpha basically seems to move stuff from photography (low alpha) to drawing and prints (higher alpha).

#### Look at activation

In [54]:
## Switch to logistic

start_time = time.process_time()

mlp8c1 = MLPClassifier(random_state=43, hidden_layer_sizes=(500, ), 
                      alpha=0.005, activation='logistic')
mlp8c1.fit(x1, y1)

print(time.process_time() - start_time)

print(mlp8c1.score(x1, y1))

ypred8c1 = mlp8c1.predict(x1)
pd.crosstab(ypred8c1, y1)

115.09492700000192
0.7541739691910341


Department,Architecture & Design,Drawings & Prints,Media and Performance,Painting & Sculpture,Photography
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Architecture & Design,344,35,0,26,11
Drawings & Prints,667,6129,45,242,798
Media and Performance,0,0,1,0,0
Painting & Sculpture,22,28,1,111,5
Photography,171,605,2,7,1591


In [None]:
## This improved the model by about 5%.