In [None]:
#Imports 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sqlite3


import fuzzywuzzy # determine disstance berween words
from fuzzywuzzy import process

# kmeans for clustering
from sklearn import datasets 
from sklearn.cluster import KMeans

# plot charts
import matplotlib.pyplot as plt

# Read both museum-collection tables; the first path yields `artists`, the second `artworks`.
artists, artworks = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/museum-collection/artists.csv',
        '../input/museum-collection/artworks.csv',
    )
)

In [None]:
# =======================================================
# Data quality review
# =======================================================

In [None]:
# Step 1
# =======================================================
# Handling missing values

In [None]:
# 1.1 Look at a few random rows of each table to see whether there are missing values,
# which may be represented as NaN or None.
artists.sample(n=5)

In [None]:
# Random two-row preview of the artworks table.
artworks.sample(n=2)

In [None]:
# 1.2 Number of missing values in each column
# Artists
count_missing_values_artists = artists.isna().sum()
count_missing_values_artists

In [None]:
# Artworks: number of missing values in each column
count_missing_values_artworks = artworks.isna().sum()
count_missing_values_artworks

In [None]:
# 1.3
# That looks like a lot of missing values, so express them as a percentage of all cells.
# np.prod is used because the np.product alias is deprecated and was removed in NumPy 2.0.
total_cells = np.prod(artists.shape)  # total cells (rows * columns)
total_missing = count_missing_values_artists.sum()  # total cells without values

# percent of data that is missing
(total_missing / total_cells) * 100

In [None]:
# The same for Artworks: share of cells without a value, in percent.
# np.prod is used because the np.product alias is deprecated and was removed in NumPy 2.0.
total_cells = np.prod(artworks.shape)  # total cells (rows * columns)
total_missing = count_missing_values_artworks.sum()  # total cells without values
(total_missing / total_cells) * 100

In [None]:
# We have almost a quarter of missing values. Quite a lot. Let's continue to explore...

In [None]:
# Step 2 Convert values
# =======================================================
# Before checking for duplicates it is good practice to normalise the text columns:
# use one letter case and trim surrounding white space (characters such as "," or "."
# could be deleted or replaced in the same way).

for text_column in ('Name', 'Nationality', 'Gender'):
    # convert to upper case first, then remove leading and trailing white space
    artists[text_column] = artists[text_column].str.upper().str.strip()

In [None]:
# Step 3 Check duplicate values
# =======================================================


# 3.1 Check the Artists table for duplicates of Artist ID and Name
# keep=False flags every occurrence of a repeated Artist ID, not only the later ones.
artist_id_is_repeated = artists['Artist ID'].duplicated(keep=False)
artists[artist_id_is_repeated]

In [None]:
# There are no duplicate Artist IDs, which is good.
# Next, check for duplicate Names.

In [None]:
# keep=False flags every row whose Name occurs more than once.
name_is_repeated = artists['Name'].duplicated(keep=False)
artists[name_is_repeated]
# We can see that 73 rows have a Name that appears at least twice in the table.
# But different people can share the same name, so let's explore these values.

In [None]:
# Look at the duplicated names sorted by Name, so that identical names sit next to each other.
# Sort first and only then limit the output to 60 rows: slicing before sorting would sort
# an arbitrary 60-row subset and could split groups of identical names.

artists[artists.duplicated(subset=['Name'], keep=False)].sort_values(by=['Name']).head(60)

# Cases:
# --- Some Names are "UNKNOWN DESIGNER" / "UNKNOWN ARTIST" / "UNKNOWN" without any other data.
#     I propose to delete these rows.

# --- In the case of "JOHANN LOETZ" we have to investigate whether it is one person or not.
# --- In the cases of "DANESE S.R.L., ITALY", "J.A. HENCKELS, SOLINGEN, GERMANY" and "ROBERT DAWSON"
#     we have to delete the rows where Artist ID is 9409, 10857 or 37602, and also update these IDs
#     in the Artworks table.
# --- The case of "CARL AUBÖCK" looks OK. Maybe they are father and son.


In [None]:
# Step 4 Check inconsistent data entry and replace values
# =======================================================

# 4.1 In the Nationality column, replace the placeholder "NATIONALITY UNKNOWN" with a real
# missing value (NaN). Assigning the string 'None' would create just another text value
# that isnull() does not report as missing.
artists.loc[artists['Nationality'] == 'NATIONALITY UNKNOWN', 'Nationality'] = np.nan

In [None]:
# 4.2 Check the values in the Name column

# As an example, there are two different names, 'AB GUSTAVSBERG FABRIKER, SWEDEN' and
# 'AB GUSTAVSBERG, SWEDEN', but they are the same company. There are probably many data entry
# errors of this kind in the column.
# Values such as "J.A. HENCKELS, SOLINGEN, GERMANY" should also be split into Name and Nationality.
# Fixing all of them is a fairly long job.

# The fuzzywuzzy package can help identify which strings are closest to each other.

# extract all distinct names; missing names are dropped so that only strings reach the matcher
names = artists['Name'].dropna().unique()

# get the top 5 closest matches to "AB GUSTAVSBERG, SWEDEN"
matches = fuzzywuzzy.process.extract("AB GUSTAVSBERG, SWEDEN", names, limit=5, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

# show the top 5
matches

In [None]:
# Step 5 Check data types
# =======================================================

# 5.1 Inspect the first values of "Birth Year" (and "Death Year" in the next cell)
print(artists['Birth Year'].head(n=5))

In [None]:
# Inspect the first values of "Death Year"
print(artists['Death Year'].head(n=5))

In [None]:
# Both fields have type float64 and are converted to datetime.
# A float year such as 1930.0 is rendered as the text "1930.0", which does not match "%Y",
# so with errors='coerce' every value would silently become NaT. Each year is therefore
# formatted as a four-digit string first; missing years stay missing and end up as NaT.
# The guard keeps the cell safe to re-run once the column is already datetime.
# NOTE(review): years outside the range supported by pandas timestamps are still coerced to NaT.
if not pd.api.types.is_datetime64_any_dtype(artists['Birth Year']):
    birth_year_text = pd.to_numeric(artists['Birth Year'], errors='coerce').map(
        lambda year: f"{int(year):04d}", na_action='ignore'
    )
    artists['Birth Year'] = pd.to_datetime(birth_year_text, format="%Y", errors='coerce')
# show the first few rows
artists['Birth Year'].head()

In [None]:
# Convert "Death Year" from float years to datetime.
# A float year such as 1990.0 is rendered as the text "1990.0", which does not match "%Y",
# so with errors='coerce' every value would silently become NaT. Each year is therefore
# formatted as a four-digit string first; missing years stay missing and end up as NaT.
# The guard keeps the cell safe to re-run once the column is already datetime.
# NOTE(review): years outside the range supported by pandas timestamps are still coerced to NaT.
if not pd.api.types.is_datetime64_any_dtype(artists['Death Year']):
    death_year_text = pd.to_numeric(artists['Death Year'], errors='coerce').map(
        lambda year: f"{int(year):04d}", na_action='ignore'
    )
    artists['Death Year'] = pd.to_datetime(death_year_text, format="%Y", errors='coerce')
# show the first few rows
artists['Death Year'].head()

In [None]:
# Take similar steps to check the data in the Artworks table:
# --- missing values,
# --- fix inconsistent data entry,
# --- duplicates,
# --- replace values,
# --- data types.
# It is also worth looking for
# --- any incorrect character encoding.

In [None]:
# Random two-row preview of the artworks table.
artworks.sample(n=2)

In [None]:
# =======================================================
# Clustering of artworks
# =======================================================


In [None]:
# The idea is to define new clusters based on the size of an artwork and the Birth Year of its artist.
# For example, to get groups like:
# "Lost Generation - low dimension pictures"
# "Generation X - low dimension pictures"
# "Generation Y - high dimension pictures"
# etc.

# I have chosen the K-means algorithm.

In [None]:
# prepare a data set for clustering
# - join Artworks and Artists
# The artists' names were upper-cased and stripped in Step 2, so the artworks' names are
# normalised the same way on a copy (the artworks table itself is left untouched);
# otherwise the join keys would not match.
# Names are not unique in artists (see Step 3), so only the first artist per name is kept on
# the right side; this keeps exactly one output row per artwork.
# NOTE(review): 'Artist ID' would be a more reliable join key if artworks carries it in a
# compatible form — confirm.

artworks_keyed = artworks.assign(Name=artworks['Name'].str.upper().str.strip())
artists_by_name = artists.drop_duplicates(subset=['Name'])
joint_df = pd.merge(artworks_keyed, artists_by_name, on='Name', how='left')

In [None]:
# - drop rows where Birth Year is missing (null, NaN, None)
# - drop missing values and outliers for Height (cm) and Width (cm)


In [None]:
# keep only the two size features used for clustering
joint_df = joint_df[['Height (cm)', 'Width (cm)']]

# Keep rows whose height and width both lie within [0.1, 400] cm; rows with a missing size
# fail the comparison and are dropped as well.
# between() includes both bounds by default. The boolean form `inclusive=True` is deprecated
# since pandas 1.3 and rejected by pandas 2.x, so the argument is omitted.
joint_df = joint_df[joint_df['Height (cm)'].between(0.1, 400)]
joint_df = joint_df[joint_df['Width (cm)'].between(0.1, 400)]
#joint_df['Birth Year'] = joint_df['Birth Year'].dt.year



In [None]:
# Display the table prepared for clustering
joint_df


In [None]:
#joint_df.dropna(inplace=True)

In [None]:
#joint_df

In [None]:
# Take the values of the joint_df DataFrame as a NumPy array for the model
samples = joint_df.to_numpy()
samples

In [None]:
# define the model, where n_clusters is the number of clusters.
# random_state fixes the centroid initialisation so the cluster labels are reproducible
# between runs; n_init is set explicitly because its default differs across scikit-learn
# releases.
model = KMeans(n_clusters=4, n_init=10, random_state=42)

In [None]:
# Fit the model on the feature array itself.
# `samples.data` is the array's raw memory buffer (a memoryview), not the data matrix,
# so the array is passed directly.
model.fit(samples)

In [None]:
# The result is the array all_predictions, whose length corresponds to the number of rows
# in the joint_df DataFrame. The feature array is passed directly (`samples.data` is only
# the raw memory buffer of the array).
all_predictions = model.predict(samples)
print(all_predictions[:10])  # print the first 10 (the previous 0:9 slice showed only 9)

In [None]:
# Labels stored on the fitted model (first 10); expected to match the predictions above
model.labels_[:10]

In [None]:
# Cluster centres found by the fitted model
model.cluster_centers_
# TODO (original note): year values would have to be converted to int values.
# NOTE(review): only Height (cm) and Width (cm) are selected as features in this notebook,
# so this TODO presumably applies only once Birth Year is added back — confirm.

In [None]:
# 2D plot of clusters: height against width, coloured by predicted cluster
x_axis = joint_df['Height (cm)']
y_axis = joint_df['Width (cm)']

fig, ax = plt.subplots()
ax.scatter(x_axis, y_axis, c=all_predictions)
ax.set_title("2D Viz of clusters")
ax.set_xlabel('Height size of the artwork')
ax.set_ylabel('Width size of the artwork')
plt.show()

In [None]:
# 3D Plot of clusters
#x_axis = joint_df['Width (cm)']  
#y_axis = joint_df['Birth Year']  
#z_axis = joint_df['Height (cm)']  

#ax = plt.axes(projection ="3d")

#plt.title("3D Viz of clusters")
#ax.set_xlabel('Width')
#ax.set_zlabel('Height')
#ax.set_ylabel('Birth Year')

#ax.scatter3D(x_axis, y_axis, z_axis, c = all_predictions)
#plt.show()

In [None]:
# Each predicted value from 0 to n can be mapped to a label such as
# "Lost Generation - low dimension pictures"
# "Generation X - low dimension pictures"
# "Generation Y - high dimension pictures"
# etc.
# and these labels can be saved in the Artworks table.