1. Cargamos las librerías que necesitamos.

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

2. Carga de datos

In [104]:
# Load the artists table from the CSV file in the working directory.
artists_df = pd.read_csv('artists.csv')

In [105]:
# Load the artworks table from the CSV file in the working directory.
artworks_df = pd.read_csv('artworks.csv')

3. Analizamos las características de los dos datasets

In [106]:
# Show the (rows, columns) size of the artists table.
print(artists_df.shape)

(15091, 6)


In [107]:
# List the column labels of the artists table.
print(artists_df.columns)

Index(['Artist ID', 'Name', 'Nationality', 'Gender', 'Birth Year',
       'Death Year'],
      dtype='object')


In [108]:
# Preview the first five rows of the artists table as plain text.
print(artists_df.head())

   Artist ID             Name Nationality Gender  Birth Year  Death Year
0          1   Robert Arneson    American   Male      1930.0      1992.0
1          2   Doroteo Arnaiz     Spanish   Male      1936.0         NaN
2          3      Bill Arnold    American   Male      1941.0         NaN
3          4  Charles Arnoldi    American   Male      1946.0         NaN
4          5      Per Arnoldi      Danish   Male      1941.0         NaN


In [109]:
# Show the (rows, columns) size of the artworks table.
print(artworks_df.shape)

(130262, 21)


In [110]:
# List the column labels of the artworks table.
print(artworks_df.columns)

Index(['Artwork ID', 'Title', 'Artist ID', 'Name', 'Date', 'Medium',
       'Dimensions', 'Acquisition Date', 'Credit', 'Catalogue', 'Department',
       'Classification', 'Object Number', 'Diameter (cm)',
       'Circumference (cm)', 'Height (cm)', 'Length (cm)', 'Width (cm)',
       'Depth (cm)', 'Weight (kg)', 'Duration (s)'],
      dtype='object')


In [111]:
# First look at the available fields and their data types, to decide which
# ones to leave out of the analysis.
print(artworks_df.head())

   Artwork ID                                              Title Artist ID  \
0           2  Ferdinandsbrücke Project, Vienna, Austria, Ele...      6210   
1           3  City of Music, National Superior Conservatory ...      7470   
2           4  Villa near Vienna Project, Outside Vienna, Aus...      7605   
3           5  The Manhattan Transcripts Project, New York, N...      7056   
4           6  Villa, project, outside Vienna, Austria, Exter...      7605   

                       Name  Date  \
0               Otto Wagner  1896   
1  Christian de Portzamparc  1987   
2                Emil Hoppe  1903   
3           Bernard Tschumi  1980   
4                Emil Hoppe  1903   

                                              Medium  \
0      Ink and cut-and-pasted painted pages on paper   
1                  Paint and colored pencil on print   
2  Graphite, pen, color pencil, ink, and gouache ...   
3  Photographic reproduction with colored synthet...   
4  Graphite, color pencil, i

In [112]:
# Count the distinct values in 'Department'; missing values are not counted.
num_unique_departments = artworks_df['Department'].nunique(dropna=True)
print(f"Number of unique values in 'Department': {num_unique_departments}")

Number of unique values in 'Department': 9


In [113]:
# Collect the distinct 'Department' values and print them under a heading.
unique_departments = artworks_df['Department'].unique()
print("Unique values in 'Department':", unique_departments, sep="\n")

Unique values in 'Department':
['Architecture & Design' 'Prints & Illustrated Books' 'Drawings'
 'Painting & Sculpture' 'Photography' 'Media and Performance Art' 'Film'
 'Architecture & Design - Image Archive' 'Fluxus Collection']


In [114]:
# Collect the distinct 'Classification' values and print them under a heading.
unique_classification = artworks_df['Classification'].unique()
print("Unique values in 'Classification':", unique_classification, sep="\n")

Unique values in 'Classification':
['Architecture' 'Mies van der Rohe Archive' 'Design' 'Illustrated Book'
 'Print' 'Drawing' 'Film' 'Multiple' 'Periodical' 'Photograph' 'Painting'
 'Product Design' 'Photography Research/Reference' 'Media' 'Sculpture'
 'Textile' 'Installation' 'Video' 'Work on Paper' 'Audio' 'Performance'
 '(not assigned)' 'Film (object)' 'Ephemera' 'Collage'
 'Frank Lloyd Wright Archive' 'Furniture and Interiors' 'Software']


In [115]:
# Columns discarded from the artworks table before the analysis.
columns_to_drop = [
    'Medium',
    'Credit',
    'Catalogue',
    'Classification',
    'Object Number',
    'Diameter (cm)',
    'Circumference (cm)',
    'Height (cm)',
    'Length (cm)',
    'Width (cm)',
    'Depth (cm)',
    'Weight (kg)',
    'Duration (s)',
]

# Rebind artworks_df to a copy without those columns.
artworks_df = artworks_df.drop(columns_to_drop, axis='columns')

4. Revisamos los valores nulos para operar sobre ellos en el proceso de limpieza.

Comenzamos con el dataset artists

In [116]:
# Check for NaN values: count the missing entries in each column.
artists_df.isnull().sum()

Unnamed: 0,0
Artist ID,0
Name,0
Nationality,2488
Gender,3072
Birth Year,3854
Death Year,10512


In [117]:
# Turn both year columns into text so string operations can be applied:
# strip the literal '.0' left over from the float representation and swap
# the 'nan' text produced by the cast for a readable placeholder.
for year_column in ('Birth Year', 'Death Year'):
    year_text = artists_df[year_column].astype(str)
    year_text = year_text.str.replace('.0', '', regex=False)
    artists_df[year_column] = year_text.replace('nan', 'Year unknown', regex=False)

# Show the change on the first five records.
print(artists_df[['Birth Year', 'Death Year']].head())

  Birth Year    Death Year
0       1930          1992
1       1936  Year unknown
2       1941  Year unknown
3       1946  Year unknown
4       1941  Year unknown


In [118]:
# Label missing 'Gender' values explicitly instead of leaving NaN.
artists_df['Gender'] = artists_df['Gender'].fillna('Gender unknown')

In [119]:
# Label missing 'Nationality' values explicitly instead of leaving NaN.
artists_df['Nationality'] = artists_df['Nationality'].fillna('Nationality unknown')

In [120]:
# Re-count missing values per column to confirm the artists table has none left.
artists_df.isnull().sum()

Unnamed: 0,0
Artist ID,0
Name,0
Nationality,0
Gender,0
Birth Year,0
Death Year,0


Continuamos con el dataset artworks

In [121]:
# Count the missing values in each remaining column of the artworks table.
artworks_df.isnull().sum()

Unnamed: 0,0
Artwork ID,0
Title,62
Artist ID,1460
Name,1460
Date,2312
Dimensions,11463
Acquisition Date,5463
Department,0


En esta primera revisión evidenciamos que tenemos 1460 obras de las que desconocemos su artista; como no son objeto de nuestro análisis, podemos borrar dichos registros. Eliminamos también las obras que no tienen fecha de adquisición.

In [133]:
# Discard artworks that lack an artist reference (ID or name) or an
# acquisition date: a row is removed when any of these columns is missing.
artworks_df = artworks_df.dropna(
    axis='index',
    how='any',
    subset=['Artist ID', 'Name', 'Acquisition Date'],
)

In [134]:
# Preview the first rows of the artworks table after removing incomplete
# records. `artworks_df` is the frame the cleaning cells operate on; the
# previously used name `artworks_df_artist` is not assigned in any visible
# cell and raises NameError on a fresh kernel.
artworks_df.head()

Unnamed: 0,Artwork ID,Title,Artist ID,Name,Date,Dimensions,Acquisition Date,Department
0,2,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",6210,Otto Wagner,1896,"19 1/8 x 66 1/2"" (48.6 x 168.9 cm)",1996-04-09,Architecture & Design
1,3,"City of Music, National Superior Conservatory ...",7470,Christian de Portzamparc,1987,"16 x 11 3/4"" (40.6 x 29.8 cm)",1995-01-17,Architecture & Design
2,4,"Villa near Vienna Project, Outside Vienna, Aus...",7605,Emil Hoppe,1903,"13 1/2 x 12 1/2"" (34.3 x 31.8 cm)",1997-01-15,Architecture & Design
3,5,"The Manhattan Transcripts Project, New York, N...",7056,Bernard Tschumi,1980,"20 x 20"" (50.8 x 50.8 cm)",1995-01-17,Architecture & Design
4,6,"Villa, project, outside Vienna, Austria, Exter...",7605,Emil Hoppe,1903,"15 1/8 x 7 1/2"" (38.4 x 19.1 cm)",1997-01-15,Architecture & Design


In [135]:
# Label missing 'Title' values explicitly instead of leaving NaN.
artworks_df['Title'] = artworks_df['Title'].fillna('Title unknown')

In [136]:
# Label missing 'Date' values explicitly instead of leaving NaN.
artworks_df['Date'] = artworks_df['Date'].fillna('Date unknown')

In [137]:
# Label missing 'Dimensions' values explicitly instead of leaving NaN.
artworks_df['Dimensions'] = artworks_df['Dimensions'].fillna('Dimensions unknown')

In [138]:
# Re-count missing values per column to confirm the artworks table has none left.
artworks_df.isnull().sum()

Unnamed: 0,0
Artwork ID,0
Title,0
Artist ID,0
Name,0
Date,0
Dimensions,0
Acquisition Date,0
Department,0


5. Comenzamos con el análisis

In [128]:
# Keep only the artists whose recorded gender is exactly 'Female'.
female_artists = artists_df.loc[artists_df['Gender'] == 'Female']

# Tally them per nationality; the group sizes become the 'Count' column.
nationality_groups = female_artists.groupby('Nationality')
female_artists_count_by_nationality = nationality_groups.size().reset_index(name='Count')

print("Count of female artists by Nationality:")
display(female_artists_count_by_nationality)

Count of female artists by Nationality:


Unnamed: 0,Nationality,Count
0,Afghan,1
1,Algerian,2
2,American,1046
3,Angolan,1
4,Argentine,25
...,...,...
62,Taiwanese,2
63,Turkish,5
64,Ukrainian,1
65,Venezuelan,5


In [129]:
# Render the first five rows of the cleaned artists table.
display(artists_df.head())

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
0,1,Robert Arneson,American,Male,1930,1992
1,2,Doroteo Arnaiz,Spanish,Male,1936,Year unknown
2,3,Bill Arnold,American,Male,1941,Year unknown
3,4,Charles Arnoldi,American,Male,1946,Year unknown
4,5,Per Arnoldi,Danish,Male,1941,Year unknown


In [130]:
# Select the rows whose 'Gender' is exactly 'Female' (all columns kept).
female_artists_df = artists_df[artists_df['Gender'] == 'Female']

In [131]:
# Display the filtered frame; the rendered output is truncated by pandas.
female_artists_df

Unnamed: 0,Artist ID,Name,Nationality,Gender,Birth Year,Death Year
8,10,Irene Aronson,American,Female,1918,Year unknown
16,21,Ruth Asawa,American,Female,1926,2013
17,22,Isidora Aschheim,Israeli,Female,Year unknown,Year unknown
23,28,Geneviève Asse,French,Female,1923,Year unknown
25,31,Dana Atchley,American,Female,1941,2000
...,...,...,...,...,...,...
15013,50154,Ann Magnuson,American,Female,1956,Year unknown
15022,67012,Ka Markelius,Nationality unknown,Female,Year unknown,Year unknown
15034,67122,Giorgia Lupi,Italian,Female,1981,Year unknown
15044,67272,Toyin Ojih Odutola,American,Female,1985,Year unknown


In [132]:
# Count the missing values per column in the artworks table.
# NOTE(review): the saved output of this cell comes from execution count 132,
# i.e. before the dropna cell (133) ran, so it will likely differ after
# Restart & Run All — confirm and refresh the output.
artworks_df.isnull().sum()

Unnamed: 0,0
Artwork ID,0
Title,0
Artist ID,1460
Name,1460
Date,0
Dimensions,0
Acquisition Date,5463
Department,0
