In [1]:
import pandas as pd
import numpy as np
import random

# 데이터셋 생성
np.random.seed(42)
artists = ["Pablo Picasso", "Vincent van Gogh", "Claude Monet", "Georgia O'Keeffe", "Frida Kahlo"]
categories = ["Painting", "Sculpture", "Photography", "Drawing"]
regions = ["North America", "Europe", "Asia", "Middle East", "South America"]
auction_houses = ["Christie's", "Sotheby's", "Phillips"]

data = {
    "AuctionID": [f"AID-{i}" for i in range(1, 1001)],
    "Year": np.random.randint(2010, 2023, 1000),
    "Artist": np.random.choice(artists, 1000),
    "ArtworkTitle": [f"Artwork {i}" for i in range(1, 1001)],
    "Category": np.random.choice(categories, 1000),
    "PriceUSD": np.random.randint(5000, 1000000, 1000),
    "EstimatedValueUSD": np.random.randint(4000, 950000, 1000),
    "BuyerRegion": np.random.choice(regions, 1000),
    "ResaleYear": np.random.choice(
        [np.nan] + list(range(2015, 2025)), 1000, p=[0.6] + [0.04]*10
    ),
    "ResalePriceUSD": [np.nan if np.random.rand() > 0.4 else np.random.randint(10000, 2000000) for _ in range(1000)],
    "CulturalSignificanceScore": np.random.randint(1, 11, 1000),
    "FinancialReturn": [
        np.nan if np.random.rand() > 0.4 else round(np.random.uniform(-0.5, 1.5), 2) * 100
        for _ in range(1000)
    ],
    "AuctionHouse": np.random.choice(auction_houses, 1000),
}

df = pd.DataFrame(data)

# 데이터셋을 저장하거나 바로 분석에 활용
df.to_csv("art_auction_dataset.csv", index=False)

In [2]:
df = pd.read_csv(r'C:\Users\USER\Desktop\my_git\learning-data-science\ADP\분석 연습\art_auction_dataset.csv')
df.head(5)

Unnamed: 0,AuctionID,Year,Artist,ArtworkTitle,Category,PriceUSD,EstimatedValueUSD,BuyerRegion,ResaleYear,ResalePriceUSD,CulturalSignificanceScore,FinancialReturn,AuctionHouse
0,AID-1,2016,Georgia O'Keeffe,Artwork 1,Painting,528886,337299,South America,2015.0,,7,137.0,Sotheby's
1,AID-2,2013,Pablo Picasso,Artwork 2,Photography,538884,759600,Asia,,,9,,Sotheby's
2,AID-3,2022,Pablo Picasso,Artwork 3,Painting,239627,63500,Asia,,,8,,Sotheby's
3,AID-4,2020,Claude Monet,Artwork 4,Drawing,636827,171647,Middle East,,,9,,Christie's
4,AID-5,2017,Frida Kahlo,Artwork 5,Sculpture,547601,240527,Asia,2020.0,,8,,Sotheby's


In [10]:
df_summary = df.describe(include='all')
print(df_summary)
print(df.info())

       AuctionID         Year         Artist ArtworkTitle  Category  \
count       1000  1000.000000           1000         1000      1000   
unique      1000          NaN              5         1000         4   
top        AID-1          NaN  Pablo Picasso    Artwork 1  Painting   
freq           1          NaN            203            1       260   
mean         NaN  2015.950000            NaN          NaN       NaN   
std          NaN     3.825197            NaN          NaN       NaN   
min          NaN  2010.000000            NaN          NaN       NaN   
25%          NaN  2013.000000            NaN          NaN       NaN   
50%          NaN  2016.000000            NaN          NaN       NaN   
75%          NaN  2019.000000            NaN          NaN       NaN   
max          NaN  2022.000000            NaN          NaN       NaN   

             PriceUSD  EstimatedValueUSD BuyerRegion   ResaleYear  \
count     1000.000000        1000.000000        1000   380.000000   
unique   

작품 유형별 평균 판매가 및 투자 수익률

In [12]:
category_analysis = df.groupby("Category").agg(
    AvgPriceUSE=("PriceUSD", "mean"),
    AvgFinancialReturn=("FinancialReturn", "mean"),
    Count=("Category", "size")
).reset_index()

category_analysis

Unnamed: 0,Category,AvgPriceUSE,AvgFinancialReturn,Count
0,Drawing,529164.114625,52.990654,253
1,Painting,486676.25,46.666667,260
2,Photography,496131.017544,53.433962,228
3,Sculpture,503161.239382,53.99115,259
