In [3]:
import pandas as pd
import json

# Preprocessing function definitions
def extract_country(item):
    """Return the ``country`` value of a stringified item-location dict.

    Args:
        item: Text of a dict, either JSON or a Python literal such as
            ``"{'country': 'US'}"``. Non-string input is treated as
            unparseable.

    Returns:
        The value stored under ``"country"``, or ``"Unknown"`` when the key
        is absent, the text cannot be parsed, or it does not describe a dict.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(item, str):
        return "Unknown"
    try:
        data = json.loads(item.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(item)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "Unknown"
    if not isinstance(data, dict):
        return "Unknown"
    return data.get("country", "Unknown")

def extract_original_price(mp):
    """Return ``originalPrice -> value`` of a stringified marketing-price dict.

    Args:
        mp: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the nested entry is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(mp, str):
        return None
    try:
        data = json.loads(mp.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(mp)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    original = data.get("originalPrice", {})
    if not isinstance(original, dict):
        return None
    try:
        return float(original.get("value", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_discount_ratio(mp):
    """Return ``discountPercentage`` of a stringified marketing-price dict.

    Args:
        mp: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the key is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(mp, str):
        return None
    try:
        data = json.loads(mp.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(mp)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return float(data.get("discountPercentage", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_price(p):
    """Return the ``value`` entry of a stringified price dict as a float.

    Args:
        p: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the key is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(p, str):
        return None
    try:
        data = json.loads(p.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(p)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return float(data.get("value", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_feedback_percentage(seller):
    """Return ``feedbackPercentage`` of a stringified seller dict as a float.

    Args:
        seller: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the key is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(seller, str):
        return None
    try:
        data = json.loads(seller.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(seller)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return float(data.get("feedbackPercentage", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_feedback_score(seller):
    """Return ``feedbackScore`` of a stringified seller dict as an int.

    Args:
        seller: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted with ``int()``, or ``None`` when the text cannot
        be parsed, the key is absent, or the value is not convertible.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(seller, str):
        return None
    try:
        data = json.loads(seller.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(seller)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return int(data.get("feedbackScore", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or something int() rejects, e.g. "12.5".
        return None

def extract_shipping_type(opt):
    """Return ``shippingCostType`` of the first entry in a stringified option list.

    Args:
        opt: Text of a list of dicts, either JSON or a Python literal.
            Non-string input is treated as unparseable.

    Returns:
        The value stored under ``"shippingCostType"`` in the first element,
        or ``"Unknown"`` when the text cannot be parsed, the list is empty,
        the first element is not a dict, or the key is absent.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(opt, str):
        return "Unknown"
    try:
        data = json.loads(opt.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(opt)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "Unknown"
    if not isinstance(data, (list, tuple)) or not data:
        return "Unknown"
    first = data[0]
    if not isinstance(first, dict):
        return "Unknown"
    return first.get("shippingCostType", "Unknown")

def extract_shipping_cost(opt):
    """Return ``shippingCost -> value`` of the first entry in a stringified option list.

    Args:
        opt: Text of a list of dicts, either JSON or a Python literal.
            Non-string input is treated as unparseable.

    Returns:
        The cost as ``float``, or the string ``"Unknown"`` when the text
        cannot be parsed, the list is empty, the nested entry is absent, or
        the value is not numeric. The string sentinel is kept on purpose so
        the values written to the output column do not change.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(opt, str):
        return "Unknown"
    try:
        data = json.loads(opt.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(opt)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "Unknown"
    if not isinstance(data, (list, tuple)) or not data:
        return "Unknown"
    first = data[0]
    if not isinstance(first, dict):
        return "Unknown"
    cost_info = first.get("shippingCost", {})
    if not isinstance(cost_info, dict):
        return "Unknown"
    cost = cost_info.get("value", "Unknown")
    if cost == "Unknown":
        return "Unknown"
    try:
        return float(cost)
    except (TypeError, ValueError, OverflowError):
        # Value present but None or not a number.
        return "Unknown"

# Load the CSV; if the default decoding raises UnicodeDecodeError, retry as ISO-8859-1
file_path = "장윤서.csv"
try:
    df = pd.read_csv(file_path)
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Overwrite itemLocation and price with their parsed values and append the
# derived columns. Note that discountPrice is filled from the originalPrice
# entry of marketingPrice.
df = df.assign(
    itemLocation=lambda frame: frame["itemLocation"].apply(extract_country),
    discountPrice=lambda frame: frame["marketingPrice"].apply(extract_original_price),
    discountRatio=lambda frame: frame["marketingPrice"].apply(extract_discount_ratio),
    price=lambda frame: frame["price"].apply(extract_price),
    feedbackPercentage=lambda frame: frame["seller"].apply(extract_feedback_percentage),
    feedbackScore=lambda frame: frame["seller"].apply(extract_feedback_score),
    shippingCostType=lambda frame: frame["shippingOptions"].apply(extract_shipping_type),
    shippingCost=lambda frame: frame["shippingOptions"].apply(extract_shipping_cost),
)

# Write the result without the index column
output_path = "장윤서_2.csv"
df.to_csv(output_path, index=False)


In [5]:
# Read back the CSV written by the preprocessing cell and print its
# per-column non-null counts and dtypes.
df2 = pd.read_csv('장윤서_2.csv')

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70669 entries, 0 to 70668
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   availableCoupons          70669 non-null  bool   
 1   bidCount                  615 non-null    float64
 2   buyingOptions             70669 non-null  object 
 3   categories                70669 non-null  object 
 4   condition                 70636 non-null  object 
 5   currentBidPrice           615 non-null    object 
 6   itemCreationDate          70669 non-null  object 
 7   itemEndDate               615 non-null    object 
 8   itemId                    70669 non-null  object 
 9   itemLocation              70669 non-null  object 
 10  itemOriginDate            70669 non-null  object 
 11  leafCategoryIds           70669 non-null  object 
 12  listingMarketplaceId      69874 non-null  object 
 13  marketingPrice            16664 non-null  object 
 14  price 

In [35]:
import pandas as pd
import json
import os

# Preprocessing function definitions
def extract_country(item):
    """Return the ``country`` value of a stringified item-location dict.

    Args:
        item: Text of a dict, either JSON or a Python literal such as
            ``"{'country': 'US'}"``. Non-string input is treated as
            unparseable.

    Returns:
        The value stored under ``"country"``, or ``"Unknown"`` when the key
        is absent, the text cannot be parsed, or it does not describe a dict.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(item, str):
        return "Unknown"
    try:
        data = json.loads(item.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(item)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "Unknown"
    if not isinstance(data, dict):
        return "Unknown"
    return data.get("country", "Unknown")

def extract_original_price(mp):
    """Return ``originalPrice -> value`` of a stringified marketing-price dict.

    Args:
        mp: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the nested entry is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(mp, str):
        return None
    try:
        data = json.loads(mp.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(mp)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    original = data.get("originalPrice", {})
    if not isinstance(original, dict):
        return None
    try:
        return float(original.get("value", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_discount_ratio(mp):
    """Return ``discountPercentage`` of a stringified marketing-price dict.

    Args:
        mp: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the key is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(mp, str):
        return None
    try:
        data = json.loads(mp.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(mp)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return float(data.get("discountPercentage", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_price(p):
    """Return the ``value`` entry of a stringified price dict as a float.

    Args:
        p: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the key is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(p, str):
        return None
    try:
        data = json.loads(p.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(p)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return float(data.get("value", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_feedback_percentage(seller):
    """Return ``feedbackPercentage`` of a stringified seller dict as a float.

    Args:
        seller: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted to ``float``, or ``None`` when the text cannot be
        parsed, the key is absent, or the value is not numeric.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(seller, str):
        return None
    try:
        data = json.loads(seller.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(seller)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return float(data.get("feedbackPercentage", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or text that is not a number.
        return None

def extract_feedback_score(seller):
    """Return ``feedbackScore`` of a stringified seller dict as an int.

    Args:
        seller: Text of a dict, either JSON or a Python literal. Non-string
            input is treated as missing.

    Returns:
        The value converted with ``int()``, or ``None`` when the text cannot
        be parsed, the key is absent, or the value is not convertible.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(seller, str):
        return None
    try:
        data = json.loads(seller.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(seller)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    if not isinstance(data, dict):
        return None
    try:
        return int(data.get("feedbackScore", None))
    except (TypeError, ValueError, OverflowError):
        # Missing value (None) or something int() rejects, e.g. "12.5".
        return None

def extract_shipping_type(opt):
    """Return ``shippingCostType`` of the first entry in a stringified option list.

    Args:
        opt: Text of a list of dicts, either JSON or a Python literal.
            Non-string input is treated as unparseable.

    Returns:
        The value stored under ``"shippingCostType"`` in the first element,
        or ``"Unknown"`` when the text cannot be parsed, the list is empty,
        the first element is not a dict, or the key is absent.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(opt, str):
        return "Unknown"
    try:
        data = json.loads(opt.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(opt)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "Unknown"
    if not isinstance(data, (list, tuple)) or not data:
        return "Unknown"
    first = data[0]
    if not isinstance(first, dict):
        return "Unknown"
    return first.get("shippingCostType", "Unknown")

def extract_shipping_cost(opt):
    """Return ``shippingCost -> value`` of the first entry in a stringified option list.

    Args:
        opt: Text of a list of dicts, either JSON or a Python literal.
            Non-string input is treated as unparseable.

    Returns:
        The cost as ``float``, or the string ``"Unknown"`` when the text
        cannot be parsed, the list is empty, the nested entry is absent, or
        the value is not numeric. The string sentinel is kept on purpose so
        the values written to the output column do not change.
    """
    import ast  # local import: the function does not rely on a cell-level import

    if not isinstance(opt, str):
        return "Unknown"
    try:
        data = json.loads(opt.replace("'", '"'))
    except (ValueError, RecursionError):
        # The quote swap corrupts values that contain an apostrophe and cannot
        # express True/False/None, so retry the raw text as a Python literal.
        try:
            data = ast.literal_eval(opt)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return "Unknown"
    if not isinstance(data, (list, tuple)) or not data:
        return "Unknown"
    first = data[0]
    if not isinstance(first, dict):
        return "Unknown"
    cost_info = first.get("shippingCost", {})
    if not isinstance(cost_info, dict):
        return "Unknown"
    cost = cost_info.get("value", "Unknown")
    if cost == "Unknown":
        return "Unknown"
    try:
        return float(cost)
    except (TypeError, ValueError, OverflowError):
        # Value present but None or not a number.
        return "Unknown"

# Input files
file_list = ["장윤서.csv", "강지수.csv", "노현비.csv", "황영재.csv", "김정운.csv"]

# Preprocess every file, collecting the results for one final concat
processed_dfs = []

for file_path in file_list:
    # NOTE(review): the saved cell output shows a warning whose source line is
    # this read_csv call — possibly a DtypeWarning about mixed column types;
    # confirm, and consider passing an explicit dtype= if so.
    try:
        df = pd.read_csv(file_path)
    except UnicodeDecodeError:
        # Default decoding failed; retry as ISO-8859-1.
        df = pd.read_csv(file_path, encoding="ISO-8859-1")

    # itemLocation and price are overwritten with parsed values; the other six
    # columns are appended. discountPrice comes from marketingPrice's
    # originalPrice entry.
    df = df.assign(
        itemLocation=df["itemLocation"].apply(extract_country),
        discountPrice=df["marketingPrice"].apply(extract_original_price),
        discountRatio=df["marketingPrice"].apply(extract_discount_ratio),
        price=df["price"].apply(extract_price),
        feedbackPercentage=df["seller"].apply(extract_feedback_percentage),
        feedbackScore=df["seller"].apply(extract_feedback_score),
        shippingCostType=df["shippingOptions"].apply(extract_shipping_type),
        shippingCost=df["shippingOptions"].apply(extract_shipping_cost),
    )
    processed_dfs.append(df)

# Stack all frames under a fresh 0..n-1 index
merged_df = pd.concat(processed_dfs, ignore_index=True)

# Write the merged table without the index column
output_path = "RawData.csv"
merged_df.to_csv(output_path, index=False)


  df = pd.read_csv(file_path)


In [36]:
# Read the merged CSV back from disk and print its per-column non-null
# counts and dtypes.
df = pd.read_csv('RawData.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 835132 entries, 0 to 835131
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   availableCoupons          835132 non-null  bool   
 1   bidCount                  1200 non-null    float64
 2   buyingOptions             835132 non-null  object 
 3   categories                835132 non-null  object 
 4   condition                 834233 non-null  object 
 5   currentBidPrice           1200 non-null    object 
 6   itemCreationDate          835132 non-null  object 
 7   itemEndDate               1200 non-null    object 
 8   itemId                    835132 non-null  object 
 9   itemLocation              835132 non-null  object 
 10  itemOriginDate            835132 non-null  object 
 11  leafCategoryIds           835132 non-null  object 
 12  listingMarketplaceId      803902 non-null  object 
 13  marketingPrice            240157 non-null  o

In [37]:
# Preview the first rows of the merged frame.
df.head()

Unnamed: 0,availableCoupons,bidCount,buyingOptions,categories,condition,currentBidPrice,itemCreationDate,itemEndDate,itemId,itemLocation,...,seller,shippingOptions,title,topRatedBuyingExperience,discountPrice,discountRatio,feedbackPercentage,feedbackScore,shippingCostType,shippingCost
0,False,,['FIXED_PRICE'],"[{'categoryId': '262016', 'categoryName': 'Col...",Pre-owned,,2025-06-14T06:59:29.000Z,,v1|197434763288|0,US,...,"{'username': 'greatfindsallkinds', 'feedbackPe...","[{'shippingCostType': 'FIXED', 'shippingCost':...",Jewelry Lot All Good Wear Resell Brooch Neckla...,False,,,100.0,247,FIXED,0.0
1,False,,['FIXED_PRICE'],"[{'categoryId': '155101', 'categoryName': 'Nec...",New with tags,,2022-02-11T06:06:24.000Z,,v1|255380790240|555349556632,US,...,"{'username': 'liqinlu0', 'feedbackPercentage':...","[{'shippingCostType': 'FIXED', 'shippingCost':...",925 Silver Chain Baby's Breath Snake Square Wo...,False,,,97.6,46693,FIXED,0.0
2,False,,['FIXED_PRICE'],"[{'categoryId': '155101', 'categoryName': 'Nec...",New with tags,,2024-03-13T14:35:06.000Z,,v1|315221500858|0,US,...,"{'username': 'relaxingfox', 'feedbackPercentag...","[{'shippingCostType': 'FIXED', 'shippingCost':...",Fashion Jewelry Silver Basketball Pendant Neck...,True,12.95,10.0,98.8,1944,FIXED,2.5
3,False,,"['FIXED_PRICE', 'BEST_OFFER']","[{'categoryId': '262016', 'categoryName': 'Col...",Pre-owned,,2024-12-16T01:07:54.000Z,,v1|396015645951|0,US,...,"{'username': 'adstell1200', 'feedbackPercentag...","[{'shippingCostType': 'FIXED', 'shippingCost':...",Vintage Estate Now Jewelry Lot 5 pieces NO jun...,False,,,70.5,32,FIXED,0.0
4,False,,"['FIXED_PRICE', 'BEST_OFFER']","[{'categoryId': '262016', 'categoryName': 'Col...",Pre-owned,,2024-03-25T03:41:30.000Z,,v1|186361363809|0,US,...,"{'username': 'bringinghappyhome', 'feedbackPer...","[{'shippingCostType': 'FIXED', 'shippingCost':...",3 Lb Pounds Unsearched Huge Lot Jewelry Vintag...,False,,,91.9,10742,FIXED,10.99


In [39]:
# Draw 5,000 rows at random from df and save them as a separate CSV.
sample_df = df.sample(n=5000, random_state=42)  # fixing the random seed is optional

sample_df.to_csv('Sample.csv', index = False)

In [4]:
import pandas as pd
# NOTE(review): RawData.csv is read as cp949 here, while another cell reads the
# same filename with the default encoding, and the output below lists
# category_* columns that no visible cell creates — presumably the file was
# edited and re-saved outside this notebook; confirm the on-disk encoding.
df = pd.read_csv('RawData.csv', encoding = 'cp949')

# Number of distinct values in each column.
df.nunique()

availableCoupons                 2
bidCount                         2
buyingOptions                    3
categories                    4582
condition                       63
currentBidPrice                330
itemCreationDate            332874
itemEndDate                    835
itemId                      379462
itemLocation                   116
itemOriginDate              334511
leafCategoryIds               3911
listingMarketplaceId             8
marketingPrice               55370
price                        19374
priorityListing                  2
seller                      213476
shippingOptions               7311
title                       338743
topRatedBuyingExperience         2
discountPrice                11163
discountRatio                  161
feedbackPercentage             290
feedbackScore                40641
shippingCostType                 3
shippingCost                  2027
category_Total                4285
category_1                      31
category_2          

In [6]:
# Distinct-value counts per column, ordered from the most to the fewest.
df.nunique().sort_values(ascending = False)

itemId                      379462
title                       338743
itemOriginDate              334511
itemCreationDate            332874
seller                      213476
marketingPrice               55370
feedbackScore                40641
price                        19374
discountPrice                11163
shippingOptions               7311
categories                    4582
category_Total                4285
leafCategoryIds               3911
shippingCost                  2027
category_2                    1968
itemEndDate                    835
currentBidPrice                330
feedbackPercentage             290
discountRatio                  161
itemLocation                   116
condition                       63
category_1                      31
listingMarketplaceId             8
shippingCostType                 3
buyingOptions                    3
priorityListing                  2
bidCount                         2
topRatedBuyingExperience         2
availableCoupons    