In [6]:
import pandas as pd

df = pd.read_csv('RawData.csv', encoding = 'CP949')

df = df.drop_duplicates()

# 1. Condition 그룹화
# 1. 결측치를 먼저 'Unknown' 문자열로 변환
df['condition'] = df['condition'].fillna('Unknown')

# 2. 매핑 함수 적용
def map_condition(condition):
    condition = condition.lower()
    if any(x in condition for x in ['brand new', 'new with tags', 'new/factory sealed', 'neuf', 'neu', 'new: never used', 'opened – never used']):
        return 'New'
    elif any(x in condition for x in ['pre-owned', 'used', 'good', 'very good', 'acceptable', 'usato', 'gebraucht']):
        return 'Used'
    elif any(x in condition for x in ['seller refurbished', 'certified refurbished', 'excellent - refurbished', 'gut - refurbished', 'refurbished']):
        return 'Refurbished'
    elif any(x in condition for x in ['open box', 'new: other', 'new without tags', 'new without box']):
        return 'Open box'
    elif any(x in condition for x in ['for parts or not working', 'remanufactured']):
        return 'Parts or Not Working'
    elif any(x in condition for x in ['unknown', 'unspecified']):
        return 'Unknown'
    else:
        return 'Unknown'

# 3. 새로운 그룹 컬럼 생성
df['condition_group'] = df['condition'].apply(map_condition)


# 2. marketingPrice 컬럼 삭제
df = df.drop(columns=['marketingPrice'], errors='ignore')

# 3. shippingOptions 컬럼 삭제
df = df.drop(columns=['shippingOptions'], errors='ignore')

# 4. discountPrice 컬럼 삭제
df = df.drop(columns=['discountPrice'], errors='ignore')

# 5. discountRatio 결측값을 0으로 대체
df['discountRatio'] = df['discountRatio'].fillna(0)

# 6. MarketingPriceId 결측값을 'Unknown'으로 대체
df['listingMarketplaceId'] = df['listingMarketplaceId'].fillna('Unknown')


df.to_csv('CleanData.csv', index = False)

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 835132 entries, 0 to 835131
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   availableCoupons          835132 non-null  bool   
 1   bidCount                  1200 non-null    float64
 2   buyingOptions             835132 non-null  object 
 3   categories                835132 non-null  object 
 4   condition                 834233 non-null  object 
 5   currentBidPrice           1200 non-null    object 
 6   itemCreationDate          835132 non-null  object 
 7   itemEndDate               1200 non-null    object 
 8   itemId                    835132 non-null  object 
 9   itemLocation              835132 non-null  object 
 10  itemOriginDate            835132 non-null  object 
 11  leafCategoryIds           835132 non-null  object 
 12  listingMarketplaceId      803902 non-null  object 
 13  marketingPrice            240157 non-null  o

In [7]:
# 전체 중복된 행 개수
df.duplicated().sum()


4627