❗ 파일 주소는 각자 수정해주세요...

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('./data/okcupid_profiles.csv')

In [56]:
# 필요 컬럼
required_cols = [
    "sex","orientation","body_type","drinks","drugs","education","height","job",
    "last_online","religion","sign","smokes"
]

# keep 컬럼
keep_cols = ["age","status","diet","ethnicity","location","pets"]

# 사용 컬럼
use_cols = required_cols + keep_cols
df = df[use_cols].copy()

In [57]:
# 성별 인코딩 
df['sex'] = df['sex'].map({"f":0, "m":1}).astype("int8")


# 성적 지향 인코딩
df["orientation"] = df["orientation"].astype("string").str.strip().str.lower()

# 성소수자 매핑
minor_map = {
    "straight": 0,   # 이성애자
    "gay": 1,        # 동성애자 (성소수자)
    "bisexual": 1    # 양성애자 (성소수자)
}

df["orientation"] = df["orientation"].map(minor_map).astype("int8")


# height 전처리
df = df.drop(df.index[df['height'].isna()])     # 결측치 3개 제거
df = df.drop(df.index[df['height'] < 30])       # 이상치 7개 제거 : 30inch = 76.2cm

In [58]:
# 석사 이상
high_edu = ['graduated from masters program', 'working on masters program',
           'graduated from ph.d program', 'working on ph.d program',
           'graduated from med school', 'working on law school',
           'working on med school', 'dropped out of masters program',
           'masters program', 'dropped out of ph.d program',
           'ph.d program', 'law school', 'graduated from law school',
           'dropped out of law school', 'dropped out of med school',
           'med school'
]
# 대학교 졸업
univ_grad = ['graduated from college/university', 'graduated from two-year college',
             'college/university'
]
# 대학교 재학
univ_ing = ['working on college/university', 'working on two-year college',
            'dropped out of college/university', 'two-year college',
            'dropped out of two-year college'
]
# 고등학교 졸업 이하
high_school = ['graduated from high school', 'dropped out of high school',
               'high school', 'working on high school'
]
# 나머지
other = ['graduated from space camp', 'dropped out of space camp',
         'working on space camp', 'space camp'
]
# education 컬럼 값 통합
df['education'] = df['education'].replace(high_edu, 'high_edu')
df['education'] = df['education'].replace(univ_grad, 'univ_grad')
df['education'] = df['education'].replace(univ_ing, 'univ_ing')
df['education'] = df['education'].replace(high_school, 'high_school')
df['education'] = df['education'].replace(other, 'other')

# 범주형 변수 원-핫 인코딩
df = pd.get_dummies(df, columns=['education'], drop_first=True)

In [59]:
def classify_belief(text):
    # 결측치 처리 (NaN이면 안 믿는다로 분류)
    if pd.isna(text):
        return 0
    
    # 믿는 그룹의 키워드가 포함되어 있는지 확인
    if "fun to think about" in text or "matters a lot" in text:
        return 1
    
    # 그 외 (doesn't matter 또는 별자리 이름만 있는 경우)
    else:
        return 0

# 새로운 컬럼 생성 (두번 실행 금지)
df['sign'] = df['sign'].apply(classify_belief)

# 결과 확인
print(df['sign'].value_counts())

sign
0    39931
1    20005
Name: count, dtype: int64


In [60]:
# used up, rather not say - 결측 처리
df["body_type"] = df["body_type"].replace({"used up": np.nan, "rather not say": np.nan})

body_map = {
    # 마름
    "thin": "slim",
    "skinny": "slim",

    # 보통
    "average": "average",

    # 건강
    "fit": "fit",
    "athletic": "fit",
    "jacked": "fit",

    # 통통
    "curvy": "curvy",
    "a little extra": "curvy",
    "full figured": "curvy",
    "overweight": "curvy",
}

df["body_type"] = df["body_type"].map(body_map)

# 원-핫 인코딩
df = pd.get_dummies(df, columns=['body_type'])

In [61]:
# 비흡연
no_smoke = ['no']
# 가끔 흡연
sometime_smoke = ['sometimes', 'when drinking']
# 흡연
smoke = ['yes', 'trying to quit']

# smokes 컬럼 값 통합
df['smokes'] = df['smokes'].replace(no_smoke, 'no_smoke')
df['smokes'] = df['smokes'].replace(sometime_smoke, 'sometime_smoke')
df['smokes'] = df['smokes'].replace(smoke, 'smoke')

# 범주형 변수 원-핫 인코딩
df = pd.get_dummies(df, columns=['smokes'], drop_first=True)

In [62]:
# 1. 결측치 처리 (가장 빈도가 높은 'never'로 채우기)
df['drugs'] = df['drugs'].fillna('never')

# 2. 값 매핑 (never: 2, sometimes: 1, often: 0)
drug_map = {
    'never': 2,
    'sometimes': 1,
    'often': 0
}

df['drugs'] = df['drugs'].map(drug_map)

# 결과 확인
print(df['drugs'].value_counts())

drugs
2    51795
1     7732
0      409
Name: count, dtype: int64


In [63]:
df['drinks'].value_counts()

drinks_map = {
    # 안 마신다 (9224개)
    "not at all": "no_drinks",
    "rarely": "no_drinks",

    # 적당히 마신다 (41776개)
    "socially": "moderate",

    # 많이 마신다 (5954개)
    "often": "heavy",
    "very often": "heavy",
    "desperately": "heavy"
}

df["drinks"] = df["drinks"].map(drinks_map)

df = pd.get_dummies(df, columns=['drinks'], drop_first=False)

In [64]:
# 마지막 온라인 시간 - 시간 삭제
df["last_online"] = pd.to_datetime(df["last_online"].str[:10], format="%Y-%m-%d")

In [65]:
# job 컬럼 인코딩
df['job_encoding'] = np.select(
    [((df['job'] == 'science / tech / engineering') | (df['job'] == 'computer / hardware / software')),
     ((df['job'] == 'sales / marketing / biz dev') | (df['job'] == 'executive / management') | (df['job'] == 'banking / financial / real estate')),
     ((df['job'] == 'artistic / musical / writer') | (df['job'] == 'entertainment / media')),
     ((df['job'] == 'education / academia') | (df['job'] == 'medicine / health') | (df['job'] == 'political / government') | (df['job'] == 'law / legal services')),
     ((df['job'] == 'hospitality / travel') | (df['job'] == 'construction / craftsmanship') | (df['job'] == 'clerical / administrative') | (df['job'] == 'transportation') | (df['job'] == 'military'))],
     # student, unemployed, retired, other, rather not say, 결측치는 모두 6
    [1, 2, 3, 4, 5],
    default=6
)

# job_encoding 컬럼의 value_count() 출력
print(df['job_encoding'].value_counts())

job_encoding
6    21626
1     9555
4     9281
2     9030
3     6685
5     3759
Name: count, dtype: int64


In [None]:
# 무교
religion_none = ['agnosticism', 'atheism', 
                 'atheism and laughing about it', 'atheism but not too serious about it',
                 'atheism and somewhat serious about it', 'agnosticism but not too serious about it',
                 'agnosticism and laughing about it', 'agnosticism and somewhat serious about it',
                 'atheism and very serious about it', 'agnosticism and very serious about it',
                 ]
# 종교 신자
religion = ['other', 'catholicism but not too serious about it',
            'other and laughing about it', 'christianity',
            'christianity but not too serious about it', 'other but not too serious about it',
            'judaism but not too serious about it', 'catholicism',
            'christianity and somewhat serious about it', 'other and somewhat serious about it',
            'catholicism and laughing about it', 'judaism and laughing about it',
            'buddhism but not too serious about it', 'judaism',
            'christianity and very serious about it', 'catholicism and somewhat serious about it',
            'other and very serious about it', 'buddhism and laughing about it',
            'buddhism', 'christianity and laughing about it',
            'buddhism and somewhat serious about it', 'judaism and somewhat serious about it',
            'hinduism but not too serious about it', 'hinduism', 'catholicism and very serious about it',
            'buddhism and very serious about it', 'hinduism and somewhat serious about it',
            'islam', 'hinduism and laughing about it', 'islam but not too serious about it',
            'islam and somewhat serious about it', 'judaism and very serious about it',
            'islam and laughing about it', 'hinduism and very serious about it', 'islam and very serious about it']
# religion 컬럼 값 통합
df['religion'] = df['religion'].replace(religion_none, 'no_religion')
df['religion'] = df['religion'].replace(religion, 'religion')
# 범주형 변수 원-핫 인코딩
df = pd.get_dummies(df, columns=['religion'], drop_first=False)


status_encoding
1    55687
0     4249
Name: count, dtype: int64


In [67]:
# status 컬럼 인코딩

df['status_encoding'] = np.select(
    [df['status'] == 'single'], 
    [1],
    default=0
)
# status_encoding 컬럼의 value_count() 출력
print(df['status_encoding'].value_counts())

status_encoding
1    55687
0     4249
Name: count, dtype: int64


In [68]:
 # age 이상치 제거 (70세 이상) 
df = df[df["age"].between(18, 70)].copy()

# 연령대별 그룹핑
df["age_group"] = (df["age"] // 10 * 10).astype(int)
print(df["age_group"].value_counts())

age_group
20    27817
30    19841
40     7337
50     2860
60     1159
10      920
Name: count, dtype: int64


In [69]:
# diet 컬럼 인코딩

df['diet_encoding'] = np.select(
    [((df['diet'] == 'vegetarian') | (df['diet'] == 'mostly vegetarian') | (df['diet'] == 'strictly vegetarian') |\
      (df['diet'] == 'vegan') | (df['diet'] == 'mostly vegan') | (df['diet'] == 'strictly vegan'))], 
    [1],
    default=0
)

# diet_encoding 컬럼의 value_count() 출력
print(df['diet_encoding'].value_counts())

diet_encoding
0    54247
1     5687
Name: count, dtype: int64


In [70]:
# 전체 데이터 리스트화
all = df['ethnicity'].value_counts().index.tolist()
# mixed가 아닌거 리스트
mixed_none = ['white', 'asian', 'black', 'hispanic / latin', 'other']
# mixed
remaining = [item for item in all if item not in mixed_none]
# mixed로 통합
df['ethnicity'] = df['ethnicity'].replace(remaining, 'mixed')
# 범주형 변수 원-핫 인코딩
df = pd.get_dummies(df, columns=['ethnicity'], drop_first=False)

In [71]:
# pets 컬럼 인코딩 -> dogs_encoding, cats_encoding

df['dogs_encoding'] = np.select(
    [((df['pets'] == 'has dogs') | (df['diet'] == 'likes dogs and likes cats') |\
      (df['pets'] == 'likes dogs and has cats') | (df['diet'] == 'likes dogs') |\
      (df['pets'] == 'has dogs and has cats') | (df['diet'] == 'has dogs and dislikes cats') |\
      (df['pets'] == 'has dogs and likes cats') | (df['diet'] == 'likes dogs and dislikes cats'))], 
    [1],
    default=0
)

df['cats_encoding'] = np.select(
    [((df['pets'] == 'has cats') | (df['diet'] == 'likes dogs and likes cats') |\
      (df['pets'] == 'likes dogs and has cats') | (df['diet'] == 'likes cats') |\
      (df['pets'] == 'has dogs and has cats') | (df['diet'] == 'has dogs and likes cats') |\
      (df['pets'] == 'dislikes dogs and likes cats') | (df['diet'] == 'dislikes dogs and has cats'))], 
    [1],
    default=0
)

# dogs_encoding 컬럼과 cats_encoding 컬럼의 value_count() 출력
print(df['dogs_encoding'].value_counts())
print(df['cats_encoding'].value_counts())

dogs_encoding
0    47682
1    12252
Name: count, dtype: int64
cats_encoding
0    52503
1     7431
Name: count, dtype: int64


In [72]:
# city 추출
df["location"] = df["location"].astype("string").str.strip().str.lower()
df["city"] = df["location"].str.split(",").str[0].str.strip()

# 지역 매핑
SF = {"san francisco"}

PENINSULA_SOUTH_BAY = {
    "daly city", "south san francisco", "san bruno", "millbrae", "san mateo",
    "burlingame", "foster city", "belmont", "san carlos", "redwood city",
    "redwood shores", "menlo park", "atherton", "east palo alto",
    "palo alto", "mountain view", "stanford", "brisbane", "colma", "hillsborough",
    "pacifica", "half moon bay", "montara", "moss beach", "el granada"
}

EAST_BAY = {
    "oakland", "west oakland", "berkeley", "alameda", "emeryville", "albany",
    "hayward", "san leandro", "san lorenzo", "castro valley", "union city", "fremont",
    "richmond", "point richmond", "el cerrito", "san pablo", "el sobrante", "pinole", "hercules",
    "walnut creek", "pleasant hill", "martinez", "lafayette", "orinda", "moraga", "concord",
    "piedmont", "kensington", "pacheco", "rodeo", "crockett", "port costa", "oakley"
}

NORTH_BAY = {
    "sausalito", "mill valley", "tiburon", "belvedere tiburon", "larkspur", "corte madera",
    "kentfield", "fairfax", "san anselmo", "san rafael", "ross", "green brae",
    "marin city", "bolinas", "lagunitas", "woodacre", "forest knolls", "nicasio",
    "olema", "muir beach", "stinson beach", "novato",
    "petaluma", "rohnert park", "santa rosa",
    "vallejo", "benicia", "vacaville"
}

# 지역 매핑 함수
def map_bay_region(city: str) -> str:
    if pd.isna(city):
        return np.nan
    if city in SF:
        return "sf"
    if city in PENINSULA_SOUTH_BAY:
        return "south_bay"
    if city in EAST_BAY:
        return "east_bay"
    if city in NORTH_BAY:
        return "north_bay"
    return "outside"

df["location_group"] = df["city"].apply(map_bay_region)

# 원-핫 인코딩
df = pd.get_dummies(df, columns=["location_group"], drop_first=False)

In [None]:
# 전처리 끝난 df를 CSV로 저장 
df.to_csv("./data/okcupid_preprocessed.csv", encoding="utf-8-sig")