In [None]:
import pandas as pd

# Load the raw training data. The path has no directory component, so it is
# resolved against the notebook's current working directory.
train = pd.read_csv("BDA_train.csv")

In [None]:
# Separate the features (X) from the target column (y). errors='ignore' lets the
# drop succeed even when 'ID' or 'withdrawal' is absent from the frame.
# NOTE(review): X is built before the feature-engineering cells below run, so it
# does not contain any column they add — confirm this is intended.
X = train.drop(columns=['ID', 'withdrawal'], errors='ignore')
y = train['withdrawal']

In [None]:
# Remove this set of columns (including the 'ID' identifier) from the working frame.
# Without errors='ignore', re-running this cell on the already-trimmed frame raises KeyError.
train = train.drop(
    columns=[
        'school1',
        'contest_award',
        'nationality',
        'hope_for_group',
        'completed_semester',
        'project_type',
        'certificate_study_period',
        'incumbents_level',
        'incumbents_company_level',
        'onedayclass_topic',
        'previous_class_3',
        'previous_class_4',
        'previous_class_5',
        'previous_class_6',
        'previous_class_7',
        'contest_participitation',
        'idea_contest',
        'generation',
        'incumbents_lecture_scale_reason',
        'interested_company',
        'class1',
        'ID',
    ]
)

In [None]:
# Remove a second set of columns from the working frame.
# Without errors='ignore', re-running this cell on the already-trimmed frame raises KeyError.
train = train.drop(
    columns=[
        'class2',
        'class3',
        'class4',
        'major_field',
        'incumbents_lecture',
        'incumbents_lecture_type',
        'incumbents_lecture_scale',
    ]
)

In [None]:
def map_job(job):
    """Collapse a free-text job answer into one of five coarse labels.

    Missing values, and answers that contain none of the keywords, are
    labelled '기타'. Keyword groups are tried in a fixed priority order and the
    first group with a substring hit decides the label.

    Args:
        job: The job answer (a string) or a missing value.

    Returns:
        One of '대학생', '대학원생', '직장인', '취준생', '기타'.
    """
    if pd.isna(job):
        return '기타'

    # (label, keywords) pairs in priority order.
    rules = (
        ('대학생', ('대학생',)),
        ('대학원생', ('대학원',)),
        ('직장인', ('직장',)),
        ('취준생', ('취준', '준비')),
    )
    for label, keywords in rules:
        if any(keyword in job for keyword in keywords):
            return label
    return '기타'

# Normalise the raw job text (trim whitespace, lower-case) before keyword matching.
train['job'] = train['job'].str.strip().str.lower()
train['job_mapped'] = train['job'].apply(map_job)

# One-hot encode the coarse job label. The dummies must be built from `train`:
# it is the only frame defined at this point, and any other name raises NameError.
job_dummies = pd.get_dummies(train['job_mapped'], prefix='job')
train = pd.concat([train, job_dummies], axis=1)

# The text columns are no longer needed once the dummy columns are attached.
train = train.drop(columns=['job', 'job_mapped'])


In [None]:
def map_time_input(val):
    """Bucket a numeric time value into three ordinal bins.

    Args:
        val: A number that supports comparison with integers.

    Returns:
        0 when val <= 3 (3 hours or less), 1 when 3 < val < 6 (under 6 hours),
        2 otherwise (6 hours or more). NaN fails both comparisons and therefore
        also lands in bin 2.
    """
    if val <= 3:
        return 0
    return 1 if val < 6 else 2

# Overwrite time_input with its bin code (0/1/2). The column is replaced in place,
# so running this cell a second time would bin the already-binned codes.
train['time_input'] = train['time_input'].apply(map_time_input)

In [None]:
def map_reason_category(text):
    """Map a free-text reason to an integer category code by keyword.

    The input is stringified and lower-cased, then keyword groups are tried in
    priority order; the first substring hit wins.

    Args:
        text: The answer; any value is accepted because it is passed through str().

    Returns:
        An int from 0 to 5 for the matching keyword group, or 6 when nothing
        matches (not applicable). A missing value becomes the text 'nan' and
        therefore returns 6.
    """
    lowered = str(text).lower()

    # (keywords, code) pairs in priority order.
    rules = (
        (('혜택', '가입'), 0),
        (('어려워서',), 1),
        (('강의',), 2),
        (('시간',), 3),
        (('만족',), 4),
        (('운영',), 5),
    )
    for keywords, code in rules:
        if any(keyword in lowered for keyword in keywords):
            return code
    return 6  # not applicable

# Overwrite whyBDA with its integer reason code; the original text is not kept.
train['whyBDA'] = train['whyBDA'].apply(map_reason_category)


In [None]:
def map_inflow_route(value):
    """Map a free-text inflow route to an integer channel code.

    The input is stringified and lower-cased; the first channel whose keyword
    list has a substring hit decides the code.

    Args:
        value: The answer; any value is accepted because it is passed through str().

    Returns:
        0 for SNS / community, 1 for acquaintance or internal referral,
        2 for contest / extracurricular sites, 3 for search,
        4 for anything else (including missing values).
    """
    route = str(value).lower()

    # Index in this tuple == returned code.
    channel_keywords = (
        ('에브리타임', '인스타그램', '교내 플랫폼'),              # SNS / community
        ('지인', '기존 학회원', '운영진'),                      # acquaintance or internal
        ('대외활동', '링커리어', '캠퍼스픽', '캠퍼즈', '위비티'),   # contest / activity sites
        ('검색',),                                           # search
    )
    return next(
        (
            code
            for code, keywords in enumerate(channel_keywords)
            if any(keyword in route for keyword in keywords)
        ),
        4,  # other / no information
    )

# Overwrite inflow_route with its integer channel code; the original text is not kept.
train['inflow_route'] = train['inflow_route'].apply(map_inflow_route)

In [None]:
# Double-major flag: '아니오' (no) -> 0, '예' (yes) -> 1. Any other value,
# including a missing one, is unmapped by .map and becomes 2 via fillna.
# NOTE(review): the re_registration mapping later in this notebook spells "no"
# as '아니요'; confirm which spelling each column actually contains.
train['major type'] = train['major type'].map({
    '아니오': 0,
    '예': 1
}).fillna(2).astype(int)

In [None]:
def map_desired_certificate(text):
    """Return 1 when the answer names a desired certificate, otherwise 0.

    Args:
        text: The free-text answer, or a missing value.

    Returns:
        0 for missing or blank answers, for a lone placeholder mark ('x' or '-'),
        and for answers containing a "none / undecided" keyword; 1 for
        everything else.
    """
    if pd.isna(text):
        return 0  # missing is treated as "none"

    answer = str(text).strip().lower()

    # Blank answers and lone placeholder marks mean "none". They are compared
    # against the whole answer: matched as substrings, 'x' and '-' would also
    # hit real certificate names such as "Excel", "Linux" or "AWS-SAA".
    if answer in ('', 'x', '-'):
        return 0

    # Phrases that signal "none / undecided" wherever they appear in the answer.
    none_keywords = ('없음', '아직', '모름', '미정', '모르겠', '불확실', '고민', '생각없음', '취득 원하지 않음')
    if any(keyword in answer for keyword in none_keywords):
        return 0

    # Anything else is taken to mention a certificate.
    return 1
# Overwrite desired_certificate with its 0/1 flag; the original text is not kept.
train['desired_certificate'] = train['desired_certificate'].apply(map_desired_certificate)

In [None]:
def map_desired_job_num(text):
    """Map a free-text desired job to an integer category code (0-9).

    The answer is lower-cased and keyword groups are tried in order; the first
    group with a substring hit decides the code, so earlier groups take
    priority when an answer mentions several.

    Args:
        text: The free-text answer, or a missing value.

    Returns:
        The index of the first matching keyword group (0-8), or 9 for missing
        values and answers that match nothing.
    """
    if pd.isna(text):
        return 9  # other

    answer = str(text).lower()

    # Index in this tuple == returned code.
    keyword_groups = (
        ('데이터 분석', '데이터 사이언티스트', '데이터저널리스트'),
        ('인공지능', '머신러닝', 'ai', 'nlp'),
        ('엔지니어', '반도체', '공정', '설비', '자동차', '품질', '환경안전', '화학공학'),
        ('연구자', '연구원', '자연과학', '사회과학', '문화유산', '대학원'),
        ('경영컨설턴트', '컨설팅', 'pm', '기획', '비즈니스', '애널리스트'),
        ('마케터', '마케팅', '영업'),
        ('소프트웨어 개발자', '웹디자인', 'ux', 'ui'),
        ('금융', '펀드', '애널리틱스'),
        ('인사', 'hr'),
    )
    for code, keywords in enumerate(keyword_groups):
        if any(keyword in answer for keyword in keywords):
            return code
    return 9

# Add desired_job_num as the integer category of desired_job; the text column is kept.
train['desired_job_num'] = train['desired_job'].apply(map_desired_job_num)

In [None]:
def map_desired_job_except_data(text):
    """Map a free-text desired job to an integer category code (0-12).

    The answer is lower-cased and the rules are tried in order; the first rule
    with a substring hit decides the code.

    Args:
        text: The free-text answer, or a missing value.

    Returns:
        The code of the first matching rule (0-11), or 12 for missing values,
        explicit "none / don't know" answers and anything unmatched.
    """
    if pd.isna(text):
        return 12  # other / don't know

    answer = str(text).lower()

    # (keywords, code) pairs in priority order.
    rules = (
        (('개발', '보안', 'ai', '디지털기획', '정보보안'), 0),
        (('자동차', '반도체'), 1),
        (('바이오',), 2),
        (('금융', '보험', '회계', '세무', '퀀트'), 3),
        (('기획', '전략', '마케팅', '광고', 'md'), 4),
        (('영업',), 5),
        (('물류', '무역'), 6),
        (('인사', 'hr'), 7),
        (('제조', '생산', '품질'), 8),
        (('법무', '사무', '총무'), 9),
        (('공공기관',), 10),
        (('r&d',), 11),
        # Explicit "none" answers share code 12 with the unmatched fallback.
        (('없음', '모르겠', '-'), 12),
    )
    for keywords, code in rules:
        if any(keyword in answer for keyword in keywords):
            return code
    return 12

In [None]:
def map_expected_domain(text):
    """Map a free-text expected domain to an integer category code (0-10).

    The answer is lower-cased and the rules are tried in order; the first rule
    with a substring hit decides the code.

    Args:
        text: The free-text answer, or a missing value.

    Returns:
        The code of the first matching rule (0-9), or 10 for missing values,
        explicit "none / undecided" answers and anything unmatched.
    """
    if pd.isna(text):
        return 10

    answer = str(text).lower()

    # (keywords, code) pairs in priority order.
    rules = (
        (('ai', '소프트웨어', '자연어', '이미지 분석'), 0),
        (('금융',), 1),
        (('반도체', '제조', '공정', '배터리', '전기차', '철강'), 2),
        (('의료', '헬스', '제약바이오'), 3),
        (('환경', '에너지', '석유화학'), 4),
        (('사회문제',), 5),
        (('비즈니스', '경영'), 6),
        (('교통', '물류', '유통'), 7),
        (('스포츠', 'e스포츠', '엔터', '게임', '콘텐츠', '문화예술', '문화콘텐츠', '음반', '웹툰', '패션'), 8),
        (('교육',), 9),
        # Explicit "none" answers share code 10 with the unmatched fallback.
        (('없음', '미정', '-', '잘모르겠'), 10),
    )
    for keywords, code in rules:
        if any(keyword in answer for keyword in keywords):
            return code
    return 10
        
# Add expected_domain_num as the integer category of expected_domain; the text column is kept.
train['expected_domain_num'] = train['expected_domain'].apply(map_expected_domain)

In [None]:
def map_major_field(major):
    """Collapse an exact major name into a coarse field label.

    Matching is by exact equality, not by substring.

    Args:
        major: The major name, or a missing value.

    Returns:
        One of 'IT', '자연과학', '의약학', '경영/경제', '사회/인문', '인문',
        '예체능', or '기타/없음' for missing and unrecognised values.
    """
    if pd.isna(major):
        return '기타/없음'

    # Exact major name -> coarse field label.
    field_by_major = {
        'IT(컴퓨터 공학 포함)': 'IT',
        '자연과학': '자연과학',
        '의약학': '의약학',
        '경영학': '경영/경제',
        '경제통상학': '경영/경제',
        '사회과학': '사회/인문',
        '법학': '사회/인문',
        '교육학': '사회/인문',
        '인문학': '인문',
        '예체능': '예체능',
    }
    return field_by_major.get(major, '기타/없음')

# Add major1_1_field as the coarse field label of major1_1; the source column is kept.
train['major1_1_field'] = train['major1_1'].apply(map_major_field)

In [None]:
# Integer code for each coarse field label.
major_field_mapping = {
    'IT': 0,
    '자연과학': 1,
    '의약학': 2,
    '경영/경제': 3,
    '사회/인문': 4,
    '인문': 5,
    '예체능': 6,
    '기타/없음': 7
}

# Series.map yields NaN for any label that is not a key of the mapping.
train['major1_1_field_num'] = train['major1_1_field'].map(major_field_mapping)

In [None]:
def map_major_field_2(major):
    """Collapse an exact major name into a coarse field label.

    Matching is by exact equality. Missing values and the literal '없음' are
    labelled '기타/없음', as is any unrecognised name.

    Args:
        major: The major name, or a missing value.

    Returns:
        One of 'IT', '자연과학', '의약학', '경영/경제', '사회/인문', '인문',
        '예체능', '기타/없음'.
    """
    if pd.isna(major) or major == '없음':
        return '기타/없음'

    # (field label, exact major names) pairs.
    groups = (
        ('IT', ('IT(컴퓨터 공학 포함)',)),
        ('자연과학', ('자연과학',)),
        ('의약학', ('의약학',)),
        ('경영/경제', ('경영학', '경제통상학')),
        ('사회/인문', ('사회과학', '법학', '교육학')),
        ('인문', ('인문학',)),
        ('예체능', ('예체능',)),
    )
    for field, members in groups:
        if major in members:
            return field
    return '기타/없음'

# Add major1_2_field as the coarse field label of the major1_2 column.
train['major1_2_field'] = train['major1_2'].apply(map_major_field_2)

# Integer code for each coarse field label. This re-declares the name with the
# same contents as the mapping used for major1_1 earlier in the notebook.
major_field_mapping = {
    'IT': 0,
    '자연과학': 1,
    '의약학': 2,
    '경영/경제': 3,
    '사회/인문': 4,
    '인문': 5,
    '예체능': 6,
    '기타/없음': 7
}

# Series.map yields NaN for any label that is not a key of the mapping.
train['major1_2_field_num'] = train['major1_2_field'].map(major_field_mapping)

In [None]:
def cert_binary_mapping(text):
    """Return 1 when the answer indicates a held certificate, otherwise 0.

    The input is stringified and lower-cased. A "preparing / none" marker
    anywhere in the text forces 0, even if a certificate name is also present.

    Args:
        text: The answer; any value is accepted because it is passed through str().

    Returns:
        0 if the text contains '준비' or '없음', 1 if it otherwise contains one
        of the certificate keywords, and 0 when nothing matches.
    """
    lowered = str(text).lower()

    # "Preparing" / "none" overrides any certificate mention.
    if any(marker in lowered for marker in ('준비', '없음')):
        return 0

    cert_keywords = (
        '컴퓨터활용능력', 'sqld', 'adsp', '정보처리기사', '구글 애널리스트',
        '태블로', '빅데이터 분석 기사', 'aws', '기타',
    )
    # Unclear answers (no keyword at all) count as "no certificate".
    return 1 if any(keyword in lowered for keyword in cert_keywords) else 0

# Add certificate_binary as the 0/1 flag derived from certificate_acquisition; the text column is kept.
train['certificate_binary'] = train['certificate_acquisition'].apply(cert_binary_mapping)

In [None]:
import pandas as pd

def classify_career_path(text):
    """Classify a free-text career plan into an integer code.

    Checks run in priority order on the lower-cased, stripped answer:
    undecided wording, then "grad school and job together", then start-up,
    then the miscellaneous keywords, and only then a single grad-school or
    job mention.

    Args:
        text: The free-text answer, or a missing value.

    Returns:
        0 = employment, 1 = graduate school, 2 = undecided, 3 = start-up,
        4 = other.
    """
    if pd.isna(text):
        return 2  # undecided

    answer = str(text).lower().strip()

    def mentions(*keywords):
        # True when any keyword occurs as a substring of the answer.
        return any(keyword in answer for keyword in keywords)

    wants_grad_school = mentions('대학원')
    wants_job = mentions('취업')

    # Undecided wording wins over everything; naming both grad school and a job
    # is also treated as undecided.
    if mentions('고민', '미정', '아직') or (wants_grad_school and wants_job):
        return 2

    if mentions('창업', '스타트업'):
        return 3  # start-up

    if mentions('-', '싸피', '재직중', '직장인', '편입', '고시준비'):
        return 4  # other

    if wants_grad_school:
        return 1  # graduate school

    # A lone job mention is employment; anything left over is "other".
    return 0 if wants_job else 4

# Add desired_career_path_mapped as the integer class of desired_career_path; the text column is kept.
train['desired_career_path_mapped'] = train['desired_career_path'].apply(classify_career_path)

In [None]:
# Yes/no encoding for re_registration: '예' (yes) -> 1, '아니요' (no) -> 0.
mapping = {
    '예': 1,
    '아니요': 0
}

# Series.map yields NaN for any value that is not a key of the mapping
# (missing values, or a differently spelled answer).
# NOTE(review): the 'major type' mapping earlier in this notebook spells "no"
# as '아니오'; confirm which spelling this column actually contains.
train['re_registration_mapped'] = train['re_registration'].map(mapping)

In [None]:
def split_labels(value):
    """Expand a combined answer into the list of individual labels it stands for.

    Args:
        value: A single answer value; compared by exact equality.

    Returns:
        The four core labels for '위 4항목 전부' or '모두', the two contest /
        project labels for '공모전, 프로젝트 경험 둘 다', and otherwise a
        one-element list holding the value unchanged.
    """
    if value == '공모전, 프로젝트 경험 둘 다':
        return ['공모전 경험', '프로젝트 경험']
    if value not in ('위 4항목 전부', '모두'):
        return [value]
    return ['프로젝트 경험', '데이터 분석 역량', '공모전 경험', '인적 네트워크']

# Build one 0/1 indicator column per label: 1 when the label appears in the list
# that split_labels returns for the row's what_to_gain value, else 0.
# The lambda reads `label` while the current iteration's apply() runs, so each
# column is computed with its own label.
for label in ['프로젝트 경험', '데이터 분석 역량', '공모전 경험', '인적 네트워크', '자격증 공부', '파이썬 실력', '기타']:
    train['what_to_gain_' + label] = train['what_to_gain'].apply(lambda x: int(label in split_labels(x)))

In [None]:
def map_major_type_reverse(x):
    """Encode a major-type answer as an integer code.

    Args:
        x: The answer (any value), or a missing value.

    Returns:
        2 for missing values, 0 when the text mentions a multiple major
        ('복수', '다중', '이중', '포함'), 1 when it mentions '단일' (single
        major), and -1 for any other value.
    """
    # Missing values must be detected on the raw input: str() turns NaN into
    # the text 'nan', after which pd.isna can never be True.
    if pd.isna(x):
        return 2  # none / missing

    label = str(x)
    if any(keyword in label for keyword in ('복수', '다중', '이중', '포함')):
        return 0  # multiple major
    if '단일' in label:
        return 1  # single major
    return -1  # unrecognised value
     
# Add major_type_num as the integer code of the 'major type.1' column.
# `train` is the only DataFrame this notebook defines, so the column is read
# from and written to it.
# NOTE(review): 'major type.1' looks like pandas' automatic name for a second
# 'major type' header in the CSV — confirm against the raw file.
train['major_type_num'] = train['major type.1'].apply(map_major_type_reverse)

In [None]:
# Head-count and mean withdrawal for each one-hot job group. The dummy columns
# are created on `train`, which is the only DataFrame this notebook defines.
# NOTE(review): a 'job_기타' group is not listed here — confirm that is intended.
job_columns = ['job_대학생', 'job_대학원생', 'job_직장인', 'job_취준생']

for col in job_columns:
    # Rows that belong to this job group.
    subset = train[train[col] == 1]
    # The mean of an empty subset is NaN, which is printed as 'nan'.
    withdrawal_rate = subset['withdrawal'].mean()
    count = subset.shape[0]
    print(f"{col} → 인원수: {count}명, 이탈률: {withdrawal_rate:.3f}")

In [None]:
# Rebind `train` to the contents of a preprocessed CSV, discarding the frame
# built by the cells above.
# NOTE(review): no cell visible in this notebook writes "BDA_train_전처리.csv";
# confirm the file exists before this cell runs, or read_csv will fail.
train = pd.read_csv("BDA_train_전처리.csv")
