In [1]:
import matplotlib as plt
plt.rc('axes', unicode_minus=False) # 마이너스 기호 깨짐 방지
# 한글 폰트 설치
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf
plt.rc('font', family='NanumBarunGothic') # 폰트 적용

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
DATA_PATH = "/content/drive/MyDrive/income_predict/data/"

In [4]:
import numpy as np
import pandas as pd

train = pd.read_csv(f"{DATA_PATH}train_data.csv") # 학습데이터
test = pd.read_csv(f"{DATA_PATH}test_data.csv") # 테스트 데이터

train.shape, test.shape

((20000, 23), (10000, 22))

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
# 연속형 변수, 이산형 변수 추출
train_int = train[train.describe().columns]
train_str = train[train.columns.difference(train.describe().columns)]

test_int = test[test.describe().columns]
test_str = test[test.columns.difference(test.describe().columns)]

train_int.shape, test_int.shape

((20000, 6), (10000, 5))

In [30]:
train_ft = train[~(train['Occupation_Status'] == 'Unknown') &
                 ~(train['Occupation_Status'] == 'Armed Forces')]
test_ft = test[~(test['Occupation_Status'] == 'Unknown')]

train_ft = train_ft.reset_index(drop = True)
test_ft = test_ft.reset_index(drop = True)

# train_ft에 대한 target데이터 생성
target = train_ft['Occupation_Status'].copy()

# test_ft에 대한 target 데이터 (후에 검증을 위해 target을 이용하는 것이다.)
target_tmp = test_ft['Occupation_Status'].copy()

In [31]:
train_ft = train_ft.drop(columns=['ID', 'Occupation_Status','Income'])
test_ft = test_ft.drop(columns=['ID', 'Occupation_Status'])

train_ft.shape,test_ft.shape

((15311, 20), (7684, 20))

In [32]:
# 모델을 통한 학습 진행
# from catboost import CatBoostClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel,SelectKBest,f_classif
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

In [33]:
# 원핫인코딩 실시

from sklearn.preprocessing import OneHotEncoder
cols = ['Birth_Country', 'Birth_Country (Father)', 'Birth_Country (Mother)',
       'Citizenship', 'Education_Status', 'Employment_Status', 'Gender',
       'Hispanic_Origin', 'Household_Status', 'Household_Summary',
       'Industry_Status', 'Martial_Status',
       'Income_Status', 'Race', 'Tax_Status']
enc = OneHotEncoder(handle_unknown = 'ignore')

# 학습 데이터
tmp = pd.DataFrame(
    enc.fit_transform(train_ft[cols]).toarray(),
    columns = enc.get_feature_names_out()
)

train_ft = pd.concat([train_ft,tmp],axis=1).drop(columns=cols)
# 테스트 데이터
tmp = pd.DataFrame(
    enc.transform(test_ft[cols]).toarray(),
    columns = enc.get_feature_names_out()
)

test_ft = pd.concat([test_ft,tmp],axis=1).drop(columns=cols)

In [34]:
# RobustScaler

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

train_scaled = scaler.fit_transform(train_ft)
test_scaled = scaler.transform(test_ft)

train_data = pd.DataFrame(train_scaled, columns = train_ft.columns)
test_data = pd.DataFrame(test_scaled, columns = test_ft.columns)

train_data.shape,test_data.shape

((15311, 256), (7684, 256))

In [35]:
# 정답 데이터

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target_le = le.fit_transform(target)
target_le

array([10, 10,  0, ...,  0, 11,  0])

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=38)

In [57]:
feature_names = train_data.columns.tolist()
feature_names_cleaned = [name.replace('[', '_').replace(']', '_').replace('<', '_') for name in feature_names]

# XGBClassifier를 적용하기 위해 feature의 name을 제가공 해주는 것이다.
train_data.columns = feature_names_cleaned

In [48]:
smote = SMOTE(random_state=42)
train_data_resampled, target_resampled = smote.fit_resample(train_data, target)

In [53]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target_le = le.fit_transform(target_resampled)
target_le

array([10, 10,  0, ..., 12, 12, 12])

In [62]:
params = {'learning_rate': 0.195402303483054, 'max_depth': 4, 'n_estimators': 258, 'subsample': 0.9524929252371209, 'colsample_bytree': 0.9143575738120333}
xgb_model = XGBClassifier(random_state = 42)

scores = cross_val_score(xgb_model,train_data_resampled,target_le,cv = cv,scoring = 'f1_macro',n_jobs = -1)
np.mean(scores)
# 이 xgb_model을 바탕으로 unknown에 대해서 예측을 시도

0.6704132203537957

In [60]:
lgb_model = LGBMClassifier(random_state = 38)

scores = cross_val_score(lgb_model,train_data_resampled,target_le,cv = cv,scoring = 'f1_macro',n_jobs = -1)
np.mean(scores)

0.6794405516879649

In [63]:
# 예측 시작

train_tmp = train[(train['Occupation_Status'] == 'Unknown')]
test_tmp = test[(test['Occupation_Status'] == 'Unknown')]

# train_ft의 인덱스를 재설정
train_tmp = train_tmp.reset_index(drop = True)

# test_ft에 대한 인덱스를 재설정해주는 것이다.
test_tmp = test_tmp.reset_index(drop = True)

In [64]:
# 여기에 붙여야 함

train_sample = train[(train['Occupation_Status'] == 'Unknown')].copy()
test_sample = test[(test['Occupation_Status'] == 'Unknown')].copy()

# train_ft의 인덱스를 재설정
train_sample = train_sample.reset_index(drop = True)

# test_ft에 대한 인덱스를 재설정해주는 것이다.
test_sample = test_sample.reset_index(drop = True)

In [65]:
train_tmp = train_tmp.drop(columns=['ID', 'Occupation_Status','Income'])
test_tmp = test_tmp.drop(columns=['ID', 'Occupation_Status'])

train_tmp.shape,test_tmp.shape

((4688, 20), (2316, 20))

In [66]:
train_sample = train_sample.drop(columns=['Occupation_Status'])
test_sample = test_sample.drop(columns=['Occupation_Status'])

train_sample.shape,test_sample.shape

((4688, 22), (2316, 21))

In [67]:
# 원핫인코딩 실시

# 학습 데이터
tmp = pd.DataFrame(
    enc.transform(train_tmp[cols]).toarray(),
    columns = enc.get_feature_names_out()
)

train_tmp = pd.concat([train_tmp,tmp],axis=1).drop(columns=cols)
# 테스트 데이터
tmp = pd.DataFrame(
    enc.transform(test_tmp[cols]).toarray(),
    columns = enc.get_feature_names_out()
)

test_tmp = pd.concat([test_tmp,tmp],axis=1).drop(columns=cols)

In [68]:
train_tmp.shape, test_tmp.shape

((4688, 256), (2316, 256))

In [69]:
# RobustScaler

train_scaled = scaler.transform(train_tmp)
test_scaled = scaler.transform(test_tmp)

train_data = pd.DataFrame(train_scaled, columns = train_tmp.columns)
test_data = pd.DataFrame(test_scaled, columns = test_tmp.columns)

train_data.shape,test_data.shape

((4688, 256), (2316, 256))

In [70]:
feature_names = train_data.columns.tolist()
feature_names_cleaned = [name.replace('[', '_').replace(']', '_').replace('<', '_') for name in feature_names]

# XGBClassifier를 적용하기 위해 feature의 name을 제가공 해주는 것이다.
train_data.columns = feature_names_cleaned

In [72]:
lgb_model.fit(train_data_resampled,target_le)

In [73]:
train_pred = lgb_model.predict(train_data)
train_pred

array([12,  7, 10, ..., 10, 10,  0])

In [74]:
feature_names = test_data.columns.tolist()
feature_names_cleaned = [name.replace('[', '_').replace(']', '_').replace('<', '_') for name in feature_names]

# XGBClassifier를 적용하기 위해 feature의 name을 제가공 해주는 것이다.
test_data.columns = feature_names_cleaned

In [76]:
test_pred = lgb_model.predict(test_data)
test_pred

array([ 0, 10,  3, ..., 10, 10,  3])

In [None]:
train_sample['Occupation_Status'] = le.inverse_transform(train_pred)
train_sample.head()

In [78]:
test_sample['Occupation_Status'] = le.inverse_transform(test_pred)
test_sample.head()

Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Race,Hispanic_Origin,Martial_Status,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Occupation_Status
0,TEST_0000,79,M,High Junior,Children or Armed Forces,0,Not in universe or children,White,All other,Single,...,Native,US,Unknown,Unknown,Single,0,0,0,Under Median,Admin Support (include Clerical)
1,TEST_0001,47,M,Elementary (5-6),Children or Armed Forces,0,Not in universe or children,White,Other Spanish,Single,...,Native,US,US,US,Nonfiler,0,0,0,Under Median,Services
2,TEST_0004,6,M,Children,Children or Armed Forces,0,Not in universe or children,White,Mexican-American,Single,...,Native,US,US,US,Nonfiler,0,0,0,Under Median,Handlers/Cleaners
3,TEST_0007,11,M,Children,Children or Armed Forces,0,Not in universe or children,White,All other,Single,...,Native,US,US,US,Nonfiler,0,0,0,Under Median,Handlers/Cleaners
4,TEST_0008,71,M,Elementary (5-6),Children or Armed Forces,0,Not in universe or children,White,All other,Married,...,Native,US,US,US,Married Filling Jointly both over 65 (MFJ),0,0,0,Under Median,Transportation


In [79]:
# 하나의 데이터를 생성!
train_ft_2 = train[~(train['Occupation_Status'] == 'Unknown')]
test_ft_2 = test[~(test['Occupation_Status'] == 'Unknown')]

# train_ft의 인덱스를 재설정
train_ft_2 = train_ft_2.reset_index(drop = True)

# train_ft에 대한 target데이터 생성
target = train_ft_2['Occupation_Status'].copy()

# test_ft에 대한 인덱스를 재설정해주는 것이다.
test_ft_2 = test_ft_2.reset_index(drop = True)

# test_ft에 대한 target 데이터 (후에 검증을 위해 target을 이용하는 것이다.)
target_tmp = test_ft_2['Occupation_Status'].copy()

In [80]:
train_ft_2.shape,test_ft_2.shape

((15312, 23), (7684, 22))

In [81]:
train_sample.shape,test_sample.shape

((4688, 23), (2316, 22))

In [82]:
train_ft_2 = train_ft_2.drop(columns=['Occupation_Status'])
test_ft_2 = test_ft_2.drop(columns=['Occupation_Status'])

train_ft_2.shape,test_ft_2.shape

((15312, 22), (7684, 21))

In [83]:
train_ft_2['Occupation_Status'] = target
test_ft_2['Occupation_Status'] = target_tmp

train_ft_2.shape,test_ft_2.shape

((15312, 23), (7684, 22))

In [118]:
train_ft = pd.concat([train_sample,train_ft_2]).sort_values(by = 'ID')
test_ft = pd.concat([test_sample,test_ft_2]).sort_values(by = 'ID')

train_ft.shape,test_ft.shape

((20000, 23), (10000, 22))

In [86]:
train_ft.to_csv(f'{DATA_PATH}train_data_2.csv', index=False)
test_ft.to_csv(f'{DATA_PATH}test_data_2.csv', index=False)

In [120]:
train[train['Industry_Status'] == 'Education']['Employment_Status'].value_counts()

Children or Armed Forces         552
Full-Time                        415
Choice Part-Time                  49
Part-Time (Usually Part-Time)      9
Seeking Part-Time                  8
Seeking Full-Time                  5
Not Working                        2
Part-Time (Usually Full-Time)      1
Name: Employment_Status, dtype: int64

In [117]:
train[train['Industry_Status'] == 'Retail'].groupby('Employment_Status').agg('mean')

  train[train['Industry_Status'] == 'Retail'].groupby('Employment_Status').agg('mean')


Unnamed: 0_level_0,Age,Working_Week (Yearly),Gains,Losses,Dividends,Income
Employment_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Children or Armed Forces,32.235904,41.523655,151.164614,35.063513,62.702528,523.869734
Choice Part-Time,30.153153,41.09009,320.288288,56.81982,55.864865,480.522523
Full-Time,32.518492,42.780939,230.792319,27.273826,83.449502,543.510669
Not Working,27.333333,21.666667,0.0,0.0,0.0,0.0
Part-Time (Usually Full-Time),36.846154,45.692308,363.769231,132.230769,0.0,538.846154
Part-Time (Usually Part-Time),38.5,42.0,0.0,0.0,21.666667,1414.583333
Seeking Full-Time,30.228571,26.114286,84.6,0.0,0.0,143.085714
Seeking Part-Time,27.85,30.25,116.45,0.0,0.0,351.75


In [103]:
# 'Industry_Status'가 'Not in universe or children'이고 'Employment_Status'가 'Children or Armed Forces'인 행 선택
mask = (train_ft['Industry_Status'] == 'Not in universe or children') & (train_ft['Employment_Status'] == 'Children or Armed Forces')

# 선택된 행의 'Industry_Status' 값을 'Children or Armed Forces'로 변경
train_ft.loc[mask, 'Industry_Status'] = 'Children or Armed Forces'

In [108]:
# 'Industry_Status'가 'Not in universe or children'이고 'Employment_Status'가 'Children or Armed Forces'인 행 선택
mask = (train_ft['Industry_Status'] == 'Not in universe or children') & (train_ft['Employment_Status'] == 'Not Working')

# 선택된 행의 'Industry_Status' 값을 'Children or Armed Forces'로 변경
train_ft.loc[mask, 'Industry_Status'] = 'Not Working'

In [114]:
# 'Industry_Status'가 'Not in universe or children'이고 'Employment_Status'가 'Children or Armed Forces'인 행 선택
mask = (train_ft['Industry_Status'] == 'Not in universe or children') & (train_ft['Employment_Status'] == 'Seeking Full-Time')

# 선택된 행의 'Industry_Status' 값을 'Children or Armed Forces'로 변경
train_ft.loc[mask, 'Industry_Status'] = 'Seeking Full-Time'

In [115]:
# 'Industry_Status'가 'Not in universe or children'이고 'Employment_Status'가 'Children or Armed Forces'인 행 선택
mask = (train_ft['Industry_Status'] == 'Not in universe or children') & (train_ft['Employment_Status'] == 'Seeking Part-Time')

# 선택된 행의 'Industry_Status' 값을 'Children or Armed Forces'로 변경
train_ft.loc[mask, 'Industry_Status'] = 'Seeking Part-Time'

In [134]:
# 'Industry_Status'가 'Not in universe or children'이고 'Employment_Status'가 'Children or Armed Forces'인 행 선택
mask = (train_ft['Employment_Status'] == 'Children or Armed Forces') & (train_ft['Age'] < 18)

# 선택된 행의 'Industry_Status' 값을 'Children or Armed Forces'로 변경
train_ft.loc[mask, 'Industry_Status'] = 'Children'

In [135]:
train_ft['Industry_Status'].value_counts()

Retail                                          3149
Children                                        2464
Not in universe or children                     2224
Manufacturing (Durable)                         1575
Manufacturing (Non-durable)                     1223
Education                                       1041
Business & Repair                                847
Medical (except Hospitals)                       838
Construction                                     832
Hospitals                                        821
Finance Insurance & Real Estate                  727
Transportation                                   693
Public Administration                            641
Other professional services                      477
Wholesale                                        450
Personal Services (except Private Household)     429
Social Services                                  367
Entertainment                                    278
Agriculture                                   

In [138]:
train_ft[train_ft['Age'] <= 18]['Employment_Status'].value_counts()

Children or Armed Forces    2734
Full-Time                    367
Not Working                  161
Choice Part-Time              38
Seeking Part-Time             15
Seeking Full-Time              6
Name: Employment_Status, dtype: int64