참고 : https://dacon.io/competitions/official/235867/codeshare/3888?page=1&dtype=recent

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# 미리 정의된 스일(테마)를 적용
plt.style.use('seaborn')
sns.set(font_scale=1.5)

# 결측치 데이터를 파악하는데 직관적인 도움을 주는 패키지
import missingno as msno

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [22]:
# 모델링 
from catboost import CatBoostRegressor

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error

import random
import optuna
from optuna.samplers import TPESampler

In [23]:
# 파일 불러오기
df_train = pd.read_csv("./Data/post/train.csv")
df_test = pd.read_csv("./Data/post/test.csv")

In [24]:
# 데이터 확인 : Data Frame 
df_train.sample(5)

Unnamed: 0,index,송하인_격자공간고유번호,수하인_격자공간고유번호,물품_카테고리,운송장_건수
21498,21498,5013000858004300,4122000018100300,농산물,6
24308,24308,5011000078068400,1135000012049100,농산물,5
31421,31421,5011000078068400,4812700029054100,농산물,3
17689,17689,5013000513092400,4723000665078400,농산물,19
14854,14854,5013000582084100,4150000121043400,농산물,5


In [25]:
# 데이터 확인 : Data Info (test) - 결측치 없음 
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31684 entries, 0 to 31683
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         31684 non-null  int64 
 1   송하인_격자공간고유번호  31684 non-null  int64 
 2   수하인_격자공간고유번호  31684 non-null  int64 
 3   물품_카테고리       31684 non-null  object
 4   운송장_건수        31684 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 1.2+ MB


In [26]:
# 데이터 확인 : Data Info (test) - 결측치 없음 
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         7920 non-null   int64 
 1   송하인_격자공간고유번호  7920 non-null   int64 
 2   수하인_격자공간고유번호  7920 non-null   int64 
 3   물품_카테고리       7920 non-null   object
dtypes: int64(3), object(1)
memory usage: 247.6+ KB


#### Data Preprocessing

In [27]:
# 데이터 전처리 : 필요 없는 'index' 처리하기 

df_train.drop(['index'], axis=1, inplace=True)
df_test.drop(['index'], axis=1, inplace= True)

# 확인 
print(df_train.columns)
print(df_test.columns)

Index(['송하인_격자공간고유번호', '수하인_격자공간고유번호', '물품_카테고리', '운송장_건수'], dtype='object')
Index(['송하인_격자공간고유번호', '수하인_격자공간고유번호', '물품_카테고리'], dtype='object')


In [28]:
# column명 변경 
df_train.columns = ["송하인_", "수하인_", "물품", "운송장"]
df_test.columns = ["송하인_", "수하인_", "물품"]

In [29]:
# 데이터 전처리 - 물품_카테고리 (한글 encoding)
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df_train['물품'])

df_train['물품'] = encoder.transform(df_train['물품'])
df_test['물품'] = encoder.transform(df_test['물품'])

In [30]:
# 확인 
df_train.head(3)

Unnamed: 0,송하인_,수하인_,물품,운송장
0,5011000595017300,2871000192069300,67,3
1,4148000690043300,5011000264024400,34,3
2,5011000078068400,1120000007005400,27,3


##### 변수 나누기 
송하인, 수취인이 고유 번호이며 각 위치별로 특징을 가지고 있음 

In [31]:
# 변수 나누는 코드 : slice 해서 사용하려고 했으나, 1-9 숫자로 정해진 것은 아니라 함수를 생성 한듯 

def numround(number, digit):
  num=[]
  while(number!=0):
    num.append(number % 10)
    number = number //10

  return int(num[-digit])

In [32]:
# 변수 : 송하인 (1-5, 6-9, 10, 11-16)나눠서 변수 생성 
# 변수 : 수하인 격자공간고유번호는 자릿수 별로 변수를 생성

for i in tqdm(range(16)):
  df_train[f'송하인_{i+1}'] = 0
  df_train[f'수하인_{i+1}'] = 0
  df_test[f'송하인_{i+1}'] = 0
  df_test[f'수하인_{i+1}'] = 0
  for j in range(df_train.shape[0]):
    df_train.loc[j,f'송하인_{i+1}']=numround(df_train.loc[j,'송하인_'],i+1)
    df_train.loc[j,f'수하인_{i+1}']=numround(df_train.loc[j,'수하인_'],i+1)

  for j in range(df_test.shape[0]):
    df_test.loc[j,f'송하인_{i+1}']=numround(df_test.loc[j,'송하인_'],i+1)
    df_test.loc[j,f'수하인_{i+1}']=numround(df_test.loc[j,'수하인_'],i+1)

100%|██████████| 16/16 [05:12<00:00, 19.54s/it]


In [33]:
# 송하인 구간 나눠주기
df_train["송하인_1~5"]=df_train['송하인_1']+df_train['송하인_2']+df_train['송하인_3']+df_train['송하인_4']+df_train['송하인_5']
df_train['송하인_6~9']=df_train['송하인_6']+df_train['송하인_7']+df_train['송하인_8']+df_train['송하인_9']
df_train['송하인_10']=df_train['송하인_10']
df_train['송하인_11~16']=df_train['송하인_11']+df_train['송하인_12']+df_train['송하인_13']+df_train['송하인_14']+df_train['송하인_15']+df_train['송하인_16']

df_test['송하인_1~5']=df_test['송하인_1']+df_test['송하인_2']+df_test['송하인_3']+df_test['송하인_4']+df_test['송하인_5']
df_test['송하인_6~9']=df_test['송하인_6']+df_test['송하인_7']+df_test['송하인_8']+df_test['송하인_9']
df_test['송하인_10']=df_test['송하인_10']
df_test['송하인_11~16']=df_test['송하인_11']+df_test['송하인_12']+df_test['송하인_13']+df_test['송하인_14']+df_test['송하인_15']+df_test['송하인_16']

In [34]:
# 필요 없는 컬럼 삭제 

df_train.drop(['수하인_','송하인_','송하인_1','송하인_2','송하인_3','송하인_4','송하인_5','송하인_6','송하인_7',
            '송하인_8','송하인_9','송하인_11','송하인_12','송하인_13','송하인_14','송하인_15','송하인_16'],axis=1,inplace=True)
df_test.drop(['수하인_','송하인_','송하인_1','송하인_2','송하인_3','송하인_4','송하인_5','송하인_6','송하인_7',
            '송하인_8','송하인_9','송하인_11','송하인_12','송하인_13','송하인_14','송하인_15','송하인_16'],axis=1,inplace=True)

In [35]:
# 연속형 변수가 아닌 범주형 변수로 변경 

for col in df_test.columns:
  df_train[col]=df_train[col].astype('category')
  df_test[col]=df_test[col].astype('category')

#### Modeling

In [36]:
# model에 사용하기 위한 데이터 

X = df_train.drop(['운송장'],axis=1)
y = df_train['운송장']
X_test = df_test.copy()

In [37]:
# 데이터 확인 

X.head(3)

Unnamed: 0,물품,수하인_1,수하인_2,수하인_3,수하인_4,수하인_5,수하인_6,수하인_7,수하인_8,수하인_9,...,수하인_10,수하인_11,수하인_12,수하인_13,수하인_14,수하인_15,수하인_16,송하인_1~5,송하인_6~9,송하인_11~16
0,67,2,8,7,1,0,0,0,1,9,...,2,0,6,9,3,0,0,7,14,11
1,34,5,0,1,1,0,0,0,2,6,...,4,0,2,4,4,0,0,17,15,10
2,27,1,1,2,0,0,0,0,0,0,...,7,0,0,5,4,0,0,7,7,18


In [38]:
y.head(3)

0    3
1    3
2    3
Name: 운송장, dtype: int64

In [39]:
X_test.head(3)

Unnamed: 0,물품,수하인_1,수하인_2,수하인_3,수하인_4,수하인_5,수하인_6,수하인_7,수하인_8,수하인_9,...,수하인_10,수하인_11,수하인_12,수하인_13,수하인_14,수하인_15,수하인_16,송하인_1~5,송하인_6~9,송하인_11~16
0,47,5,0,1,1,0,0,0,4,3,...,5,0,1,4,1,0,0,18,12,8
1,12,5,0,1,1,0,0,0,1,7,...,2,0,3,4,4,0,0,13,0,5
2,88,5,0,1,1,0,0,0,3,6,...,1,0,9,7,3,0,0,9,9,15
