In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# 노트북 안에 그래프를 그리기 위해
%matplotlib inline

# 그래프에서 격자로 숫자 범위가 눈에 잘 띄도록 ggplot 스타일을 사용
plt.style.use('ggplot')

# 그래프에서 마이너스 폰트 깨지는 문제에 대한 대처
mpl.rcParams['axes.unicode_minus'] = False

import warnings
warnings.filterwarnings('ignore')

In [2]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [3]:
# 날짜 처리
data = pd.read_csv('201901-202003.csv')
data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [4]:
# 데이터 정제
df = data.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df = df.groupby(columns).sum().reset_index(drop=False)

## 정제한 데이터 프레임 내보내기

In [5]:
df.to_csv("data_filter.csv", encoding='utf-8-sig')

## 새로 만든애로 불러오기(훨씬빠름)

In [7]:
df = pd.read_csv("data_filter.csv")

In [8]:
from sklearn.preprocessing import LabelEncoder
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

## 명목형 변수 카테고리화 시키기

In [9]:
categorical_features = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']

for i in categorical_features:
    df_num[i] = df_num[i].astype('category')

In [10]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057394 entries, 0 to 1057393
Data columns (total 12 columns):
 #   Column        Non-Null Count    Dtype   
---  ------        --------------    -----   
 0   Unnamed: 0    1057394 non-null  int64   
 1   CARD_SIDO_NM  1057394 non-null  category
 2   STD_CLSS_NM   1057394 non-null  category
 3   HOM_SIDO_NM   1057394 non-null  category
 4   AGE           1057394 non-null  category
 5   SEX_CTGO_CD   1057394 non-null  category
 6   FLC           1057394 non-null  category
 7   year          1057394 non-null  category
 8   month         1057394 non-null  category
 9   CSTMR_CNT     1057394 non-null  int64   
 10  AMT           1057394 non-null  int64   
 11  CNT           1057394 non-null  int64   
dtypes: category(8), int64(4)
memory usage: 40.3 MB


In [9]:
# feature, target 설정
train_num = df_num.sample(frac=1, random_state=0)
X_train = train_num.drop([ 'AMT', 'CNT','CSTMR_CNT'], axis=1)
y_train = train_num['AMT']

In [11]:
# 예측 템플릿 만들기
CARD_SIDO_NMs = df_num['CARD_SIDO_NM'].unique()
STD_CLSS_NMs  = df_num['STD_CLSS_NM'].unique()
HOM_SIDO_NMs  = df_num['HOM_SIDO_NM'].unique()
AGEs          = df_num['AGE'].unique()
SEX_CTGO_CDs  = df_num['SEX_CTGO_CD'].unique()
FLCs          = df_num['FLC'].unique()
years         = [2020]
months        = [4, 7]

temp = []
for CARD_SIDO_NM in CARD_SIDO_NMs:
    for STD_CLSS_NM in STD_CLSS_NMs:
        for HOM_SIDO_NM in HOM_SIDO_NMs:
            for AGE in AGEs:
                for SEX_CTGO_CD in SEX_CTGO_CDs:
                    for FLC in FLCs:
                        for year in years:
                            for month in months:
                                temp.append([CARD_SIDO_NM, STD_CLSS_NM, HOM_SIDO_NM, AGE, SEX_CTGO_CD, FLC, year, month])
temp = np.array(temp)
temp = pd.DataFrame(data=temp, columns=X_train.columns)

In [12]:
data.head()

Unnamed: 0,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,year,month
0,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4,2019,1
1,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,1,2,7,1374500,8,2019,1
2,강원,강릉시,건강보조식품 소매업,강원,강릉시,30s,2,2,6,818700,6,2019,1
3,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,3,4,1717000,5,2019,1
4,강원,강릉시,건강보조식품 소매업,강원,강릉시,40s,1,4,3,1047300,3,2019,1


In [13]:
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month
0,0,0,0,1,1,1,2020,4
1,0,0,0,1,1,1,2020,7
2,0,0,0,1,1,2,2020,4
3,0,0,0,1,1,2,2020,7
4,0,0,0,1,1,3,2020,4
...,...,...,...,...,...,...,...,...
1658855,16,30,14,0,2,3,2020,7
1658856,16,30,14,0,2,4,2020,4
1658857,16,30,14,0,2,4,2020,7
1658858,16,30,14,0,2,5,2020,4


## CNT 예측먼저

In [14]:
X_train = train_num.drop([ 'AMT', 'CNT','CSTMR_CNT'], axis=1)
y_train = train_num['CNT']

In [16]:
y_train.value_counts()

3         81540
4         49625
6         31018
5         28620
7         27672
          ...  
51752         1
55850         1
180096        1
376672        1
16884         1
Name: CNT, Length: 40062, dtype: int64

In [18]:
y_train

149245        5
554956        6
918516    15027
425414      164
640949        9
          ...  
359783      530
152315        5
963395       15
117952        6
305711       11
Name: CNT, Length: 1057394, dtype: object

In [19]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1057394 entries, 149245 to 305711
Data columns (total 8 columns):
 #   Column        Non-Null Count    Dtype   
---  ------        --------------    -----   
 0   CARD_SIDO_NM  1057394 non-null  category
 1   STD_CLSS_NM   1057394 non-null  category
 2   HOM_SIDO_NM   1057394 non-null  category
 3   AGE           1057394 non-null  category
 4   SEX_CTGO_CD   1057394 non-null  category
 5   FLC           1057394 non-null  category
 6   year          1057394 non-null  category
 7   month         1057394 non-null  category
dtypes: category(8)
memory usage: 16.1 MB


## test에 CNT 학습해 넣기

In [21]:
from sklearn.ensemble import RandomForestRegressor

y_train = y_train.astype('str')

lr_clf = RandomForestRegressor()

lr_clf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [22]:
pred = lr_clf.predict(temp)

In [23]:
pred

array([ 3.62,  3.37,  7.71, ..., 15.12, 14.83, 15.36])

In [24]:
len(pred)

1658860

In [25]:
len(temp)

1658860

In [33]:
temp['CNT'] = np.round(pred,0).astype('int64')

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24697792 entries, 0 to 24697791
Data columns (total 13 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   CARD_SIDO_NM  object
 1   CARD_CCG_NM   object
 2   STD_CLSS_NM   object
 3   HOM_SIDO_NM   object
 4   HOM_CCG_NM    object
 5   AGE           object
 6   SEX_CTGO_CD   int64 
 7   FLC           int64 
 8   CSTMR_CNT     int64 
 9   AMT           int64 
 10  CNT           int64 
 11  year          int64 
 12  month         int64 
dtypes: int64(7), object(6)
memory usage: 2.4+ GB


In [38]:
temp.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CNT
0,0,0,0,1,1,1,2020,4,4
1,0,0,0,1,1,1,2020,7,3
2,0,0,0,1,1,2,2020,4,8
3,0,0,0,1,1,2,2020,7,6
4,0,0,0,1,1,3,2020,4,10


## 이번엔 CSTMR? 이거 예측

In [39]:
X_train = train_num.drop([ 'AMT','CSTMR_CNT'], axis=1)
y_train = train_num['CSTMR_CNT']

In [40]:
from sklearn.ensemble import RandomForestRegressor

y_train = y_train.astype('str')

lr_clf2 = RandomForestRegressor()

lr_clf2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [41]:
pred2 = lr_clf2.predict(temp)

In [42]:
pred2

array([ 3.82,  3.  ,  7.59, ..., 21.06, 20.42, 20.28])

In [43]:
pred2 = np.round(pred2, 0)

In [44]:
pred2

array([ 4.,  3.,  8., ..., 21., 20., 20.])

In [45]:
temp['CSTMR_CNT'] = pred2

In [47]:
temp['CSTMR_CNT'] = temp['CSTMR_CNT'].astype('int64')

In [48]:
temp

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CNT,CSTMR_CNT
0,0,0,0,1,1,1,2020,4,4,4
1,0,0,0,1,1,1,2020,7,3,3
2,0,0,0,1,1,2,2020,4,8,8
3,0,0,0,1,1,2,2020,7,6,5
4,0,0,0,1,1,3,2020,4,10,8
...,...,...,...,...,...,...,...,...,...,...
1658855,16,30,14,0,2,3,2020,7,12,12
1658856,16,30,14,0,2,4,2020,4,15,21
1658857,16,30,14,0,2,4,2020,7,15,21
1658858,16,30,14,0,2,5,2020,4,15,20


## 만든 예측 탬플릿

In [49]:
temp.to_csv("submission_template.csv", encoding='utf-8-sig')