In [1]:
import numpy as np
import pandas as pd
import torch
import random

random.seed(777)
torch.manual_seed(777)
torch.cuda.manual_seed_all(777)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
label = pd.read_csv('test_label.csv')
submit = pd.read_csv('sample_submit.csv')

In [3]:
train

Unnamed: 0,배출년도,배출월,배출일,시군구명,배출량비율(%),배출횟수,배출횟수비율(%),세대수,남성인구,여성인구,외국인인구,배출량(kg)
0,2018,4,1,화성시,4.50,48118,4.21,283903.0,373610,348610,37023,102691.10
1,2018,4,1,포천시,4.20,3881,3.77,69028.0,80120,72190,12189,8113.30
2,2018,4,1,평택시,4.25,4326,3.88,208004.0,249525,237215,21774,9130.35
3,2018,4,1,파주시,3.94,14968,3.66,182723.0,224181,217436,10667,29605.85
4,2018,4,1,이천시,4.38,7491,3.98,87948.0,109076,104172,6854,18488.45
...,...,...,...,...,...,...,...,...,...,...,...,...
17011,2019,12,31,남양주시,2.83,8274,2.87,276656.0,349664,352166,7477,15732.85
17012,2019,12,31,군포시,2.82,3334,2.92,110005.0,136878,138974,7010,5522.85
17013,2019,12,31,구리시,3.41,5831,3.01,79870.0,98968,100297,1490,24028.00
17014,2019,12,31,과천시,3.02,175,3.16,21467.0,28638,29651,173,227.70


In [4]:
from sklearn.preprocessing import LabelEncoder
classle = LabelEncoder()
if test['시군구명'].dtype == 'object' and train['시군구명'].dtype == 'object':
    classle.fit(pd.concat([train['시군구명'], test['시군구명']]))
    train['시군구명'] = classle.transform(train['시군구명'])
    test['시군구명'] = classle.transform(test['시군구명'])

수정 및추가된 부분
====

일자별 음식물쓰레기 배출량에 있어서 중요한 것은 주기성에 관한 feature들을 남기는 것이 더 효과적인 예측을 할 수 있다고 생각하였습니다. 그래서 배출년도에 관한 데이터는 오히려 정확한 예측을 방해한다고 판단하였기 때문에 train 및 test에서 제거하였습니다.

In [5]:
train_y = train['배출량(kg)']
train_x = train.drop(['배출량(kg)','배출년도'], axis =1)
test_x = test.drop(['배출년도'], axis =1)

In [6]:
train.시군구명.unique()

array([27, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 12, 10, 13, 11,  8,
        9,  7,  6,  5,  4,  3,  2,  0, 14,  1, 26])

추가된 부분
====
각 시군별 데이터 중에서 특정 날의 데이터셋으로는 알 수 없는 요인들로 인해서 특정 날의 배출과 관련된 데이터가 변할 수 있다고 생각했기 때문에 각 시군별 데이터 중 너무 많이 배출되었거나 너무 적게 배출된 경우의 데이터를 제거하여 학습을 진행할 경우 더욱 정확한 예측이 가능하다고 판단하였기 때문에 다음과 같은 코드를 작성하였습니다.

어느 정도까지 outlier로 보고 제거를 할 것인지 직접 범위를 바꾸면서 예측결과를 보면서 진행하였습니다.  
1. 25/75
2. 10/90
3. 5/95

이렇게 세가지 경우로 진행한 결과 2번째 경우에서 가장 높은 성능 개선이 나타나 2번 값을 가지고 이상치 제거를 진행 한 후 학습을 하였습니다.

In [7]:
def outlier_iqr(data):
  q1, q3 = np.percentile(data, [10, 90])
  iqr = q3-q1
  lower = q1- (iqr*1.5)
  upper = q3 + (iqr*1.5)  
  return np.where((data > upper) | (data < lower))

In [8]:
outlier0 = outlier_iqr(train_x.loc[train_x['시군구명']==0]['배출량비율(%)'])[0]

In [9]:
for i in range (1,28):
    outlier1 = outlier_iqr(train_x.loc[train_x['시군구명']==i]['배출량비율(%)'])[0]
    outlier0 = np.concatenate([outlier0, outlier1], axis=0)

In [10]:
outlier2 = outlier_iqr(train_x.loc[train_x['시군구명']==0]['배출횟수비율(%)'])[0]

In [11]:
for i in range (1,28):
    outlier3 = outlier_iqr(train_x.loc[train_x['시군구명']==i]['배출횟수비율(%)'])[0]
    outlier2 = np.concatenate([outlier2, outlier3], axis=0)

In [12]:
outlier4 = outlier_iqr(train_x.loc[train_x['시군구명']==0]['배출횟수'])[0]

In [13]:
for i in range (1,28):
    outlier5 = outlier_iqr(train_x.loc[train_x['시군구명']==i]['배출횟수'])[0]
    outlier4 = np.concatenate([outlier4, outlier5], axis=0)

In [14]:
outlier_index = set(np.concatenate([outlier0, outlier2, outlier4], axis=0))

In [15]:
for i in outlier_index:
  train_x = train_x.drop(index=i, axis = 0)
  train_y= train_y.drop(index=i, axis = 0)

In [16]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
train_x_sc = sc.fit_transform(train_x)
test_x_sc= sc.transform(test_x)

In [17]:
train_x_tensor = torch.FloatTensor(np.array(train_x_sc)).to(device)
test_x_tensor = torch.FloatTensor(np.array(test_x_sc)).to(device)
train_y_tensor = torch.FloatTensor(np.array(train_y.values).reshape(-1,1)).to(device)

#train_x_tensor = torch.FloatTensor(np.array(train_x)).to(device)
#test_x_tensor = torch.FloatTensor(np.array(test)).to(device)
#train_y_tensor = torch.FloatTensor(np.array(train_y.values).reshape(-1,1)).to(device)

In [18]:
class NN(torch.nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.linear1 = torch.nn.Linear(10, 56, bias = True)
        self.linear2 = torch.nn.Linear(56, 56, bias = True)
        self.linear3 = torch.nn.Linear(56, 56, bias = True)
        self.linear4 = torch.nn.Linear(56, 56, bias = True)
        self.linear5 = torch.nn.Linear(56, 56, bias = True)
        self.linear6 = torch.nn.Linear(56, 56, bias = True)
        self.linear7 = torch.nn.Linear(56, 1, bias = True)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
    
    def forward(self, x):
        out = self.linear1(x)
        out = self.relu(out)
        out = self.linear2(out)
        out = self.relu(out)
        out = self.linear3(out)
        out = self.relu(out)
        out = self.linear4(out)
        out = self.relu(out)
        out = self.linear5(out)
        out = self.relu(out)
        out = self.linear6(out)
        out = self.relu(out)
        out = self.linear7(out)
        return out
    
model = NN().to(device)

In [19]:
loss = torch.nn.MSELoss().to(device) #회귀 이므로 mseloss 사용
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [20]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [21]:
from sklearn.metrics import r2_score

model.train()
train_epoch = 5000
for epoch in range(1,train_epoch + 1):
    optimizer.zero_grad()
    hypothesis = model(train_x_tensor)
    
    cost = loss(hypothesis, train_y_tensor)
    cost.backward()
    optimizer.step()
    if epoch % 1000 == 0:
        model.eval()
        with torch.no_grad():
            test_prediction = model(test_x_tensor)
            train_prediction = model(train_x_tensor)
            accuracy = r2_score(train_y, train_prediction.detach().cpu())
            accuracy2 = r2_score(label['배출량(kg)'], test_prediction.detach().cpu())
            RMSE_train = mean_absolute_error(train_y, train_prediction.detach().cpu())
            RMSE_test = mean_absolute_error(label['배출량(kg)'], test_prediction.detach().cpu())
            print('train_acc:', '{:.9f}'.format(accuracy), ' val_acc:','{:.9f}'.format(accuracy2), 'train_rmse:', '{:.9f}'.format(RMSE_train), ' val_rmse:','{:.9f}'.format(RMSE_test))

train_acc: 0.993187643  val_acc: 0.987498059 train_rmse: 1290.353835903  val_rmse: 1971.287116375
train_acc: 0.997285212  val_acc: 0.992448728 train_rmse: 723.796098453  val_rmse: 1413.000001221
train_acc: 0.998344891  val_acc: 0.993853318 train_rmse: 599.186779755  val_rmse: 1267.825769741
train_acc: 0.998691648  val_acc: 0.993421230 train_rmse: 552.010905380  val_rmse: 1310.843645533
train_acc: 0.998877684  val_acc: 0.994644118 train_rmse: 508.370121208  val_rmse: 1165.956021744


In [22]:
model.eval()
with torch.no_grad():
    prediction = model(test_x_tensor)

In [23]:
submit['배출량(kg)'] = prediction.detach().cpu().numpy()

In [24]:
submit

Unnamed: 0.1,Unnamed: 0,배출량(kg)
0,0,94268.000000
1,1,2277.103027
2,2,6252.050293
3,3,7408.812988
4,4,25426.091797
...,...,...
7445,7445,21827.689453
7446,7446,7387.768555
7447,7447,24365.363281
7448,7448,402.174866


In [25]:
submit.to_csv('Baseline_DNN2.csv', index= False) #2429