# **🏠 부동산 실거래가 Team 3 DTQ First Trial code**

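## Contents

- Step 1: Library Imports
- Step 2: Data Preprocessing Functions
- Step 3: Custom Dataset Class for PyTorch
- Step 4: Define the PyTorch Model
- Step 5: Training and Inference Functions
- Step 6: Putting It All Together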



## Step 1: Library Imports
- 필요한 라이브러리를 불러옵니다.

In [1]:
# Install eli5, pinned to version 0.13.0 (imported further down in this notebook).
!pip install eli5==0.13.0

# Install the Nanum font package, needed to use Korean fonts in plots.
!apt-get install -y fonts-nanum

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20180306-3).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the Nanum Gothic TTF file with matplotlib under the alias
# 'NanumBarunGothic', then make that alias the default font family.
fe = fm.FontEntry(
    name='NanumBarunGothic',  # alias this font is referred to by below
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf',  # path of the .ttf file
)
fm.fontManager.ttflist.insert(0, fe)  # put the entry first in matplotlib's font list
plt.rc('font', family='NanumBarunGothic')
plt.rcParams.update({'font.family': 'NanumBarunGothic', 'font.size': 10})  # default font settings
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

## Step 2: Data Preprocessing Functions

In [15]:
def one_hot_encode(df, columns):
    """Replace categorical columns with one-hot indicator columns.

    Args:
        df: Input DataFrame. It is not modified.
        columns: List of column names in ``df`` to encode.

    Returns:
        A new DataFrame containing the non-encoded columns of ``df`` followed
        by one indicator column per category, named by
        ``OneHotEncoder.get_feature_names_out``. Rows keep the index of ``df``.
    """
    # Use the encoder's default output and densify it here. The keyword that
    # requests dense output was renamed from ``sparse`` to ``sparse_output``
    # between scikit-learn releases, so passing neither works on both.
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(df[columns])
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # Reuse df's index: with a fresh RangeIndex the column-wise concat below
    # would align on labels and produce NaN-padded rows whenever df has been
    # filtered, shuffled or otherwise re-indexed.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(columns),
        index=df.index,
    )
    return pd.concat([df.drop(columns, axis=1), encoded_df], axis=1)

In [16]:
def impute_missing_values(df, strategy='median'):
    """Fill missing values column by column with scikit-learn's SimpleImputer.

    Args:
        df: Input DataFrame. It is not modified.
        strategy: Imputation strategy passed to ``SimpleImputer``.

    Returns:
        A new DataFrame of imputed values that keeps the row index of ``df``.
        Columns the imputer discards (those with no observed value at all)
        are absent from the result.
    """
    imputer = SimpleImputer(strategy=strategy)
    imputed = imputer.fit_transform(df)

    kept_columns = df.columns
    if imputed.shape[1] != len(df.columns):
        # The imputer dropped columns that were entirely missing; their fitted
        # statistic is NaN, which identifies them. Labelling the result with
        # all of df.columns would raise a shape-mismatch error.
        kept_columns = df.columns[~pd.isna(imputer.statistics_)]

    # Keep df's index so the result can be joined back to other frames.
    return pd.DataFrame(imputed, columns=kept_columns, index=df.index)


In [17]:
def smooth_ridit_transform(df, columns):
    """Replace each listed column by its percentile rank rescaled to (-1, 1].

    Every value ``v`` becomes ``2 * r - 1`` where ``r`` is the percentile rank
    of ``v`` within its column, as computed by ``Series.rank(pct=True)``.

    Args:
        df: DataFrame to transform. It is modified in place.
        columns: Iterable of column names to transform.

    Returns:
        The same ``df`` object, with the listed columns overwritten.
    """
    for col in columns:
        # Rank in the frame's own row order. Ranks do not depend on ordering,
        # so sorting first is unnecessary, and a sorted copy cannot be
        # assigned back when the index contains duplicate labels.
        df[col] = 2 * df[col].rank(pct=True) - 1
    return df


In [18]:
def bin_numerical_variables(df, columns, n_bins=5):
    """Add an ordinal quantile-bin column for each listed column.

    For every name in ``columns`` a new column ``<name>_bin`` is written to
    ``df`` holding the integer bin code produced by a quantile-strategy
    ``KBinsDiscretizer`` fitted on that column alone.

    Args:
        df: DataFrame to extend. It is modified in place.
        columns: Iterable of column names to discretize.
        n_bins: Number of bins requested from the discretizer.

    Returns:
        The same ``df`` object with the added ``*_bin`` columns.
    """
    binner = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=n_bins)
    for name in columns:
        # The discretizer is refitted for each column.
        bin_codes = binner.fit_transform(df[[name]])
        df[name + '_bin'] = bin_codes.astype(int)
    return df


In [19]:
def tfidf_transform(df, text_column):
    """Build a dense TF-IDF feature frame from one text column.

    The vectorizer uses 1- to 3-grams and scikit-learn's built-in English
    stop-word list.

    Args:
        df: DataFrame containing the text column. It is not modified.
        text_column: Name of the column holding the documents.

    Returns:
        A DataFrame with one column per vocabulary term and one row per row
        of ``df``, sharing the index of ``df``.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df[text_column])
    # Carry df's index over so the features can be joined back to df
    # row-for-row even when df does not have a default RangeIndex.
    return pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )


## Step 3: Custom Dataset Class for PyTorch

In [20]:
class RealEstateDataset(Dataset):
    """In-memory dataset of float32 feature rows paired with targets.

    Args:
        features: Array-like of numeric feature rows (anything accepted by
            ``torch.tensor``).
        targets: Array-like with one target per feature row, or ``None`` for
            unlabeled data such as a test set. When omitted, every sample is
            given a NaN placeholder target so that indexing still yields a
            ``(features, target)`` pair.

    Raises:
        ValueError: If ``features`` and ``targets`` have different lengths.
    """

    def __init__(self, features, targets=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if targets is None:
            # NaN rather than 0 so a placeholder can never be mistaken for a
            # real label if it is accidentally used in a loss.
            self.targets = torch.full(
                (len(self.features),), float('nan'), dtype=torch.float32
            )
        else:
            self.targets = torch.tensor(targets, dtype=torch.float32)
            if self.targets.shape[:1] != self.features.shape[:1]:
                raise ValueError(
                    "features and targets must have the same number of rows, got "
                    f"{tuple(self.features.shape)} and {tuple(self.targets.shape)}"
                )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.features[idx], self.targets[idx]


## Step 4: Define the PyTorch Model

In [21]:
class SimpleResidualNetwork(nn.Module):
    """One-hidden-layer regressor with a parallel linear skip path.

    The output is ``fc2(relu(fc1(x))) + residual(x)``: a small non-linear
    branch added to a plain linear map of the input, giving one value per row.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer. Defaults to 64.
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        # Linear shortcut from the raw input straight to the output.
        self.residual = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return predictions of shape ``(..., 1)`` for inputs ``(..., input_dim)``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden) + self.residual(x)


## Step 5: Training and Inference Functions

In [22]:
def train_model(model, dataloader, criterion, optimizer, num_epochs=25):
    """Train ``model`` and record the training RMSE of every epoch.

    Args:
        model: The ``nn.Module`` to train. It is updated in place.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer bound to the parameters of ``model``.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        A ``(model, rmse_history)`` tuple. ``rmse_history`` holds one float
        per epoch: the root mean squared error over every target value seen
        in that epoch, measured on the predictions made before each
        optimizer step. It is NaN for an epoch that yielded no batches.
    """
    # Send batches to the device holding the model's parameters, so the
    # function does not depend on a module-level device variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    model.train()
    rmse_history = []

    for epoch in range(num_epochs):
        squared_error_sum = 0.0
        n_values = 0
        for inputs, targets in dataloader:
            if model_device is not None:
                inputs, targets = inputs.to(model_device), targets.to(model_device)

            optimizer.zero_grad()
            outputs = model(inputs)
            # Give the targets the shape of the predictions. Comparing (B, 1)
            # outputs with (B,) targets would broadcast to (B, B) inside the
            # loss and optimize the wrong objective.
            targets = targets.reshape(outputs.shape)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Accumulate squared errors so the epoch metric is a true RMSE
            # over all samples, not an average of per-batch square roots
            # (which is biased and over-weights a short final batch).
            squared_error_sum += torch.sum((outputs.detach() - targets) ** 2).item()
            n_values += targets.numel()

        epoch_rmse = float(np.sqrt(squared_error_sum / n_values)) if n_values else float('nan')
        rmse_history.append(epoch_rmse)
        print(f"Epoch [{epoch+1}/{num_epochs}], RMSE: {epoch_rmse:.4f}")

    return model, rmse_history

def plot_rmse(rmse_history):
    """Show a line chart of the RMSE values, one point per epoch.

    Args:
        rmse_history: Sequence of RMSE values in epoch order.
    """
    plt.figure(figsize=(10, 5))
    axes = plt.gca()
    axes.plot(rmse_history, label='RMSE')
    axes.set_xlabel('Epoch')
    axes.set_ylabel('RMSE')
    axes.set_title('RMSE over Epochs')
    axes.legend()
    plt.show()

In [23]:
def predict(model, dataloader):
    """Run ``model`` over every batch and collect its outputs.

    Args:
        model: The ``nn.Module`` to evaluate. It is switched to eval mode.
        dataloader: Iterable yielding ``(inputs, targets)`` batches; the
            targets are ignored.

    Returns:
        A NumPy array with the model outputs of all batches stacked along the
        first axis, or an empty array when ``dataloader`` yields nothing.
    """
    # Send batches to the device holding the model's parameters, so the
    # function does not depend on a module-level device variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for inputs, _ in dataloader:
            if model_device is not None:
                inputs = inputs.to(model_device)
            batch_outputs.append(model(inputs).cpu().numpy())

    if not batch_outputs:
        return np.array([])
    # One concatenate over whole batches avoids building a Python list with
    # one tiny array per sample.
    return np.concatenate(batch_outputs, axis=0)


## Step 6: Putting It All Together

In [24]:
# Load train and test data
train_data = pd.read_csv('/data/ephemeral/home/train.csv')
test_data = pd.read_csv('/data/ephemeral/home/test.csv')

# Select only the required columns
categorical_columns = ['시군구', '번지', '아파트명']
numeric_columns = ['전용면적(㎡)', '계약년월', '건축년도']
feature_columns = categorical_columns + numeric_columns
selected_columns = feature_columns + ['target']
train_data = train_data[selected_columns]
test_data = test_data[feature_columns]

# Targets are median-imputed on their own and kept as an (n, 1) column so they
# match the model's (batch, 1) output; 1-D targets would broadcast to
# (batch, batch) inside MSELoss.
n_train = len(train_data)
train_targets = impute_missing_values(train_data[['target']]).values

# Data preprocessing
# Train and test rows are preprocessed together so both end up with the same
# one-hot columns in the same order. Encoding them separately produces
# different feature sets, and the trained model then cannot be applied to the
# test matrix. ignore_index gives the combined frame a clean 0..n-1 index.
# NOTE(review): '번지' and '아파트명' presumably have many distinct values, so
# the dense one-hot matrix may need a lot of memory; confirm on the full data.
combined = pd.concat([train_data[feature_columns], test_data], ignore_index=True)
combined = one_hot_encode(combined, columns=categorical_columns)
combined = impute_missing_values(combined)
combined = smooth_ridit_transform(combined, columns=numeric_columns)

train_data = combined.iloc[:n_train].reset_index(drop=True)
train_data['target'] = train_targets.ravel()
test_data = combined.iloc[n_train:].reset_index(drop=True)

# Prepare dataset for PyTorch
train_features = train_data.drop(columns=['target']).values
train_dataset = RealEstateDataset(train_features, train_targets)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_features = test_data.values
# The test set has no labels; pass placeholder targets so the dataset still
# yields (features, target) pairs, which is what predict() unpacks.
test_placeholder_targets = np.zeros((len(test_features), 1), dtype=np.float32)
test_dataset = RealEstateDataset(test_features, test_placeholder_targets)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize model, criterion, and optimizer
model = SimpleResidualNetwork(input_dim=train_features.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train model
trained_model, rmse_history = train_model(model, train_dataloader, criterion, optimizer, num_epochs=25)

# Plot RMSE history
plot_rmse(rmse_history)

# Make predictions
predictions = predict(trained_model, test_dataloader)

# Save predictions
np.savetxt('predictions.csv', predictions, delimiter=',')



: 