# Data

In [1]:
import pandas as pd
import re

df = pd.read_csv('/Users/baby_cow/Desktop/AI/ML_competition/data/geocoded_addresses_5.csv') # 데이터 가져오기

# 데이터 전처리
df['계약년'] = df['계약년월'].map(lambda x : int(str(x)[:4]))
df['계약월'] = df['계약년월'].map(lambda x : int(str(x)[4:]))
df['전용면적'] = df['전용면적(㎡)']
df = df.drop(['계약년월', '전용면적(㎡)', '본번', '부번', 'k-단지분류(아파트,주상복합등등)', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형', 'k-난방방식', 'k-건설사(시공사)', 'k-시행사', 'k-사용검사일-사용승인일', 'k-수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '단지신청일', 'address', 'address_apt', 'address_apt_sqm' ], axis=1)
df = df.rename(columns=lambda x: re.sub('k-', '', x))

for col in df.columns:
    if df[col].dtype == 'object':
        df = df.drop(columns=[col], axis=1)

train_df = df.query("is_test == 0")
test_df = df.query("is_test == 1")

train_df = train_df.drop(columns=['is_test'], axis=1)
test_df = test_df.drop(columns=['is_test'], axis=1)

# Model

In [2]:
import lightgbm
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import random
import os

# 시드 고정 함수
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
    lightgbm.seed = seed

set_seed(42)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [3]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Initialize and train the model with early stopping and verbose evaluation
model = lgb.LGBMRegressor(**params, n_estimators=5000, random_state=42)

# Train & Test

In [4]:
from sklearn.model_selection import train_test_split

X = train_df.drop(columns=['target'], axis=1)
y = train_df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train,
            eval_set = [(X_val, y_val)],
            callbacks = [lgb.log_evaluation(1), lgb.early_stopping(10)])

pred = model.predict(X_val)

mse = mean_squared_error(y_val, pred)
rmse = np.sqrt(mse)

print(rmse)

[1]	valid_0's rmse: 46606.3
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 46564.5
[3]	valid_0's rmse: 46522.7
[4]	valid_0's rmse: 46480.9
[5]	valid_0's rmse: 46439.3
[6]	valid_0's rmse: 46403.1
[7]	valid_0's rmse: 46363.4
[8]	valid_0's rmse: 46321.9
[9]	valid_0's rmse: 46280.4
[10]	valid_0's rmse: 46238.9
[11]	valid_0's rmse: 46197.5
[12]	valid_0's rmse: 46156.2
[13]	valid_0's rmse: 46114.9
[14]	valid_0's rmse: 46073.8
[15]	valid_0's rmse: 46032.6
[16]	valid_0's rmse: 45991.4
[17]	valid_0's rmse: 45950.3
[18]	valid_0's rmse: 45909.3
[19]	valid_0's rmse: 45868.2
[20]	valid_0's rmse: 45827.2
[21]	valid_0's rmse: 45786.1
[22]	valid_0's rmse: 45745.1
[23]	valid_0's rmse: 45704.1
[24]	valid_0's rmse: 45663.5
[25]	valid_0's rmse: 45624.7
[26]	valid_0's rmse: 45584.1
[27]	valid_0's rmse: 45545.6
[28]	valid_0's rmse: 45505.1
[29]	valid_0's rmse: 45464.6
[30]	valid_0's rmse: 45424.2
[31]	valid_0's rmse: 45383.7
[32]	valid_0's rmse: 45345.2
[33]	valid_0's rmse:

# Create submit csv

In [6]:
test_df = test_df.drop(columns=['target'], axis=1)

In [7]:
test_pred = model.predict(test_df)

submit = pd.DataFrame(columns=['target'], data=test_pred)
submit['target'] = submit['target'].astype(int)
submit

Unnamed: 0,target
0,110342
1,137394
2,137726
3,126565
4,111538
...,...
9267,60917
9268,60683
9269,64018
9270,60794


In [8]:
submit.to_csv('submit.csv', index=False)