### Regularized Linear Regression Task
- 다이아몬드의 특성을 회귀 분석하여 가격을 예측한다.

In [1]:
import pandas as pd
import numpy as np

# Load the diamond data set from the local CSV file. The bare name on the
# last line is a notebook expression that renders the frame.
d_df = pd.read_csv('./datasets/diamond.csv')
d_df

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,53940,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64
53940,53941,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53941,53942,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43


In [2]:
# Remove the unneeded 'Unnamed: 0' column; d_df itself is left untouched.
pre_d_df = d_df.drop(columns=['Unnamed: 0'])
pre_d_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64
53940,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53941,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43


In [3]:
# Move the price values into a new trailing 'target' column and remove the
# original 'price' column, so the target ends up as the last column.
pre_d_df = pre_d_df.drop(columns=['price']).assign(target=pre_d_df['price'])
pre_d_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335
...,...,...,...,...,...,...,...,...,...,...
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,2757
53939,0.75,Ideal,D,SI2,62.2,55.0,5.83,5.87,3.64,2757
53940,0.71,Premium,E,SI1,60.5,55.0,5.79,5.74,3.49,2756
53941,0.71,Premium,F,SI1,59.8,62.0,5.74,5.73,3.43,2756


In [6]:
# Set aside the columns that will be label-encoded later.
label_columns = ['cut', 'color', 'clarity']

label_df = pre_d_df[label_columns]
label_df

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2
...,...,...,...
53938,Premium,H,SI2
53939,Ideal,D,SI2
53940,Premium,E,SI1
53941,Premium,F,SI1


In [8]:
# Remove the to-be-encoded columns so that only the remaining columns stay.
pre_d_df = pre_d_df.drop(columns=label_columns)
pre_d_df

Unnamed: 0,carat,depth,table,x,y,z,target
0,0.23,61.5,55.0,3.95,3.98,2.43,326
1,0.21,59.8,61.0,3.89,3.84,2.31,326
2,0.23,56.9,65.0,4.05,4.07,2.31,327
3,0.29,62.4,58.0,4.20,4.23,2.63,334
4,0.31,63.3,58.0,4.34,4.35,2.75,335
...,...,...,...,...,...,...,...
53938,0.86,61.0,58.0,6.15,6.12,3.74,2757
53939,0.75,62.2,55.0,5.83,5.87,3.64,2757
53940,0.71,60.5,55.0,5.79,5.74,3.49,2756
53941,0.71,59.8,62.0,5.74,5.73,3.43,2756


In [9]:
from sklearn.preprocessing import StandardScaler

# Build a new data set holding the standardized values of every column.
std = StandardScaler()
std.fit(pre_d_df)
result = std.transform(pre_d_df)
std_pre_d_df = pd.DataFrame(data=result, columns=pre_d_df.columns)
std_pre_d_df

Unnamed: 0,carat,depth,table,x,y,z,target
0,-1.198189,-0.174033,-1.099673,-1.587882,-1.536239,-1.571166,-0.904102
1,-1.240384,-1.360676,1.585457,-1.641372,-1.658821,-1.741217,-0.904102
2,-1.198189,-3.384949,3.375544,-1.498733,-1.457436,-1.741217,-0.903851
3,-1.071605,0.454189,0.242892,-1.365010,-1.317342,-1.287749,-0.902096
4,-1.029411,1.082412,0.242892,-1.240202,-1.212272,-1.117699,-0.901846
...,...,...,...,...,...,...,...
53938,0.130941,-0.523046,0.242892,0.373393,0.337515,0.285218,-0.294722
53939,-0.101129,0.314584,-1.099673,0.088116,0.118619,0.143509,-0.294722
53940,-0.185518,-0.872059,-1.099673,0.052457,0.004793,-0.069054,-0.294973
53941,-0.185518,-1.360676,2.032979,0.007882,-0.003963,-0.154079,-0.294973


In [10]:
# Flag the rows whose standardized value lies within [-1.96, 1.96]
# (both ends inclusive) in every column.
condition = std_pre_d_df.apply(
    lambda scores: scores.between(-1.96, 1.96)
).all(axis=1)

# Discard every row that falls outside that range in at least one column.
std_pre_d_df = std_pre_d_df.loc[condition]
std_pre_d_df

Unnamed: 0,carat,depth,table,x,y,z,target
0,-1.198189,-0.174033,-1.099673,-1.587882,-1.536239,-1.571166,-0.904102
1,-1.240384,-1.360676,1.585457,-1.641372,-1.658821,-1.741217,-0.904102
3,-1.071605,0.454189,0.242892,-1.365010,-1.317342,-1.287749,-0.902096
4,-1.029411,1.082412,0.242892,-1.240202,-1.212272,-1.117699,-0.901846
5,-1.177092,0.733399,-0.204630,-1.596797,-1.553750,-1.500312,-0.901595
...,...,...,...,...,...,...,...
53937,-0.206616,0.733399,1.137935,-0.063437,-0.047743,0.030142,-0.294722
53938,0.130941,-0.523046,0.242892,0.373393,0.337515,0.285218,-0.294722
53939,-0.101129,0.314584,-1.099673,0.088116,0.118619,0.143509,-0.294722
53940,-0.185518,-0.872059,-1.099673,0.052457,0.004793,-0.069054,-0.294973


In [12]:
# Keep only the rows of pre_d_df that survived the outlier filter, then
# renumber the index from 0.
# NOTE: the index labels of std_pre_d_df are used as row *positions* here,
# which relies on that frame having been built with a default 0..n-1 index.
pre_d_df = pre_d_df.iloc[std_pre_d_df.index].reset_index(drop=True)
pre_d_df

Unnamed: 0,carat,depth,table,x,y,z,target
0,0.23,61.5,55.0,3.95,3.98,2.43,326
1,0.21,59.8,61.0,3.89,3.84,2.31,326
2,0.29,62.4,58.0,4.20,4.23,2.63,334
3,0.31,63.3,58.0,4.34,4.35,2.75,335
4,0.24,62.8,57.0,3.94,3.96,2.48,336
...,...,...,...,...,...,...,...
44934,0.70,62.8,60.0,5.66,5.68,3.56,2757
44935,0.86,61.0,58.0,6.15,6.12,3.74,2757
44936,0.75,62.2,55.0,5.83,5.87,3.64,2757
44937,0.71,60.5,55.0,5.79,5.74,3.49,2756


In [13]:
# Apply the same row selection to the data set holding the columns to be
# label-encoded, so both frames keep the same rows in the same order.
# NOTE: as above, std_pre_d_df's index labels are used as row positions.
label_df = label_df.iloc[std_pre_d_df.index].reset_index(drop=True)
label_df

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Premium,I,VS2
3,Good,J,SI2
4,Very Good,J,VVS2
...,...,...,...
44934,Very Good,D,SI1
44935,Premium,H,SI2
44936,Ideal,D,SI2
44937,Premium,E,SI1


In [14]:
# Put the categorical columns back in front of the remaining columns,
# aligning the two frames side by side on their index.
pre_d_df = pd.concat([label_df, pre_d_df], axis='columns')
pre_d_df

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,target
0,Ideal,E,SI2,0.23,61.5,55.0,3.95,3.98,2.43,326
1,Premium,E,SI1,0.21,59.8,61.0,3.89,3.84,2.31,326
2,Premium,I,VS2,0.29,62.4,58.0,4.20,4.23,2.63,334
3,Good,J,SI2,0.31,63.3,58.0,4.34,4.35,2.75,335
4,Very Good,J,VVS2,0.24,62.8,57.0,3.94,3.96,2.48,336
...,...,...,...,...,...,...,...,...,...,...
44934,Very Good,D,SI1,0.70,62.8,60.0,5.66,5.68,3.56,2757
44935,Premium,H,SI2,0.86,61.0,58.0,6.15,6.12,3.74,2757
44936,Ideal,D,SI2,0.75,62.2,55.0,5.83,5.87,3.64,2757
44937,Premium,E,SI1,0.71,60.5,55.0,5.79,5.74,3.49,2756


In [15]:
from sklearn.preprocessing import LabelEncoder

label_columns = ['cut', 'color', 'clarity']
label_encoders = {}

# Label-encode each column above, replacing it in place with integer codes
# and remembering the class array each encoder learned.
for column in label_columns:
    encoder = LabelEncoder().fit(pre_d_df[column])
    label_encoders[column] = encoder.classes_
    result = encoder.transform(pre_d_df[column])
    pre_d_df[column] = result

label_encoders

{'cut': array(['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], dtype=object),
 'color': array(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype=object),
 'clarity': array(['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'],
       dtype=object)}

In [16]:
pre_d_df

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z,target
0,2,1,3,0.23,61.5,55.0,3.95,3.98,2.43,326
1,3,1,2,0.21,59.8,61.0,3.89,3.84,2.31,326
2,3,5,5,0.29,62.4,58.0,4.20,4.23,2.63,334
3,1,6,3,0.31,63.3,58.0,4.34,4.35,2.75,335
4,4,6,7,0.24,62.8,57.0,3.94,3.96,2.48,336
...,...,...,...,...,...,...,...,...,...,...
44934,4,0,2,0.70,62.8,60.0,5.66,5.68,3.56,2757
44935,3,4,3,0.86,61.0,58.0,6.15,6.12,3.74,2757
44936,2,0,3,0.75,62.2,55.0,5.83,5.87,3.64,2757
44937,3,1,2,0.71,60.5,55.0,5.79,5.74,3.49,2756


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Split the data set, then train a linear regression model on the train part.
# Every column except the last is a feature; the last column is the target.
features = pre_d_df.iloc[:, :-1]
targets = pre_d_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=124,
)

l_r = LinearRegression()
l_r.fit(X_train, y_train)

In [22]:
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    """Print the evaluation metrics (MSE, RMSE and R2) of a regression model.

    Args:
        y_test: True target values.
        prediction: Predicted values to compare against ``y_test``.
    """
    report = 'MSE: {:.4f}, RMSE: {:.4f}, R2: {:.4f}'
    squared_error = mean_squared_error(y_test, prediction)
    # RMSE is derived from the MSE; MSLE / RMSLE are not computed.
    scores = (
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_test, prediction),
    )
    print(report.format(*scores))

In [23]:
# Predict on the test data with the fitted model, then print its metrics.
prediction = l_r.predict(X_test)

get_evaluation(y_test, prediction)

MSE: 798706.2408, RMSE: 893.7037, R2: 0.8893
