In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

train_data_path = '/home/gram/25-2-ML-final-report/data/train_processed.csv'
test_data_path = '/home/gram/25-2-ML-final-report/data/test_processed.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Education Level,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,...,Occupation_Self-Employed,Occupation_Unemployed,Occupation_Unknown,Location_Suburban,Location_Urban,Policy Type_Comprehensive,Policy Type_Premium,Smoking Status_Yes,Property Type_Condo,Property Type_House
0,19.0,10049.0,1.0,1,22.598761,2.0,17.0,372.0,5.0,0,...,True,False,False,False,True,False,True,False,False,True
1,39.0,31678.0,3.0,2,15.569731,1.0,12.0,694.0,2.0,1,...,False,False,True,False,False,True,False,True,False,True
2,23.0,25602.0,3.0,0,47.177549,1.0,14.0,592.92435,3.0,2,...,True,False,False,True,False,False,True,True,False,True
3,21.0,141855.0,2.0,1,10.938144,1.0,0.0,367.0,1.0,0,...,False,False,True,False,False,False,False,True,False,False
4,21.0,39651.0,1.0,1,20.376094,0.0,8.0,598.0,4.0,0,...,True,False,False,False,False,False,True,True,False,True


In [2]:
# 훈련데이터에서 검증데이터셋 분리
from sklearn.model_selection import train_test_split

X = train_df.drop('Premium Amount', axis=1)
y = train_df['Premium Amount']

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((960000, 26), (240000, 26), (960000,), (240000,))

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# X_train을 기준으로 학습하고 변환
X_train_scaled = scaler.fit_transform(X_train)
# X_val은 변환만 (절대 fit 금지!)
X_val_scaled = scaler.transform(X_val)

# 모델 재학습
model = LinearRegression()
model.fit(X_train_scaled, y_train)
# 예측 및 평가
y_pred = model.predict(X_val_scaled)

In [4]:
# y와 각 X 변수들 간의 상관계수 출력
train_df.corr()['Premium Amount'].sort_values(ascending=False)

Premium Amount               1.000000
Previous Claims              0.043885
Health Score                 0.014326
Policy Month                 0.006702
Marital Status_Single        0.003682
Occupation_Self-Employed     0.002608
Occupation_Unemployed        0.001787
Location_Urban               0.000952
Vehicle Age                  0.000391
Exercise Frequency           0.000378
Smoking Status_Yes           0.000163
Gender_Male                  0.000161
Policy Type_Comprehensive    0.000054
Insurance Duration          -0.000028
Location_Suburban           -0.000068
Property Type_Condo         -0.000508
Policy Type_Premium         -0.000792
Property Type_House         -0.000804
Number of Dependents        -0.000928
Customer Feedback           -0.001146
Education Level             -0.001654
Marital Status_Married      -0.002184
Age                         -0.002411
Occupation_Unknown          -0.006954
Policy Year                 -0.011084
Annual Income               -0.012091
Credit Score

In [5]:
# 모델 훈련
model = LinearRegression()
model.fit(X_train, y_train)

print('가중치:', model.coef_)
print('편향:', model.intercept_)

가중치: [-1.46259518e-01 -5.21458244e-04 -4.61574143e-01 -1.97423837e+00
  1.19062906e+00  4.19343239e+01 -4.16257448e-02 -1.80753584e-01
 -1.54519866e-01 -1.07908502e+00  4.04914005e-01 -6.10960342e+00
  1.08179885e+00  2.27341671e-01  1.35545886e+00  7.80379278e+00
 -6.10976159e-01 -1.94969546e+00 -1.32809315e+01  3.96368779e-01
  8.36728566e-01  1.73016009e+00  6.89770143e-01 -1.70925690e-01
 -1.40678136e+00 -2.94242764e+00]
편향: 13524.43778152622


In [6]:
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val,y_pred)
# rmse = root_mean_squared_error(y_val, y_pred)

print(f"MAE: {mae}")
# print(f"RMSE: {rmse}")

MAE: 667.0334591564268


In [7]:
from sklearn.metrics import r2_score

# 사용법: r2_score(실제 정답, 모델 예측값)
r2 = r2_score(y_val, y_pred)
print(f"R2 Score: {r2}")

R2 Score: 0.003285426250815293


In [8]:
print("Train Mean:", y_train.mean())
print("Test Mean:", y_val.mean())
print("Train Std:", y_train.std())
print("Test Std:", y_val.std())

Train Mean: 1102.5055291666667
Test Mean: 1102.7019916666666
Train Std: 865.1343121436161
Test Std: 864.4586166076839


In [9]:
# 스케일링 + 선형회귀 + 평가를 한 번에 실행
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model_scaled = LinearRegression()
model_scaled.fit(X_train_scaled, y_train)

y_pred_scaled = model_scaled.predict(X_val_scaled)
print("Scaled Linear R2:", r2_score(y_val, y_pred_scaled))

Scaled Linear R2: 0.003285426250815293
