In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from math import sqrt

import numpy as np
import pandas as pd

### 데이터 불러오기

In [4]:
admission_df = pd.read_csv('admission_data.csv')
admission_df = admission_df.iloc[:,1:]
# admission_df = pd.read_csv('admission_data.csv').drop('Serial No.', axis=1)
admission_df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


* GRE Score : GRE 점수
* TOEFL Score : TOEFL 점수 
* University Rating : 지원 대학 등급
* SOP : 학업 계획서 점수
* LOR : 추천서 점수 
* CGPA : 학점
* Research
* Chance of Admit : 합격 확률 - (output)

## 다항 회귀 적용

### 입력 변수

In [6]:
X = admission_df.drop(['Chance of Admit '], axis=1)

In [8]:
# 직선 대신, 6차항의 다항 회귀 모델 데이터 준비
polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(X.values) 
# 변수 이름
features = polynomial_transformer.get_feature_names(X.columns)

In [10]:
X = pd.DataFrame(polynomial_features, columns = features)
X.head()

Unnamed: 0,1,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,GRE Score^2,GRE Score TOEFL Score,...,LOR CGPA^2 Research^3,LOR CGPA Research^4,LOR Research^5,CGPA^6,CGPA^5 Research,CGPA^4 Research^2,CGPA^3 Research^3,CGPA^2 Research^4,CGPA Research^5,Research^6
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,113569.0,39766.0,...,419.05125,43.425,4.5,807539.696082,83682.87006,8671.800006,898.632125,93.1225,9.65,1.0
1,1.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,104976.0,34668.0,...,354.04605,39.915,4.5,487014.306256,54905.784245,6190.054594,697.864103,78.6769,8.87,1.0
2,1.0,316.0,104.0,3.0,3.0,3.5,8.0,1.0,99856.0,32864.0,...,224.0,28.0,3.5,262144.0,32768.0,4096.0,512.0,64.0,8.0,1.0
3,1.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,103684.0,35420.0,...,187.92225,21.675,2.5,424731.61094,48988.651781,5650.363527,651.714363,75.1689,8.67,1.0
4,1.0,314.0,103.0,2.0,2.0,3.0,8.21,0.0,98596.0,32342.0,...,0.0,0.0,0.0,306237.903347,0.0,0.0,0.0,0.0,0.0,0.0


### 목표 변수

In [11]:
y = admission_df[['Chance of Admit ']]
y.head()

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.8
4,0.65


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.3, random_state=5)

In [15]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [16]:
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

In [18]:
mse = mean_squared_error(y_train, y_train_predict)
print(f'training set 성능 : {sqrt(mse)}')
mse = mean_squared_error(y_test, y_test_predict)
print(f'test set 성능 : {sqrt(mse)}')

training set 성능 : 0.001504822162544905
test set 성능 : 5.090715894673793


#### training set에 `과적합` 되었음을 알 수 있음

---

## Regularization
#### 정규화 : 가설 함수의 theta 값들이 너무 커지는 것을 방지하여 과적합을 예방하는 방법

* training data가 최대한 많이 통과할 수 있도록 하기 위해 많은 굴곡을 이용하게 된다.
* 함수가 급격하게 변화한다. → 이유 : theta 값들이 크기 때문
* 정규화는 theta값들이 너무 커지는 것을 방지하는 방법
* training data에 대한 오차는 커질 수 있어도, 함수의 변동을 완만하게 만들 수 있다. 
* 이는 여러 데이터 셋에 대해 일관된 성능을 보이기 때문에, 과적합을 막을 수 있다. 

### L1 정규화 Lasso

In [21]:
### 모델 변경
model = Lasso(alpha =0.001,max_iter=1000, normalize =  True )
model.fit(X_train, y_train)

Lasso(alpha=0.001, normalize=True)

In [24]:
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

normalize을 학습시키기 전에 자동으로 인풋데이터들을 0과 1사이로 normalize

모델 성능 평가

In [25]:
mse = mean_squared_error(y_train, y_train_predict)
print(f'training set 성능 : {sqrt(mse)}')
mse = mean_squared_error(y_test, y_test_predict)
print(f'test set 성능 : {sqrt(mse)}')

training set 성능 : 0.06336620966147144
test set 성능 : 0.06007719092689258


---

### L2 정규화 Ridge

In [28]:
X = admission_df.drop(['Chance of Admit '], axis=1)
# 직선 대신, 6차항의 다항 회귀 모델 데이터 준비
polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(X.values) 
# 변수 이름
features = polynomial_transformer.get_feature_names(X.columns)
X = pd.DataFrame(polynomial_features, columns = features)
y = admission_df[['Chance of Admit ']]

In [29]:
model = Ridge(alpha =0.001,max_iter=1000, normalize =  True )
model.fit(X_train, y_train)

Ridge(alpha=0.001, max_iter=1000, normalize=True)

In [30]:
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

In [31]:
mse = mean_squared_error(y_train, y_train_predict)
print(f'training set 성능 : {sqrt(mse)}')
mse = mean_squared_error(y_test, y_test_predict)
print(f'test set 성능 : {sqrt(mse)}')

training set 성능 : 0.05327825805894827
test set 성능 : 0.06669588064639442
