# 선형회귀 (Linear Regression)
- 프리미어리그 데이터(득점, 실점, 승점)
- 득점, 실점 데이터를 이용하여 승점을 예측하는 모델을 작성

In [3]:
# 주요 패키지 로드
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error

Unnamed: 0,name,gf,ga,points
0,Manchester City,106,27,100
1,Manchester United,68,28,81
2,Tottenham Hotspur,74,36,77
3,Liverpool,84,38,75
4,Chelsea,62,38,70
5,Arsenal,74,51,63
6,Burnley,36,39,54
7,Everton,44,58,49
8,Leicester City,56,60,47
9,Newcastle United,39,47,44


#### 분석절차
- 데이터 로드
- 데이터 전처리
- 모델적용
- 성능평가 : MAE

In [4]:
# 1. Load the data: read the Premier League table from CSV into a DataFrame.
df = pd.read_csv('./extrafiles/premierleague.csv')
# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,name,gf,ga,points
0,Manchester City,106,27,100
1,Manchester United,68,28,81
2,Tottenham Hotspur,74,36,77
3,Liverpool,84,38,75
4,Chelsea,62,38,70
5,Arsenal,74,51,63
6,Burnley,36,39,54
7,Everton,44,58,49
8,Leicester City,56,60,47
9,Newcastle United,39,47,44


In [5]:
# Check the basic descriptive statistics (count, mean, std, min, quartiles, max)
# of the numeric columns.
df.describe()

Unnamed: 0,gf,ga,points
count,20.0,20.0,20.0
mean,50.9,50.9,52.05
std,20.968397,12.468487,19.170907
min,28.0,27.0,31.0
25%,35.75,38.75,39.25
50%,44.5,55.5,44.0
75%,63.5,58.5,64.75
max,106.0,68.0,100.0


In [7]:
# Separate the independent variables (gf, ga) from the dependent variable (points).
# Author's note: the features are on the same scale, so no separate
# standardization is applied.
X = df.loc[:, ['gf', 'ga']]
y = df.loc[:, ['points']]  # kept as a one-column DataFrame, not a Series
# Bare tuple expression so the notebook displays both frames.
(X, y)

(     gf  ga
 0   106  27
 1    68  28
 2    74  36
 3    84  38
 4    62  38
 5    74  51
 6    36  39
 7    44  58
 8    56  60
 9    39  47
 10   45  55
 11   45  61
 12   48  68
 13   44  64
 14   34  54
 15   28  58
 16   37  56
 17   28  56
 18   35  68
 19   31  56,
     points
 0      100
 1       81
 2       77
 3       75
 4       70
 5       63
 6       54
 7       49
 8       47
 9       44
 10      44
 11      44
 12      42
 13      41
 14      40
 15      37
 16      36
 17      33
 18      33
 19      31)

In [12]:
# Split into train and test sets: 70% of the rows go to training, the rest to testing.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42)

In [14]:
# Apply the model: fit a linear regression on the training split.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# In-sample predictions for the training rows.
pred_train = model.predict(X_train)
print(pred_train)

# Last expression of the cell: R^2 of the fit on the training data.
model.score(X_train, y_train)

[[42.50369139]
 [80.11416679]
 [32.2177955 ]
 [40.44890421]
 [40.13547559]
 [75.07028654]
 [46.93267438]
 [36.72364065]
 [66.45486703]
 [40.2891999 ]
 [43.63015268]
 [45.99836849]
 [39.75116479]
 [49.72961205]]


0.9451791054756594

In [25]:
# Model performance evaluation (MAE / MSE / RMSE).
# Metrics are reported on the training split AND on the held-out test split:
# training-only error overstates how well the model generalizes.
from sklearn.metrics import mean_squared_error

# pred_train is a 2-D column array: [A], [B], [C]
print(pred_train)

# flatten() converts it to a 1-D array: [A, B, C]
print(pred_train.flatten())

# Mean absolute error on the training split
mae = mean_absolute_error(y_train, pred_train)
print("MAE : ", mae)

# MSE and RMSE (Root Mean Squared Error) on the training split
mse = mean_squared_error(y_train, pred_train)
print("MSE : ", mse)
print("RMSE : ", np.sqrt(mse))

# Same metrics on the held-out test split, which the model never saw during fit()
pred_test = model.predict(X_test)
mae_test = mean_absolute_error(y_test, pred_test)
mse_test = mean_squared_error(y_test, pred_test)
print("Test MAE : ", mae_test)
print("Test MSE : ", mse_test)
print("Test RMSE : ", np.sqrt(mse_test))

[[42.50369139]
 [80.11416679]
 [32.2177955 ]
 [40.44890421]
 [40.13547559]
 [75.07028654]
 [46.93267438]
 [36.72364065]
 [66.45486703]
 [40.2891999 ]
 [43.63015268]
 [45.99836849]
 [39.75116479]
 [49.72961205]]
[42.50369139 80.11416679 32.2177955  40.44890421 40.13547559 75.07028654
 46.93267438 36.72364065 66.45486703 40.2891999  43.63015268 45.99836849
 39.75116479 49.72961205]
MAE :  2.8882506449083793
MSE :  11.51798181894054
RMSE :  3.3938152305245697
