- Linear regression implementation (gradient descent)
- Multiple linear regression: Linear Regression with multiple variables (features, Xs)
- Feature engineering
import numpy as np
import matplotlib.pyplot as plt

# Gradient descent
# input  (1,2,3,4,5,7) -> x
# output (1,2,3,4,6,8) -> y
x = [1,2,3,4,5,7]
y = [1,2,3,4,6,8]
learning_rate = 0.001

# plotting function
def display_graph(x, y):
    plt.plot(x, y, 'o')
    plt.xlabel('x - axis')  # naming the x axis
    plt.ylabel('y - axis')  # naming the y axis

display_graph(x, y)

def line_graph(g, b):
    x = np.linspace(0, 10, 10)
    y = g * x + b
    plt.plot(x, y, alpha=0.1)

# batch gradient descent: accumulate the gradient over all samples, then update
def gradient_descent(x, y):
    w = 0
    b = 0
    # model: y ≈ f(x) = w*x + b
    for epoch in range(3000):
        gradient_w_sum = 0
        gradient_b_sum = 0
        for i in range(len(x)):
            y_hat = w * x[i] + b
            # the gradient is the derivative of the objective function
            gradient_w = (y_hat - y[i]) * x[i]
            gradient_w_sum = gradient_w_sum + gradient_w
            gradient_b = (y_hat - y[i])
            gradient_b_sum = gradient_b_sum + gradient_b
        # update with the averaged gradient: w = w - learning_rate * gradient
        w = w - learning_rate * gradient_w_sum / len(x)
        b = b - learning_rate * gradient_b_sum / len(x)
        print(f"gradient: {w} y-intercept: {b}")
        line_graph(w, b)
    # optimized w, b
    return w, b

w, b = gradient_descent(x, y)
print(f"weight: {w}, bias: {b}")
# parameters are updated once per epoch
# The same batch gradient descent, vectorized with NumPy and a bias column
import numpy as np

def gradient_descent(x, y, learning_rate, num_epoch):
    num_samples = x.shape[0]
    num_features = x.shape[1]
    theta = np.zeros(num_features + 1)  # initializing weights with bias term
    # adding a column of 1s for the bias term in the input data
    x = np.concatenate((np.ones((num_samples, 1)), x), axis=1)
    for epoch in range(num_epoch):
        error = np.dot(x, theta) - y
        print(f"epoch {epoch} error: {error}")
        gradient = 1 / num_samples * np.dot(x.T, error)
        theta -= learning_rate * gradient
        line_graph(theta[1], theta[0])  # theta[0] is the bias, theta[1] the slope
    return theta
# Data preparation
x1 = [1,2,3,4,5,7]
y1 = [1,2,3,4,6,8]
x = np.array(x1).reshape((-1, 1))  # column vector: one feature per sample
y = np.array(y1)
plt.plot(x, y, 'o', color='red')

# Hyperparameter settings
learning_rate = 0.01
num_epoch = 1000

# Gradient descent execution
theta = gradient_descent(x, y, learning_rate, num_epoch)
print("Optimal weights (theta):", theta)
# Stochastic gradient descent (SGD)
# input  (1,2,3,4,5,7) -> x
# output (1,2,3,4,6,8) -> y
x = [1,2,3,4,5,7]
y = [1,2,3,4,6,8]
learning_rate = 0.001

# plotting function
import matplotlib.pyplot as plt

def display_graph(x, y):
    plt.plot(x, y, 'o', color='red')
    plt.xlabel('x - axis')  # naming the x axis
    plt.ylabel('y - axis')  # naming the y axis

display_graph(x, y)

def line_graph(g, b):
    x = np.linspace(0, 10, 10)
    y = g * x + b
    plt.plot(x, y, alpha=0.1)

# stochastic gradient descent: unlike the batch version there are no gradient
# sums; the parameters are updated immediately after every single sample
def gradient_descent(x, y):
    w = 0
    b = 0
    # model: y ≈ f(x) = w*x + b
    for epoch in range(1000):
        for i in range(len(x)):
            y_hat = w * x[i] + b
            # the gradient is the derivative of the objective function
            gradient_w = (y_hat - y[i]) * x[i]
            gradient_b = (y_hat - y[i])
            # per-sample update: w = w - learning_rate * gradient
            w = w - learning_rate * gradient_w
            b = b - learning_rate * gradient_b
        print(f"gradient: {w} y-intercept: {b}")
        line_graph(w, b)
    # optimized w, b
    return w, b

w, b = gradient_descent(x, y)
print(f"weight: {w}, bias: {b}")
# updates happen (number of epochs) * (number of samples) times
The LinearRegression class is provided by scikit-learn's linear_model module; the fit method trains the model, and the predict method makes predictions on new data.
import numpy as np
from sklearn.linear_model import LinearRegression

# define the independent variable x and the dependent variable y
x = np.array([1,2,3,4,5,7]).reshape((-1, 1))
y = np.array([1,2,3,4,6,8])

# create the linear regression model
model = LinearRegression()

# train the model
model.fit(x, y)

# print the regression equation
print("Regression equation: y = {:.2f}x + {:.2f}".format(model.coef_[0], model.intercept_))

# prediction
x_new = np.array([6]).reshape((-1, 1))
y_new = model.predict(x_new)
print("Predicted y when x is 6: {:.2f}".format(y_new[0]))
Regression equation: y = 1.20x + -0.40
Predicted y when x is 6: 6.80
import numpy as np

arr = np.array([1, 2, 3, 4, 5])
reshaped_arr = arr.reshape(-1, 1)
print(arr)
# output: [1 2 3 4 5]
print(reshaped_arr)
# output:
# [[1]
#  [2]
#  [3]
#  [4]
#  [5]]
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def display_graph(x, y):
    plt.plot(x, y)
    plt.xlabel('x - axis')  # naming the x axis
    plt.ylabel('y - axis')  # naming the y axis

def gradient_descent(x, y, learning_rate, num_epoch):
    num_samples = x.shape[0]
    num_features = x.shape[1]
    theta = np.zeros(num_features + 1)  # initializing weights with bias term
    # adding a column of 1s for the bias term in the input data
    x = np.concatenate((np.ones((num_samples, 1)), x), axis=1)
    epochs = []
    cost = []
    for epoch in range(num_epoch):
        error = np.dot(x, theta) - y
        gradient = 1 / num_samples * np.dot(x.T, error)
        theta -= learning_rate * gradient
        epochs.append(epoch)
        cost.append(np.sum(error**2))
        print(epoch, np.sum(error**2), error)
    # learning curve: cost per epoch
    display_graph(epochs, cost)
    plt.xlabel('# of epochs')  # override the default axis names
    plt.ylabel('cost')
    return theta, cost, epochs

# Data preparation
x = np.array([[3, 4], [4, 6], [5, 7], [6, 9]])  # input variable (2 features)
y = np.array([6, 8, 10, 12])  # output variable

# Hyperparameter settings
learning_rate = 0.001
num_epoch = 50

# Gradient descent execution
theta, cost, epochs = gradient_descent(x, y, learning_rate, num_epoch)
print("Optimal weights (theta0, theta1, theta2):", theta)

# Creating a 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# 111 encodes the subplot position as a 3-digit integer; 111 is equivalent to (1, 1, 1)

# Plotting the data points
ax.scatter(x[:, 0], x[:, 1], y, color='blue', label='Data Points')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Output Variable')

# Generating the regression plane
x1_range = np.linspace(np.min(x[:, 0]), np.max(x[:, 0]), 10)
x2_range = np.linspace(np.min(x[:, 1]), np.max(x[:, 1]), 10)
x1_values, x2_values = np.meshgrid(x1_range, x2_range)
y_values = theta[0] + theta[1] * x1_values + theta[2] * x2_values
ax.plot_surface(x1_values, x2_values, y_values, alpha=0.5, color='red')
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')
#drive.mount('/content/drive', force_remount=True)
Mounted at /content/drive
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/advertising.csv')
df
| | TV | Radio | Newspaper | Sales |
|---|---|---|---|---|
| 0 | 230.1 | 37.8 | 69.2 | 22.1 |
| 1 | 44.5 | 39.3 | 45.1 | 10.4 |
| 2 | 17.2 | 45.9 | 69.3 | 12.0 |
| 3 | 151.5 | 41.3 | 58.5 | 16.5 |
| 4 | 180.8 | 10.8 | 58.4 | 17.9 |
| ... | ... | ... | ... | ... |
| 195 | 38.2 | 3.7 | 13.8 | 7.6 |
| 196 | 94.2 | 4.9 | 8.1 | 14.0 |
| 197 | 177.0 | 9.3 | 6.4 | 14.8 |
| 198 | 283.6 | 42.0 | 66.2 | 25.5 |
| 199 | 232.1 | 8.6 | 8.7 | 18.4 |
print(df.isna().sum()) # null data check
TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64
df.hist(bins=50)
x = df[['TV','Radio','Newspaper']]
y = df['Sales']
import seaborn as sns  # used for more advanced statistical graphs and charts
sns.heatmap(df.corr(),annot=True)
plt.show()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
reg = LinearRegression()
reg.fit(x_train,y_train)
reg.score(x_test,y_test)*100
86.45053923967724
print("Cofficient",reg.coef_)
print("Intercept",reg.intercept_)
Cofficient [ 0.05368006 0.11152624 -0.00351166] Intercept 4.773205203269841
reg.score(x_train,y_train)*100
91.32130769530316
In multiple regression, the magnitude of a regression coefficient does not by itself indicate the importance of a variable. For coefficients to serve as importance measures, the data should be standardized (Z normalization) and the standardized regression coefficients (also called beta coefficients) compared against each other; whether the coefficients are statistically significant is a separate check. A minimal sketch follows.
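A minimal sketch, assuming the x_train and y_train from the train_test_split above: z-normalize both the features and the target, then compare the fitted coefficients.

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# z-normalize both sides so coefficient magnitudes are directly comparable
x_train_z = StandardScaler().fit_transform(x_train)     # standardize features
y_train_z = (y_train - y_train.mean()) / y_train.std()  # standardize target

beta_model = LinearRegression().fit(x_train_z, y_train_z)
print("beta coefficients:", beta_model.coef_)           # standardized (beta) coefficients

Unlike the StandardScaler example near the end of this section, this sketch standardizes the target as well, so the resulting coefficients are true beta coefficients.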
x_train[:5]
| | TV | Radio | Newspaper |
|---|---|---|---|
| 134 | 36.9 | 38.6 | 65.6 |
| 66 | 31.5 | 24.6 | 2.2 |
| 26 | 142.9 | 29.3 | 12.6 |
| 113 | 209.6 | 20.6 | 10.7 |
| 168 | 215.4 | 23.6 | 57.6 |
print(x_train['Radio'][:5])
print(x_train.iloc[:5, 1])
134    38.6
66     24.6
26     29.3
113    20.6
168    23.6
Name: Radio, dtype: float64
(the second print produces the identical Series)
print(x_train.loc[1][:5])
TV           44.5
Radio        39.3
Newspaper    45.1
Name: 1, dtype: float64
reg_radio = LinearRegression()
x_train_radio = np.array(x_train['Radio']).reshape(-1, 1)
x_test_radio = np.array(x_test['Radio']).reshape(-1, 1)
#reg_radio.fit(x_train['Radio'], y_train)  # fails: sklearn expects a 2-D feature array
reg_radio.fit(x_train_radio, y_train)
print(reg_radio.score(x_test_radio, y_test)*100)
print("Coefficient", reg_radio.coef_)
print("Intercept", reg_radio.intercept_)
plt.scatter(x_train_radio, y_train)
# fitted line, hard-coded from the values of reg_radio.coef_[0] and reg_radio.intercept_
predicted_val = 0.12350772 * x_train_radio + 12.46701094292984
plt.plot(x_train_radio, predicted_val)
x.shape
(200, 3)
x_train.shape
(160, 3)
x_train['Radio'].shape
(160,)
x_train_radio = np.array(x_train['Radio']).reshape(160, 1)  # hard-coded shape; reshape(-1, 1) is equivalent
reg_TV = LinearRegression()
x_train_TV = np.array(x_train['TV']).reshape(-1, 1)
x_test_TV = np.array(x_test['TV']).reshape(-1, 1)
reg_TV.fit(x_train_TV, y_train)
print(reg_TV.score(x_test_TV, y_test)*100)
print("Coefficient", reg_TV.coef_)
print("Intercept", reg_TV.intercept_)
plt.scatter(x_train_TV, y_train)
# fitted line, hard-coded from the values of reg_TV.coef_[0] and reg_TV.intercept_
predicted_val_TV = 0.0544343 * x_train_TV + 7.162275968528906
plt.plot(x_train_TV, predicted_val_TV)
reg_Newspaper = LinearRegression()
x_train_Newspaper = np.array(x_train['Newspaper']).reshape(-1, 1)
x_test_Newspaper = np.array(x_test['Newspaper']).reshape(-1, 1)
reg_Newspaper.fit(x_train_Newspaper, y_train)
print(reg_Newspaper.score(x_test_Newspaper, y_test)*100)
print("Coefficient", reg_Newspaper.coef_)
print("Intercept", reg_Newspaper.intercept_)
-4.777401704458972
Coefficient [0.04469941]
Intercept 14.009027446152578
from sklearn.preprocessing import StandardScaler

reg_scaled = LinearRegression()
sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)
reg_scaled.fit(x_train_std, y_train)
print(reg_scaled.score(x_test_std, y_test)*100)
print("Coefficient", reg_scaled.coef_)
print("Intercept", reg_scaled.intercept_)
86.45053923967725
Coefficient [ 4.54624242  1.6383335  -0.07507122]
Intercept 15.355625
- A core technique for increasing the value of data
- A series of steps that transform and select a dataset's variables and create new ones
- The process of transforming and refining data to improve the performance of machine learning algorithms
- Choosing appropriate feature selection, processing, and scaling techniques makes it possible to build models with high predictive performance
- Data collection: select and collect the data to analyze
- Feature selection: choose the necessary features and drop the unnecessary ones
- Feature creation: generate new features, e.g. multiply height and width to make a new "area" feature (see the sketch after this list); this covers feature extraction and feature transformation
- Feature scaling: adjust the ranges of feature values so they can be analyzed on the same scale
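A small illustration of feature creation and scaling; this is a sketch on made-up height/width values, not the advertising data:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# hypothetical data for illustration only
df_fe = pd.DataFrame({'height': [2.0, 3.0, 4.0], 'width': [5.0, 4.0, 2.0]})

# feature creation: area = height * width
df_fe['area'] = df_fe['height'] * df_fe['width']

# feature scaling: bring every feature into the [0, 1] range
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_fe), columns=df_fe.columns)
print(df_scaled)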
- First select a pool of potential features, then extract the significant ones from among them
- Build a model with all features, then remove insignificant features one at a time to select the optimal set (backward elimination)
- Use a model to compute feature importances and select features in order of importance (a sketch of the last two strategies follows)
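These strategies map loosely onto scikit-learn utilities. A minimal sketch of the last two, assuming the advertising x and y defined earlier; RFE approximates backward elimination and SelectFromModel does importance-based selection:

from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.linear_model import LinearRegression

# backward elimination: repeatedly drop the weakest feature until 2 remain
rfe = RFE(LinearRegression(), n_features_to_select=2).fit(x, y)
print("RFE kept:", x.columns[rfe.support_].tolist())

# importance-based: keep features whose |coefficient| exceeds the mean importance
sfm = SelectFromModel(LinearRegression()).fit(x, y)
print("SelectFromModel kept:", x.columns[sfm.get_support()].tolist())

With unscaled features the coefficients are scale-dependent (as noted above), so standardizing the features first would make this selection more meaningful.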
- Log transformation, square root: useful for a skewed distribution
- Polynomial expansion (power transform): given [a, b] with a maximum degree of 2, produces 1, a, b, a², ab, b²
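A minimal sketch of both transforms on made-up values:

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# log transformation: compresses the long right tail of a skewed distribution
skewed = np.array([1.0, 2.0, 4.0, 100.0])
print(np.log1p(skewed))  # log(1 + x) also behaves at x = 0

# polynomial expansion: [a, b] with degree 2 -> 1, a, b, a^2, ab, b^2
poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(np.array([[2.0, 3.0]])))  # [[1. 2. 3. 4. 6. 9.]]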
My math background is a little weak, so I plan to study it before the next special lecture. Over the rest of the vacation I will review using the slides and other materials.
The slides contain some long chapters. It would be nice to split them into parts, with a regular pattern of short breaks in between.
Learning the direction for studying AI and the knowledge it requires was helpful. I plan to write an analysis report on the math formulas we covered and submit it for my math class record (세특).
Thank you for the special lecture. I'm looking forward to the next one.