
What I Learned Today

  • Implementing linear regression in code (gradient descent)
  • Multiple linear regression: Linear Regression with multiple variables (features, Xs)
  • Feature engineering

Implementing Linear Regression in Code (Gradient Descent)
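
For reference, the per-epoch update that the code below performs is the standard gradient-descent step for the mean-squared-error objective (using the ½·MSE convention, so no factor of 2), averaged over the $n$ samples, with learning rate $\eta$ and prediction $\hat{y}_i = w x_i + b$:

$$w \leftarrow w - \eta \cdot \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)\,x_i, \qquad b \leftarrow b - \eta \cdot \frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i - y_i)$$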

```python
import numpy as np
import matplotlib.pyplot as plt

# Gradient descent
# input:  x = (1, 2, 3, 4, 5, 7)
# output: y = (1, 2, 3, 4, 6, 8)

x = [1, 2, 3, 4, 5, 7]
y = [1, 2, 3, 4, 6, 8]
learning_rate = 0.001

# plotting functions
def display_graph(x, y):
    plt.plot(x, y, 'o')
    plt.xlabel('x - axis')  # naming the x axis
    plt.ylabel('y - axis')  # naming the y axis

display_graph(x, y)

def line_graph(g, b):
    x = np.linspace(0, 10, 10)
    y = g * x + b
    plt.plot(x, y, alpha=0.1)

# gradient descent function
def gradient_descent(x, y):
    w = 0
    b = 0

    # model: y ≈ f(x) = w * x + b
    for epoch in range(3000):
        gradient_w_sum = 0
        gradient_b_sum = 0

        for i in range(len(x)):
            y_hat = w * x[i] + b

            # the gradients are the partial derivatives of the objective function
            gradient_w = (y_hat - y[i]) * x[i]
            gradient_w_sum = gradient_w_sum + gradient_w

            gradient_b = (y_hat - y[i])
            gradient_b_sum = gradient_b_sum + gradient_b

        # update the parameters with the averaged gradients
        w = w - learning_rate * gradient_w_sum / len(x)
        b = b - learning_rate * gradient_b_sum / len(x)
        print(f"gradient: {w} y-intercept: {b}")
        line_graph(w, b)

    # optimized w, b
    return w, b

w, b = gradient_descent(x, y)

print(f"weight: {w}, bias: {b}")
```

  
  

The parameters are updated once per epoch (batch gradient descent).

```python
# Vectorized batch gradient descent with an explicit bias column
import numpy as np

def gradient_descent(x, y, learning_rate, num_epoch):
    num_samples = x.shape[0]
    num_features = x.shape[1]
    theta = np.zeros(num_features + 1)  # initializing weights with a bias term

    # adding a column of 1s for the bias term in the input data
    x = np.concatenate((np.ones((num_samples, 1)), x), axis=1)

    for epoch in range(num_epoch):
        error = np.dot(x, theta) - y
        print(f"epoch {epoch} error: {error}")

        gradient = 1 / num_samples * np.dot(x.T, error)
        theta -= learning_rate * gradient

        # uses line_graph() defined in the cell above
        line_graph(theta[1], theta[0])

    return theta

# Data preparation
x1 = [1, 2, 3, 4, 5, 7]
y1 = [1, 2, 3, 4, 6, 8]

x2 = np.array(x1)
x3 = x2.reshape((-1, 1))

#x = np.array([[3], [4], [5], [6]])  # input variable (only 1-dimensional for simplicity)
#y = np.array([6, 8, 10, 12])        # output variable

x = x3
y = y1

plt.plot(x, y, 'o', color='red')

# Hyperparameter settings
learning_rate = 0.01
num_epoch = 1000

# Gradient descent execution
theta = gradient_descent(x, y, learning_rate, num_epoch)

print("Optimal weights (theta):", theta)
```

```python
# Stochastic gradient descent
# input:  x = (1, 2, 3, 4, 5, 7)
# output: y = (1, 2, 3, 4, 6, 8)

import numpy as np
import matplotlib.pyplot as plt

x = [1, 2, 3, 4, 5, 7]
y = [1, 2, 3, 4, 6, 8]
learning_rate = 0.001

# plotting functions
def display_graph(x, y):
    plt.plot(x, y, 'o', color='red')
    plt.xlabel('x - axis')  # naming the x axis
    plt.ylabel('y - axis')  # naming the y axis

display_graph(x, y)

def line_graph(g, b):
    x = np.linspace(0, 10, 10)
    y = g * x + b
    plt.plot(x, y, alpha=0.1)

# stochastic gradient descent function
def gradient_descent(x, y):
    w = 0
    b = 0

    # model: y ≈ f(x) = w * x + b
    for epoch in range(1000):
        for i in range(len(x)):
            y_hat = w * x[i] + b

            # the gradients are the partial derivatives of the objective function
            gradient_w = (y_hat - y[i]) * x[i]
            gradient_b = (y_hat - y[i])

            # update the parameters after every single sample
            w = w - learning_rate * gradient_w
            b = b - learning_rate * gradient_b

        print(f"gradient: {w} y-intercept: {b}")
        line_graph(w, b)

    # optimized w, b
    return w, b

w, b = gradient_descent(x, y)

print(f"weight: {w}, bias: {b}")
```

  
  

The parameters are updated (number of epochs × number of samples) times.

The LinearRegression class is provided by scikit-learn's linear_model module: the fit method trains the model, and the predict method makes predictions on new data.

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# define the independent variable x and the dependent variable y
x = np.array([1, 2, 3, 4, 5, 7]).reshape((-1, 1))
y = np.array([1, 2, 3, 4, 6, 8])

# create the linear regression model
model = LinearRegression()

# train the model
model.fit(x, y)

# print the regression equation
print("Regression equation: y = {:.2f}x + {:.2f}".format(model.coef_[0], model.intercept_))

# prediction
x_new = np.array([6]).reshape((-1, 1))
y_new = model.predict(x_new)
print("Predicted y when x is 6: {:.2f}".format(y_new[0]))
```

Output: regression equation y = 1.20x + -0.40, and the predicted y when x is 6 is 6.80.

```python
import numpy as np

arr = np.array([1, 2, 3, 4, 5])
reshaped_arr = arr.reshape(-1, 1)

print(arr)
# output: [1 2 3 4 5]

print(reshaped_arr)
# output:
# [[1]
#  [2]
#  [3]
#  [4]
#  [5]]
```
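
Multiple Linear Regression (Linear Regression with multiple variables)

The next cell fits two input features plus a bias term using the same averaged-gradient update as above. For reference, the model it fits is the standard multi-variable linear form, which matches the bias-column trick in the code:

$$\hat{y} = \theta_0 + \theta_1 x_1 + \theta_2 x_2$$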
```python
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def display_graph(x, y):
    plt.plot(x, y)
    plt.xlabel('x - axis')  # naming the x axis
    plt.ylabel('y - axis')  # naming the y axis

def gradient_descent(x, y, learning_rate, num_epoch):
    num_samples = x.shape[0]
    num_features = x.shape[1]
    theta = np.zeros(num_features + 1)  # initializing weights with a bias term

    # adding a column of 1s for the bias term in the input data
    x = np.concatenate((np.ones((num_samples, 1)), x), axis=1)

    epochs = []
    cost = []
    for epoch in range(num_epoch):
        error = np.dot(x, theta) - y
        gradient = 1 / num_samples * np.dot(x.T, error)
        theta -= learning_rate * gradient
        epochs.append(epoch)
        cost.append(np.sum(error**2))

        print(epoch, np.sum(error**2), error)

    # plot the cost over the epochs
    display_graph(epochs, cost)
    plt.xlabel('# of epochs')  # naming the x axis
    plt.ylabel('cost')         # naming the y axis

    return theta, cost, epochs

# Data preparation
#x = np.array([[1, 3, 4], [1, 4, 6], [1, 5, 7], [1, 6, 9]])  # input variable (3-dimensional values)
x = np.array([[3, 4], [4, 6], [5, 7], [6, 9]])  # input variable (2-dimensional values)
y = np.array([6, 8, 10, 12])  # output variable

# Hyperparameter settings
learning_rate = 0.001
num_epoch = 50

# Gradient descent execution
theta, cost, epochs = gradient_descent(x, y, learning_rate, num_epoch)
print("Optimal weights (theta0, theta1, theta2):", theta)

# Creating a 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# 111 is a three-digit integer giving the subplot position; 111 is the same as (1, 1, 1)

# Plotting the data points
ax.scatter(x[:, 0], x[:, 1], y, color='blue', label='Data Points')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Output Variable')

# Generating the regression plane
x1_range = np.linspace(np.min(x[:, 0]), np.max(x[:, 0]), 10)
x2_range = np.linspace(np.min(x[:, 1]), np.max(x[:, 1]), 10)
x1_values, x2_values = np.meshgrid(x1_range, x2_range)
y_values = theta[0] + theta[1] * x1_values + theta[2] * x2_values
ax.plot_surface(x1_values, x2_values, y_values, alpha=0.5, color='red', label='Regression Plane')

#ax.legend()
plt.show()
```

Data

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount('/content/drive')
#drive.mount('/content/drive', force_remount=True)
```

Mounted at /content/drive

```python
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/advertising.csv')
df
```

|     | TV    | Radio | Newspaper | Sales |
|-----|-------|-------|-----------|-------|
| 0   | 230.1 | 37.8  | 69.2      | 22.1  |
| 1   | 44.5  | 39.3  | 45.1      | 10.4  |
| 2   | 17.2  | 45.9  | 69.3      | 12.0  |
| 3   | 151.5 | 41.3  | 58.5      | 16.5  |
| 4   | 180.8 | 10.8  | 58.4      | 17.9  |
| ... | ...   | ...   | ...       | ...   |
| 195 | 38.2  | 3.7   | 13.8      | 7.6   |
| 196 | 94.2  | 4.9   | 8.1       | 14.0  |
| 197 | 177.0 | 9.3   | 6.4       | 14.8  |
| 198 | 283.6 | 42.0  | 66.2      | 25.5  |
| 199 | 232.1 | 8.6   | 8.7       | 18.4  |

```python
print(df.isna().sum())  # null data check
```

```
TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64
```

```python
df.hist(bins=50)

x = df[['TV', 'Radio', 'Newspaper']]
y = df['Sales']

import seaborn as sns  # used for more advanced types of graphs and charts
sns.heatmap(df.corr(), annot=True)
plt.show()
```

```python
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
reg = LinearRegression()
reg.fit(x_train, y_train)
reg.score(x_test, y_test) * 100
```

86.45053923967724

```python
print("Coefficient", reg.coef_)
print("Intercept", reg.intercept_)
```

Coefficient [ 0.05368006  0.11152624 -0.00351166]
Intercept 4.773205203269841

```python
reg.score(x_train, y_train) * 100
```

91.32130769530316

In multiple regression, the magnitude of a regression coefficient does not by itself indicate the importance of a variable. For the coefficients to be interpreted as importance, the data should first be standardized (Z-normalization) so that the standardized regression coefficients (also called beta coefficients) can be compared against each other; statistical significance testing may also be needed. (See the StandardScaler cell and the sketch after it below.)

```python
x_train[:5]
```

|     | TV    | Radio | Newspaper |
|-----|-------|-------|-----------|
| 134 | 36.9  | 38.6  | 65.6      |
| 66  | 31.5  | 24.6  | 2.2       |
| 26  | 142.9 | 29.3  | 12.6      |
| 113 | 209.6 | 20.6  | 10.7      |
| 168 | 215.4 | 23.6  | 57.6      |

```python
print(x_train['Radio'][:5])
print(x_train.iloc[:5, 1])
```

```
134    38.6
66     24.6
26     29.3
113    20.6
168    23.6
Name: Radio, dtype: float64
134    38.6
66     24.6
26     29.3
113    20.6
168    23.6
Name: Radio, dtype: float64
```

```python
print(x_train.loc[1][:5])
```

```
TV           44.5
Radio        39.3
Newspaper    45.1
Name: 1, dtype: float64
```

```python
reg_radio = LinearRegression()

x_train_radio = np.array(x_train['Radio']).reshape(-1, 1)
x_test_radio = np.array(x_test['Radio']).reshape(-1, 1)

#reg_radio.fit(x_train['Radio'], y_train)
reg_radio.fit(x_train_radio, y_train)
print(reg_radio.score(x_test_radio, y_test) * 100)
print("Coefficient", reg_radio.coef_)
print("Intercept", reg_radio.intercept_)

plt.scatter(x_train_radio, y_train)
predicted_val = 0.12350772 * x_train_radio + 12.46701094292984
plt.plot(x_train_radio, predicted_val)
```

```python
x.shape
```

(200, 3)

```python
x_train.shape
```

(160, 3)

```python
x_train['Radio'].shape
```

(160,)

```python
x_train_radio = np.array(x_train['Radio']).reshape(160, 1)
```

```python
reg_TV = LinearRegression()

x_train_TV = np.array(x_train['TV']).reshape(-1, 1)
x_test_TV = np.array(x_test['TV']).reshape(-1, 1)

reg_TV.fit(x_train_TV, y_train)
print(reg_TV.score(x_test_TV, y_test) * 100)
print("Coefficient", reg_TV.coef_)
print("Intercept", reg_TV.intercept_)

plt.scatter(x_train_TV, y_train)
predicted_val_TV = 0.0544343 * x_train_TV + 7.162275968528906
plt.plot(x_train_TV, predicted_val_TV)
```

```python
reg_Newspaper = LinearRegression()

x_train_Newspaper = np.array(x_train['Newspaper']).reshape(-1, 1)
x_test_Newspaper = np.array(x_test['Newspaper']).reshape(-1, 1)

reg_Newspaper.fit(x_train_Newspaper, y_train)
print(reg_Newspaper.score(x_test_Newspaper, y_test) * 100)
print("Coefficient", reg_Newspaper.coef_)
print("Intercept", reg_Newspaper.intercept_)
```

-4.777401704458972
Coefficient [0.04469941]
Intercept 14.009027446152578

```python
from sklearn.preprocessing import StandardScaler

reg_scaled = LinearRegression()

sc = StandardScaler()
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

reg_scaled.fit(x_train_std, y_train)
print(reg_scaled.score(x_test_std, y_test) * 100)
print("Coefficient", reg_scaled.coef_)
print("Intercept", reg_scaled.intercept_)
```

86.45053923967725
Coefficient [ 4.54624242  1.6383335  -0.07507122]
Intercept 15.355625
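
As a rough sketch of the note above about relative comparison of standardized (beta) coefficients, and assuming the reg_scaled model and the feature order ['TV', 'Radio', 'Newspaper'] from the cells above:

```python
import numpy as np

# assumes reg_scaled and the feature order ['TV', 'Radio', 'Newspaper'] from the cells above
feature_names = ['TV', 'Radio', 'Newspaper']
beta = reg_scaled.coef_  # standardized (beta) coefficients

# rank features by the absolute value of their standardized coefficients
order = np.argsort(np.abs(beta))[::-1]
for i in order:
    print(f"{feature_names[i]}: beta = {beta[i]:.4f}")
```

This only gives a relative ordering on this dataset; it is not a substitute for statistical significance testing.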

Feature engineering

  • A core technique for increasing the value of data

  • A series of steps for transforming and selecting a dataset's variables and creating new ones

  • The process of transforming and improving data to boost the performance of machine learning algorithms

  • Choosing the right feature selection, processing, and scaling techniques makes it possible to build models with high predictive performance

The Feature Engineering Process

  1. Data collection: select and gather the data to analyze
  2. Feature selection: keep the features that are needed and drop the ones that are not
  3. Feature creation: create new features, e.g. multiply height by width to make a new "area" feature (feature extraction / feature transformation)
  4. Feature scaling: adjust the range of feature values so they can be analyzed on the same scale (see the sketch after this list)
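
A minimal sketch of steps 3 and 4, using made-up height/width values (not from the advertising data above) and scikit-learn's MinMaxScaler; the scaler choice is just one option for illustration:

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# hypothetical data (not from the advertising dataset): height and width of a few objects
sample = pd.DataFrame({'height': [2.0, 3.5, 1.2, 4.1],
                       'width':  [1.0, 2.0, 0.8, 3.0]})

# step 3, feature creation: multiply height by width to get a new "area" feature
sample['area'] = sample['height'] * sample['width']

# step 4, feature scaling: rescale every feature to the same [0, 1] range
scaler = MinMaxScaler()
scaled = pd.DataFrame(scaler.fit_transform(sample), columns=sample.columns)
print(scaled)
```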

Feature Selection Methods

  • Filtering

Select a set of candidate features first, then pick out the meaningful ones from among them.

  • Recursive Feature Elimination (RFE)

Build a model with every feature, then remove insignificant features one by one to find the best feature subset (see the sketch after this list).

  • Importance analysis

Use a model to compute each feature's importance, then select features in order of importance.
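
A minimal sketch of recursive feature elimination with scikit-learn's RFE, reusing x (TV, Radio, Newspaper) and y (Sales) from the advertising cells above; the choice of LinearRegression as the estimator and n_features_to_select=2 are assumptions for illustration only:

```python
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# assumes x (TV, Radio, Newspaper) and y (Sales) from the advertising cells above
selector = RFE(estimator=LinearRegression(), n_features_to_select=2)
selector.fit(x, y)

# support_ marks the features that were kept; ranking_ gives the elimination order (1 = kept)
print(dict(zip(x.columns, selector.support_)))
print(dict(zip(x.columns, selector.ranking_)))
```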


Feature Transformation

  • Log transformation, square-root transformation

  • Polynomial expansion (power transform): for [a, b] with a maximum degree of 2, it produces 1, a, b, a², ab, b²

These transforms are particularly useful for skewed distributions (distributions leaning to one side); a short sketch follows.
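
A minimal sketch of both transforms, using scikit-learn's PolynomialFeatures and NumPy's log1p on made-up values:

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# polynomial expansion: [a, b] with a maximum degree of 2 -> 1, a, b, a^2, ab, b^2
X = np.array([[2, 3]])
poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(X))  # [[1. 2. 3. 4. 6. 9.]]

# log transformation: compress a feature with a long right tail (skewed distribution)
skewed = np.array([1, 2, 3, 10, 100, 1000])
print(np.log1p(skewed))
```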

Difficulties

My math background is a bit weak, so I will study it before the next special lecture. During the rest of the vacation I will review the PPT and other materials.

Things to Improve

Some chapters in the PPT are long. It would be nice to have a regular pattern of splitting them into parts with short breaks in between.

What Went Well

It helped to learn the direction for studying AI and the background knowledge it requires. I plan to write an analysis report on the math formulas we learned and submit it for my math subject record.

Other Suggestions

Thank you for the special lecture. I am looking forward to the next one.