### Multivariate Linear Regression Task
- 구매자의 연수입(Annual Salary)과 순자산(자산 - 부채, Net Worth), 신용카드 부채(Credit Card Debt)를 통해 다변량 회귀 분석 진행

In [1]:
import pandas as pd

c_df = pd.read_csv('./datasets/car_purchasing.csv')
c_df

Unnamed: 0,Customer Name,Customer e-mail,Country,Gender,Age,Annual Salary,Credit Card Debt,Net Worth,Car Purchase Amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,USA,0,42,62812.09301,11609.380910,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,USA,0,41,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,USA,1,43,53798.55112,11160.355060,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,USA,1,58,79370.03798,14426.164850,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,USA,1,57,59729.15130,5358.712177,560304.0671,55915.46248
...,...,...,...,...,...,...,...,...,...
495,Walter,ligula@Cumsociis.ca,USA,0,41,71942.40291,6995.902524,541670.1016,48901.44342
496,Vanna,Cum.sociis.natoque@Sedmolestie.edu,USA,1,38,56039.49793,12301.456790,360419.0988,31491.41457
497,Pearl,penatibus.et@massanonante.com,USA,1,54,68888.77805,10611.606860,764531.3203,64147.28888
498,Nell,Quisque.varius@arcuVivamussit.net,USA,1,59,49811.99062,14013.034510,337826.6382,45442.15353


In [2]:
pre_c_df = c_df.drop(labels=['Customer Name', 'Customer e-mail', 'Country'], axis=1)
pre_c_df

Unnamed: 0,Gender,Age,Annual Salary,Credit Card Debt,Net Worth,Car Purchase Amount
0,0,42,62812.09301,11609.380910,238961.2505,35321.45877
1,0,41,66646.89292,9572.957136,530973.9078,45115.52566
2,1,43,53798.55112,11160.355060,638467.1773,42925.70921
3,1,58,79370.03798,14426.164850,548599.0524,67422.36313
4,1,57,59729.15130,5358.712177,560304.0671,55915.46248
...,...,...,...,...,...,...
495,0,41,71942.40291,6995.902524,541670.1016,48901.44342
496,1,38,56039.49793,12301.456790,360419.0988,31491.41457
497,1,54,68888.77805,10611.606860,764531.3203,64147.28888
498,1,59,49811.99062,14013.034510,337826.6382,45442.15353


In [3]:
pre_c_df.corr()['Car Purchase Amount'].sort_values(ascending=False)[1:]

Age                 0.633273
Annual Salary       0.617862
Net Worth           0.488580
Credit Card Debt    0.028882
Gender             -0.066408
Name: Car Purchase Amount, dtype: float64

In [4]:
pre_c_df = pre_c_df.drop(labels=['Gender'], axis=1)
pre_c_df

Unnamed: 0,Age,Annual Salary,Credit Card Debt,Net Worth,Car Purchase Amount
0,42,62812.09301,11609.380910,238961.2505,35321.45877
1,41,66646.89292,9572.957136,530973.9078,45115.52566
2,43,53798.55112,11160.355060,638467.1773,42925.70921
3,58,79370.03798,14426.164850,548599.0524,67422.36313
4,57,59729.15130,5358.712177,560304.0671,55915.46248
...,...,...,...,...,...
495,41,71942.40291,6995.902524,541670.1016,48901.44342
496,38,56039.49793,12301.456790,360419.0988,31491.41457
497,54,68888.77805,10611.606860,764531.3203,64147.28888
498,59,49811.99062,14013.034510,337826.6382,45442.15353


In [20]:
import torch
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train1 = torch.FloatTensor(X_train.Age.values).view(-1, 1)
X_train2 = torch.FloatTensor(X_train['Annual Salary'].values).view(-1, 1)
X_train3 = torch.FloatTensor(X_train['Credit Card Debt'].values).view(-1, 1)
X_train4 = torch.FloatTensor(X_train['Net Worth'].values).view(-1, 1)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test1 = torch.FloatTensor(X_test.Age.values).view(-1, 1)
X_test2 = torch.FloatTensor(X_test['Annual Salary'].values).view(-1, 1)
X_test3 = torch.FloatTensor(X_test['Credit Card Debt'].values).view(-1, 1)
X_test4 = torch.FloatTensor(X_test['Net Worth'].values).view(-1, 1)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

W1 = torch.zeros(1, requires_grad=True)
W2 = torch.zeros(1, requires_grad=True)
W3 = torch.zeros(1, requires_grad=True)
W4 = torch.zeros(1, requires_grad=True)
b = torch.zeros(1, requires_grad=True)

optimizer = SGD([W1, W2, W3, W4, b], lr=1e-12)

epochs = 100000

for epoch in range(1, epochs + 1):
    H = W1 * X_train1 + W2 * X_train2 + W3 * X_train3 + W4 * X_train4 + b
    loss = torch.mean((y_train - H) ** 2)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print('{:4d}/{}: W1: {:.4f}, W2: {:.4f}, W3: {:.4f}, W4: {:.4f}, b: {:.4f}, loss: {:.4f}'\
        .format(epoch, epochs, W1.item(), W2.item(), W3.item(), W4.item(), b.item(), loss.item()))

1000/100000: W1: 0.0003, W2: 0.3670, W3: 0.0487, W4: 0.0458, b: 0.0000, loss: 58160424.0000
2000/100000: W1: 0.0005, W2: 0.4668, W3: 0.0600, W4: 0.0331, b: 0.0000, loss: 46555268.0000
3000/100000: W1: 0.0006, W2: 0.4949, W3: 0.0615, W4: 0.0296, b: 0.0000, loss: 45642228.0000
4000/100000: W1: 0.0007, W2: 0.5030, W3: 0.0603, W4: 0.0286, b: 0.0000, loss: 45565744.0000
5000/100000: W1: 0.0008, W2: 0.5055, W3: 0.0583, W4: 0.0283, b: 0.0000, loss: 45554964.0000
6000/100000: W1: 0.0009, W2: 0.5064, W3: 0.0562, W4: 0.0283, b: 0.0000, loss: 45549588.0000
7000/100000: W1: 0.0010, W2: 0.5069, W3: 0.0541, W4: 0.0283, b: 0.0000, loss: 45544884.0000
8000/100000: W1: 0.0011, W2: 0.5072, W3: 0.0520, W4: 0.0283, b: 0.0000, loss: 45540476.0000
9000/100000: W1: 0.0012, W2: 0.5075, W3: 0.0500, W4: 0.0283, b: 0.0000, loss: 45536308.0000
10000/100000: W1: 0.0013, W2: 0.5078, W3: 0.0480, W4: 0.0283, b: 0.0000, loss: 45532364.0000
11000/100000: W1: 0.0013, W2: 0.5081, W3: 0.0461, W4: 0.0283, b: 0.0000, loss: 

In [23]:
H = 0.0099 * X_test1 + 0.5165 * X_test2 - 0.0146 * X_test3 + 0.0284 * X_test4
loss = torch.mean((y_test - H) ** 2)
print(loss.item())

44900548.0


In [22]:
import torch
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

W = torch.zeros((4, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

optimizer = SGD([W, b], lr=1e-12)

epochs = 100000

for epoch in range(1, epochs + 1):
    H = X_train.matmul(W) + b
    loss = torch.mean((y_train - H) ** 2)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print('{:4d}/{}: W1: {:.4f}, W2: {:.4f}, W3: {:.4f}, W4: {:.4f}, b: {:.4f}, loss: {:.4f}'\
        .format(epoch, epochs, W[0].item(), W[1].item(), W[2].item(), W[3].item(), b.item(), loss.item()))

1000/100000: W1: 0.0003, W2: 0.3670, W3: 0.0487, W4: 0.0458, b: 0.0000, loss: 58160424.0000
2000/100000: W1: 0.0005, W2: 0.4668, W3: 0.0600, W4: 0.0331, b: 0.0000, loss: 46555276.0000
3000/100000: W1: 0.0006, W2: 0.4949, W3: 0.0615, W4: 0.0296, b: 0.0000, loss: 45642240.0000
4000/100000: W1: 0.0007, W2: 0.5030, W3: 0.0603, W4: 0.0286, b: 0.0000, loss: 45565748.0000
5000/100000: W1: 0.0008, W2: 0.5055, W3: 0.0583, W4: 0.0283, b: 0.0000, loss: 45554964.0000
6000/100000: W1: 0.0009, W2: 0.5064, W3: 0.0562, W4: 0.0283, b: 0.0000, loss: 45549588.0000
7000/100000: W1: 0.0010, W2: 0.5069, W3: 0.0541, W4: 0.0283, b: 0.0000, loss: 45544884.0000
8000/100000: W1: 0.0011, W2: 0.5072, W3: 0.0520, W4: 0.0283, b: 0.0000, loss: 45540476.0000
9000/100000: W1: 0.0012, W2: 0.5075, W3: 0.0500, W4: 0.0283, b: 0.0000, loss: 45536308.0000
10000/100000: W1: 0.0013, W2: 0.5078, W3: 0.0480, W4: 0.0283, b: 0.0000, loss: 45532368.0000
11000/100000: W1: 0.0013, W2: 0.5081, W3: 0.0461, W4: 0.0283, b: 0.0000, loss: 

In [24]:
H = 0.0099 * X_test1 + 0.5165 * X_test2 - 0.0146 * X_test3 + 0.0284 * X_test4
loss = torch.mean((y_test - H) ** 2)
print(loss.item())

44900548.0


In [35]:
import torch
from torch.nn import Linear
from torch.nn.functional import mse_loss
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

l_r = Linear(4, 1)

optimizer = SGD(l_r.parameters(), lr=1e-12)

epochs = 100000

for epoch in range(1, epochs + 1):
    H = l_r(X_train)
    loss = mse_loss(y_train, H)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print('{:4d}/{}: '\
              .format(epoch, epochs, W[0].item(), W[1].item(), W[2].item(), W[3].item(), b.item(), loss.item()), end='')
        for i, w in enumerate(list(l_r.parameters())[0][0]):
            print('W{}: {:.4f}, '\
                  .format(i + 1, w.item()), end='')
        print('b: {:.4f}, loss: {:.4f}'.format(list(l_r.parameters())[1].item(), loss.item()))

1000/100000: W1: -0.3472, W2: 0.3735, W3: 0.3629, W4: 0.0390, b: 0.0123, loss: 52993284.0000
2000/100000: W1: -0.3470, W2: 0.4391, W3: 0.3607, W4: 0.0309, b: 0.0123, loss: 48050136.0000
3000/100000: W1: -0.3469, W2: 0.4585, W3: 0.3524, W4: 0.0286, b: 0.0123, loss: 47554508.0000
4000/100000: W1: -0.3468, W2: 0.4649, W3: 0.3426, W4: 0.0280, b: 0.0123, loss: 47412760.0000
5000/100000: W1: -0.3468, W2: 0.4678, W3: 0.3326, W4: 0.0278, b: 0.0123, loss: 47304416.0000
6000/100000: W1: -0.3467, W2: 0.4695, W3: 0.3228, W4: 0.0278, b: 0.0123, loss: 47204076.0000
7000/100000: W1: -0.3466, W2: 0.4710, W3: 0.3132, W4: 0.0278, b: 0.0123, loss: 47109452.0000
8000/100000: W1: -0.3465, W2: 0.4723, W3: 0.3038, W4: 0.0278, b: 0.0123, loss: 47020100.0000
9000/100000: W1: -0.3464, W2: 0.4736, W3: 0.2947, W4: 0.0278, b: 0.0123, loss: 46935696.0000
10000/100000: W1: -0.3463, W2: 0.4748, W3: 0.2859, W4: 0.0278, b: 0.0123, loss: 46855988.0000
11000/100000: W1: -0.3462, W2: 0.4760, W3: 0.2773, W4: 0.0279, b: 0.0

In [36]:
H = -0.3383 * X_test1 + 0.5141 * X_test2 + 0.0039 * X_test3 + 0.0283 * X_test4 + 0.0123
loss = torch.mean((y_test - H) ** 2)
print(loss.item())

44949920.0


In [43]:
from torch.nn import Module, Linear

class LinearRegressionModel(Module):
    def __init__(self):
        super().__init__()
        self.linear = Linear(4, 1)

    def forward(self, x):
        return self.linear(x)

In [58]:
import torch
from torch.nn import Linear
from torch.nn.functional import mse_loss
from torch.optim import SGD
from sklearn.model_selection import train_test_split

torch.manual_seed(124)

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

X_train = torch.FloatTensor(X_train.values)
y_train = torch.FloatTensor(y_train.values).view(-1, 1)

X_test = torch.FloatTensor(X_test.values)
y_test = torch.FloatTensor(y_test.values).view(-1, 1)

l_r = LinearRegressionModel()

optimizer = SGD(l_r.parameters(), lr=1e-12)

epochs = 10000000

for epoch in range(1000000, epochs + 1):
    H = l_r(X_train)
    loss = mse_loss(y_train, H)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10000 == 0:
        print('{:4d}/{}: '\
              .format(epoch, epochs, W[0].item(), W[1].item(), W[2].item(), W[3].item(), b.item(), loss.item()), end='')
        for i, w in enumerate(list(l_r.parameters())[0][0]):
            print('W{}: {:.4f}, '\
                  .format(i + 1, w.item()), end='')
        print('b: {:.4f}, loss: {:.4f}'.format(list(l_r.parameters())[1].item(), loss.item()))

1000000/10000000: W1: -0.3474, W2: 0.1368, W3: 0.3415, W4: 0.0160, b: 0.0123, loss: 2113092608.0000
1001000/10000000: W1: -0.3472, W2: 0.3736, W3: 0.3629, W4: 0.0390, b: 0.0123, loss: 52979852.0000
1002000/10000000: W1: -0.3470, W2: 0.4391, W3: 0.3607, W4: 0.0309, b: 0.0123, loss: 48048976.0000
1003000/10000000: W1: -0.3469, W2: 0.4585, W3: 0.3524, W4: 0.0286, b: 0.0123, loss: 47554304.0000
1004000/10000000: W1: -0.3468, W2: 0.4649, W3: 0.3426, W4: 0.0280, b: 0.0123, loss: 47412644.0000
1005000/10000000: W1: -0.3468, W2: 0.4678, W3: 0.3326, W4: 0.0278, b: 0.0123, loss: 47304316.0000
1006000/10000000: W1: -0.3467, W2: 0.4695, W3: 0.3228, W4: 0.0278, b: 0.0123, loss: 47203972.0000
1007000/10000000: W1: -0.3466, W2: 0.4710, W3: 0.3132, W4: 0.0278, b: 0.0123, loss: 47109364.0000
1008000/10000000: W1: -0.3465, W2: 0.4723, W3: 0.3038, W4: 0.0278, b: 0.0123, loss: 47020012.0000
1009000/10000000: W1: -0.3464, W2: 0.4736, W3: 0.2947, W4: 0.0278, b: 0.0123, loss: 46935620.0000
1010000/10000000: 

In [45]:
H = -0.3383 * X_test1 + 0.5141 * X_test2 + 0.0039 * X_test3 + 0.0283 * X_test4 + 0.0123
loss = torch.mean((y_test - H) ** 2)
print(loss.item())

44949920.0


#### Sklearn

In [47]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

features, targets = pre_c_df.iloc[:, :-1], pre_c_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

l_r = LinearRegression()
l_r.fit(X_train, y_train)
# 가중치: l_r.coef_[0]
# 편향: l_r.intercept_
print('W: {:.4f}, b: {:.4f}'.format(l_r.coef_[0], l_r.intercept_))

W: 839.4740, b: -42002.8280


In [48]:
from sklearn.metrics import mean_squared_error

prediction = l_r.predict(X_test)
print('MSE loss: {:.4f}, RMSE loss: {:.4f}'\
      .format(mean_squared_error(y_test, prediction), 
      np.sqrt(mean_squared_error(y_test, prediction))))

MSE loss: 56987.2571, RMSE loss: 238.7200
