# Support Vector Regression (SVR)

### Step 1 - Import libraries

In [1]:
# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Sklearn
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization

# Other utilities
import sys
import os
import getpass
import chart_studio
import chart_studio.plotly as py

# Assign main directory to a variable
main_dir=os.path.dirname(sys.path[0])
print(main_dir)

/Users/solclover/Documents/Python/SAS_to_Python/015_ML_SVR


### Step 2 - Get the data for our model (from Kaggle)

For this analysis, we will use Kaggle house price data which you can download from here https://www.kaggle.com/quantbruce/real-estate-price-prediction?select=Real+estate.csv

After downloading it, we read csv into a Pandas dataframe

In [12]:
df = pd.read_csv(main_dir+'/data/Real estate.csv', encoding='utf-8')

# Use MinMax scaling on X2 and X3
scaler=MinMaxScaler()
df['X2 house age (scaled)']=scaler.fit_transform(df[['X2 house age']])
df['X3 distance to the nearest MRT station (scaled)']=scaler.fit_transform(df[['X3 distance to the nearest MRT station']])
df

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area,X2 house age (scaled),X3 distance to the nearest MRT station (scaled)
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9,0.730594,0.009513
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2,0.445205,0.043809
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3,0.303653,0.083315
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8,0.303653,0.083315
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1,0.114155,0.056799
...,...,...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4,0.312785,0.627820
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0,0.127854,0.010375
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6,0.429224,0.056861
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5,0.184932,0.012596


### Step 3 - Linear Regression vs. SVR (1 independent variable)

##### Let's create a scatter plot showing the distance from nearest MRT station (independent variable) and house price per unit area (dependent a.k.a. target variable)

In [13]:
# Create a scatter plot
fig = px.scatter(df, x=df['X3 distance to the nearest MRT station'], y=df['Y house price of unit area'], 
                 opacity=0.8, color_discrete_sequence=['black'])

# Change chart background color
fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
fig.update_layout(title=dict(text="House Price Based on Distance from the Nearest MRT", 
                             font=dict(color='black')))

# Update marker size
fig.update_traces(marker=dict(size=3))

fig.show()

##### Select features and model parameters. Then train the models and predict results

In [14]:
# ------- Select variables -------
# Note, we need X to be a 2D array, hence reshape
X=df['X3 distance to the nearest MRT station'].values.reshape(-1,1)
y=df['Y house price of unit area'].values

# ------- Linear regression -------
model1 = LinearRegression()
lr = model1.fit(X, y)

# ------- Support Vector regression -------
model2 = SVR(kernel='rbf', C=1, epsilon=10) # set kernel and hyperparameters
svr = model2.fit(X, y)

# ------- Predict a range of values based on the models for visualization -------
# Create 100 evenly spaced points from smallest X to largest X
x_range = np.linspace(X.min(), X.max(), 100)

# Predict y values for our set of X values
y_lr = model1.predict(x_range.reshape(-1, 1)) # Linear regression
y_svr = model2.predict(x_range.reshape(-1, 1)) # SVR

##### Print slope and intercept for the LR model and a summary for SVR model

In [16]:
print("Simple Linear Regression Model")
print("--------------------------------------")
print("Intercept: ", lr.intercept_)
print("Slope: ", lr.coef_)
print("")
print("<><><><><><><><><><><><><><><><><><><>")
print("")
print("SVR Model")
print("--------------------------------------")
print("Support: ", svr.support_)
print("Support Vectors: ", svr.support_vectors_)
print("Dual Coefficient: ", svr.dual_coef_)
#print("Coefficient: ", svr.coef_) # only available for linear kernel
print("Fit Status: ", svr.fit_status_)
print("Intercept: ", svr.intercept_)

Simple Linear Regression Model
--------------------------------------
Intercept:  45.851427057774984
Slope:  [-0.00726205]

<><><><><><><><><><><><><><><><><><><>

SVR Model
--------------------------------------
Support:  [  3  11  16  26  27  29  31  44  47  48  49  55  61  63  64  67  70  74
  78  83  88  91  93  96  99 101 104 105 113 116 117 119 120 122 124 126
 127 128 135 148 156 160 162 166 170 171 172 181 199 213 220 223 226 228
 229 231 233 236 247 248 249 250 251 255 256 258 269 270 271 273 285 286
 287 292 297 305 309 312 319 322 326 330 334 337 344 347 355 361 377 379
 384 389 402 409 413]
Support Vectors:  [[ 561.9845 ]
 [  90.45606]
 [ 292.9978 ]
 [ 383.8624 ]
 [ 276.449  ]
 [ 451.2438 ]
 [ 769.4034 ]
 [ 533.4762 ]
 [ 640.7391 ]
 [4605.749  ]
 [4510.359  ]
 [1160.632  ]
 [ 259.6607 ]
 [ 533.4762 ]
 [ 995.7554 ]
 [ 104.8101 ]
 [  90.45606]
 [ 379.5575 ]
 [ 552.4371 ]
 [2707.392  ]
 [1406.43   ]
 [1402.016  ]
 [1146.329  ]
 [  90.45606]
 [  90.45606]
 [ 170.1289 ]
 [ 392.4

##### Visualize the two models

In [15]:
# Create a scatter plot
fig = px.scatter(df, x=df['X3 distance to the nearest MRT station'], y=df['Y house price of unit area'], 
                 opacity=0.8, color_discrete_sequence=['black'])

# Add a best-fit line
fig.add_traces(go.Scatter(x=x_range, y=y_lr, name='Linear Regression', line=dict(color='limegreen')))
fig.add_traces(go.Scatter(x=x_range, y=y_svr, name='Support Vector Regression', line=dict(color='red')))
fig.add_traces(go.Scatter(x=x_range, y=y_svr+10, name='+epsilon', line=dict(color='red', dash='dot')))
fig.add_traces(go.Scatter(x=x_range, y=y_svr-10, name='-epsilon', line=dict(color='red', dash='dot')))

# Change chart background color
fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
fig.update_layout(title=dict(text="House Price Based on Distance from the Nearest MRT with Model Predictions (epsilon=10, C=1)", 
                             font=dict(color='black')))
# Update marker size
fig.update_traces(marker=dict(size=3))

fig.show()

### Recreate the same with epsilon=10 and C=1000

In [17]:
model1 = LinearRegression()
model2 = SVR(kernel='rbf', C=1000, epsilon=10)

lr = model1.fit(X, y)
svr = model2.fit(X, y)

# Create 20 evenly spaced points from smallest X to largest X
x_range = np.linspace(X.min(), X.max(), 100) 

# Predict y values for our set of X values
y_lr = model1.predict(x_range.reshape(-1, 1))
y_svr = model2.predict(x_range.reshape(-1, 1))

In [18]:
# Create a scatter plot
fig = px.scatter(df, x=df['X3 distance to the nearest MRT station'], y=df['Y house price of unit area'], 
                 opacity=0.8, color_discrete_sequence=['black'])

# Add a best-fit line
fig.add_traces(go.Scatter(x=x_range, y=y_lr, name='Linear Regression', line=dict(color='limegreen')))
fig.add_traces(go.Scatter(x=x_range, y=y_svr, name='Support Vector Regression', line=dict(color='red')))
#fig.add_traces(go.Scatter(x=x_range, y=y_svr+10, name='SVR +epsilon', line=dict(color='red', dash='dash')))
fig.add_traces(go.Scatter(x=x_range, y=y_svr+10, name='+epsilon', line=dict(color='red', dash='dot')))
fig.add_traces(go.Scatter(x=x_range, y=y_svr-10, name='-epsilon', line=dict(color='red', dash='dot')))

# Change chart background color
fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
fig.update_layout(title=dict(text="House Price Based on Distance from the Nearest MRT with Model Predictions (epsilon=10, C=1000)", 
                             font=dict(color='black')))

# Update marker size
fig.update_traces(marker=dict(size=3))

fig.show()

### Step 4 - Linear Regression vs. SVR (2 independent variables)

First, let's create a 3D scatter plot containing distance from MRT and house age (our two independent variables) and house price per unit area (our dependent a.k.a target variable)

In [48]:
# Create a 3D scatter plot
fig = px.scatter_3d(df, 
                    x=df['X3 distance to the nearest MRT station (scaled)'], 
                    y=df['X2 house age (scaled)'], 
                    z=df['Y house price of unit area'], 
                    opacity=0.8, color_discrete_sequence=['black'],
                    height=900, width=1000
                   )

# Set figure title
fig.update_layout(title_text="Scatter 3D Plot",
                  scene_camera_eye=dict(x=1.5, y=1.5, z=0.25), 
                  scene_camera_center=dict(x=0, y=0, z=-0.2),
                  scene = dict(xaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='lightgrey'),
                               yaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='lightgrey'
                                          ),
                               zaxis=dict(backgroundcolor='white',
                                          color='black', 
                                          gridcolor='lightgrey')))

# Update marker size
fig.update_traces(marker=dict(size=2))
fig.show()

# Export chart to HTML file
#fig.write_html('Scatter_3D.html')

In [44]:
# ----------- Select variables -----------
X=df[['X3 distance to the nearest MRT station (scaled)','X2 house age (scaled)']]
y=df['Y house price of unit area'].values

# ----------- Model fitting -----------
# Define models and set hyperparameter values
model1 = LinearRegression()
model2 = SVR(kernel='rbf', C=100, epsilon=1)

# Fit the two models 
lr = model1.fit(X, y)
svr = model2.fit(X, y)

# ----------- For creating a prediciton plane to be used in the visualization -----------
# Set Increments between points in a meshgrid
mesh_size = 0.05

# Identify min and max values for input variables
x_min, x_max = X['X3 distance to the nearest MRT station (scaled)'].min(), X['X3 distance to the nearest MRT station (scaled)'].max()
y_min, y_max = X['X2 house age (scaled)'].min(), X['X2 house age (scaled)'].max()

# Return evenly spaced values based on a range between min and max
xrange = np.arange(x_min, x_max, mesh_size)
yrange = np.arange(y_min, y_max, mesh_size)

# Create a meshgrid
xx, yy = np.meshgrid(xrange, yrange)

# ----------- Create a prediciton plane  -----------
# Use models to create a prediciton plane --- Linear Regression
pred_LR = model1.predict(np.c_[xx.ravel(), yy.ravel()])
pred_LR = pred_LR.reshape(xx.shape)

# Use models to create a prediciton plane --- SVR
pred_svr = model2.predict(np.c_[xx.ravel(), yy.ravel()])
pred_svr = pred_svr.reshape(xx.shape)

# Note, .ravel() flattens the array to a 1D array,
# then np.c_ takes elements from flattened xx and yy arrays and puts them together,
# this creates the right shape required for model input

# prediction array that is created by the model output is a 1D array,
# Hence, we need to reshape it to be the same shape as xx or yy to be able to display it on a graph


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but SVR was fitted with feature names



##### Print results for Linear Regression

In [61]:
# Create a 3D scatter plot with predictions
fig = px.scatter_3d(df, x=df['X3 distance to the nearest MRT station (scaled)'], y=df['X2 house age (scaled)'], z=df['Y house price of unit area'], 
                    opacity=0.8, color_discrete_sequence=['black'],
                    width=1000, height=900
                   )

# Set figure title and colors
fig.update_layout(title_text="Scatter 3D Plot with Linear Regression Prediction Surface",
                  scene_camera_eye=dict(x=1.5, y=1.5, z=0.25), 
                  scene_camera_center=dict(x=0, y=0, z=-0.2),
                  scene = dict(xaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                         ),
                               yaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                          ),
                               zaxis=dict(backgroundcolor='white',
                                          color='black', 
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                         )))
# Update marker size
fig.update_traces(marker=dict(size=2))

# Add prediction plane
fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred_LR, name='LR', 
                          colorscale=px.colors.sequential.Sunsetdark, reversescale=False, showscale=False))

fig.show()

In [60]:
# Create a 3D scatter plot with predictions
fig = px.scatter_3d(df, x=df['X3 distance to the nearest MRT station (scaled)'], y=df['X2 house age (scaled)'], z=df['Y house price of unit area'], 
                    opacity=0.8, color_discrete_sequence=['black'],
                    width=1000, height=900
                   )

# Set figure title and colors
fig.update_layout(title_text="Scatter 3D Plot with SVR Prediction Surface",
                  scene_camera_eye=dict(x=1.5, y=1.5, z=0.25), 
                  scene_camera_center=dict(x=0, y=0, z=-0.2),
                  scene = dict(xaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                         ),
                               yaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                          ),
                               zaxis=dict(backgroundcolor='white',
                                          color='black', 
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                         )))
# Update marker size
fig.update_traces(marker=dict(size=2))

# Add prediction plane
fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred_svr, name='SVR',
                          colorscale=px.colors.sequential.Sunsetdark, reversescale=False,
                          showscale=False))

fig.show()

### Step 5 - Creata chart to be embedded to my Medium story

In [39]:
# Input Username and Password for Chart-Studio
print('Input Chart-Studio Username: ')
CS_user = input()
print('Input Chart-Studio API Key: ')
CS_api = getpass.getpass()

# Set Chart-Studio Credentials
chart_studio.tools.set_credentials_file(username=CS_user, api_key=CS_api)

Input Chart-Studio Username: 
SolClover
Input Chart-Studio API Key: 
········


In [62]:
# Create a 3D scatter plot with predictions
fig = px.scatter_3d(df, x=df['X3 distance to the nearest MRT station (scaled)'], y=df['X2 house age (scaled)'], z=df['Y house price of unit area'], 
                    opacity=0.8, color_discrete_sequence=['black'],
                    width=900, height=900
                   )

# Set figure title and colors
fig.update_layout(#title_text="Scatter 3D Plot with SVR Prediction Surface",
                  scene_camera_eye=dict(x=1.5, y=1.5, z=0.25), 
                  scene_camera_center=dict(x=0, y=0, z=-0.2),
                  margin=dict(l=0, r=0, b=0, t=0),
                  scene = dict(xaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                         ),
                               yaxis=dict(backgroundcolor='white',
                                          color='black',
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                          ),
                               zaxis=dict(backgroundcolor='white',
                                          color='black', 
                                          gridcolor='#f0f0f0',
                                          title_font=dict(size=10),
                                          tickfont=dict(size=10),
                                         )))
# Update marker size
fig.update_traces(marker=dict(size=2))

# Add prediction plane
fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred_svr, name='SVR',
                          colorscale=px.colors.sequential.Sunsetdark, reversescale=False,
                          showscale=False))

fig.show()

In [63]:
# Export chart to Plotly Chart Studio - too big, does not allow to upload
py.plot(fig, filename = 'Support Vector Regression', auto_open=True)

'https://plotly.com/~SolClover/96/'

# End of Program