# Support vector regression


https://www.saedsayad.com/support_vector_machine_reg.htm

In [1]:
import numpy as np
import csv
import pandas as pd
import matplotlib.pyplot as plt

### A simple nonlinear function
The aim is to create some synthetic data which is not very amenable to linear regression models. We will show how a support vector regressor enhances the predictive performance.

### Generate features and target data for regression

In [2]:
# Load the table and attach a (long, lat) tuple to every row so that rows can be
# grouped by location in the cells below.
df = pd.read_csv("out1.csv")
df["coords"] = list(zip(df["long"], df["lat"]))

# Preview the wide layout: one row per coordinate pair, one column per band.
# (The former mid-cell `df.head()` was dropped: only the last expression of a
# cell is displayed, so its result was silently discarded.)
df.pivot_table(index="coords", columns="band", values="map_rescaled").head()


band,02,03,04,05,06,07,08,11,12,8A
coords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"(-5.54839898, 42.37787055)",0.1845,0.2251,0.2486,0.2987,0.3717,0.3943,0.403,0.3723,0.3017,0.4088
"(-5.54799999, 42.37965659)",0.1978,0.2343,0.2706,0.3143,0.3766,0.3964,0.4002,0.4084,0.3445,0.4048
"(-5.54669242, 42.40362733)",0.1854,0.2335,0.2539,0.3159,0.4198,0.4494,0.4428,0.395,0.3251,0.4586
"(-5.54636819, 42.40427712)",0.1808,0.217,0.2383,0.2893,0.3858,0.4116,0.3833,0.3671,0.2912,0.4241
"(-5.54035797, 42.42232626)",0.18,0.2242,0.2545,0.3083,0.3921,0.4134,0.41,0.411,0.3289,0.4312


In [None]:
# Second copy of the table, used to inspect the `read` column per band.
df1 = pd.read_csv("out1.csv")

# Build the location key from df1's own columns. The previous version zipped
# df["long"] / df["lat"], which only worked because both frames are read from the
# same file and silently depended on the earlier cell having been run.
df1["coords"] = list(zip(df1["long"], df1["lat"]))

# One row per coordinate pair, one column per band, showing `read`.
df1.pivot_table(index="coords", columns="band", values="read").head()

In [None]:
import numpy as np

# Per-location, per-band mean of `read` as a plain 2-D array.
# The aggregation is named by string: passing the callable np.mean emits a
# FutureWarning on pandas 2.x, while "mean" selects the same built-in reduction.
yp = df1.pivot_table(
    index="coords",
    columns="band",
    values="read",
    aggfunc="mean",
).values
yp

In [None]:
import numpy as np

# Feature matrix: one row per coordinate pair, one column per band, holding the
# mean `map_rescaled` value of that band at that location.
# The aggregation is named by string: passing the callable np.mean emits a
# FutureWarning on pandas 2.x, while "mean" selects the same built-in reduction.
X = df.pivot_table(
    index="coords",
    columns="band",
    values="map_rescaled",
    aggfunc="mean",
).values
X

In [None]:
# Dimensions of the feature matrix built from the band pivot (rows = coords, columns = bands).
X.shape

In [None]:
# Target vector: mean `read` value per coordinate pair.
# The column is selected *before* aggregating so that only `read` is averaged.
# Calling .mean() on the whole grouped frame also tries to average non-numeric
# columns such as `band` (labels like "8A"), which raises TypeError on pandas >= 2.0.
# groupby sorts by key, so the rows line up with the pivot-table index used for X.
y = df.groupby("coords")["read"].mean().values


In [None]:
# Shape of the target vector; its length must equal X.shape[0] for the models below.
y.shape

In [None]:
# Peek at the first ten target values.
y[:10]

In [None]:
# Smallest target value. The array method is vectorised and propagates NaN
# deterministically; the builtin min() compares element by element in Python and
# returns an order-dependent result when NaN is present.
y.min()

In [None]:
# Largest target value. The array method is vectorised and propagates NaN
# deterministically; the builtin max() compares element by element in Python and
# returns an order-dependent result when NaN is present.
y.max()

### Plotting the data

### Test/train split

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model

# Ordinary least squares baseline: every sample is predicted by a model that was
# fitted on the other folds of a 10-fold split.
lr = linear_model.LinearRegression()
y_pred = cross_val_predict(
    estimator=lr,
    X=X,
    y=y,
    cv=10,
)

In [None]:
# Observed vs. cross-validated predictions, side by side, then as a scatter plot.
resultados = pd.DataFrame(
    {
        "Y_real": y,
        "Y_pred": y_pred,
    }
)
resultados.plot.scatter(x="Y_real", y="Y_pred")

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor

# Decision-tree baseline evaluated with the same 10-fold out-of-fold scheme.
# The tree is seeded (same seed as the train/test split below) because an
# unseeded DecisionTreeRegressor can produce different trees, and therefore
# different predictions, on each run of the notebook.
tree = DecisionTreeRegressor(random_state=42)
y_pred = cross_val_predict(tree, X, y, cv=10)

In [None]:
# Observed vs. cross-validated predictions for the most recent `y_pred`,
# tabulated and drawn as a scatter plot.
resultados = pd.DataFrame(
    {
        "Y_real": y,
        "Y_pred": y_pred,
    }
)
resultados.plot.scatter(x="Y_real", y="Y_pred")

In [None]:
# Show the correlation coefficient (this cell previously held only a placeholder
# note and computed nothing). Pearson correlation between the observed targets
# and the cross-validated predictions currently stored in `y_pred`.
np.corrcoef(y, y_pred)[0, 1]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Support vector regressor with linear kernel

Here is scikit-learn's SVR doc: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [None]:
from sklearn.svm import SVR
svr_linear = SVR(kernel='linear',gamma='scale', C=1.0, epsilon=0.1)
svr_linear.fit(X_train, y_train) 

### Test score

In [None]:
# Score the linear-kernel SVR on the held-out split (scikit-learn regressors report R^2 from `score`).
svr_linear.score(X_test,y_test)

### Linear regression as a baseline

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least squares model used as the baseline for the SVR variants.
linear = LinearRegression()

In [None]:
# Fit the baseline on the same training split as the SVR models.
linear.fit(X_train,y_train)

In [None]:
# Baseline score on the held-out split, for comparison with the SVR scores.
linear.score(X_test,y_test)

### Support vector regressor with Gaussian (radial basis function) kernel

In [None]:
# RBF-kernel support vector regressor with the same C / epsilon as the linear one.
svr_rbf = SVR(
    kernel='rbf',
    gamma='scale',
    C=1.0,
    epsilon=0.1,
)
svr_rbf.fit(X=X_train, y=y_train)

In [None]:
# Score the RBF-kernel SVR on the held-out split.
svr_rbf.score(X_test,y_test)

So, clearly, the RBF kernel showed better accuracy on the test set

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error on the held-out split for both fitted SVR models.
for label, model in (
    ("RMSE for linear SVR:", svr_linear),
    ("RMSE for RBF kernelized SVR:", svr_rbf),
):
    squared_error = mean_squared_error(y_test, model.predict(X_test))
    print(label, np.sqrt(squared_error))

### We can do a grid search of hyperparameters (with 5-fold cross-validation) to see if the test/validation score can be improved

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the RBF SVR: 15 log-spaced C values from 1e-1 to 1e3 and
# 9 evenly spaced gamma values from 0.1 to 0.9.
params = {
    'C': np.logspace(-1, 3, num=15),
    'gamma': np.linspace(0.1, 0.9, num=9),
}
params

In [None]:
# Alternative grid that also varies the kernel.
# NOTE(review): no visible cell passes `parameters` to a search - confirm whether it is still needed.
parameters = [
    {
        'C': [0.1, 1, 10, 100, 1000],
        'kernel': ['rbf', 'linear'],
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    }
]


In [None]:
# Stand-alone copies of the two axes of the RBF search space, shown for inspection.
C = np.logspace(-1, 3, num=15)
gamma = np.linspace(0.1, 0.9, num=9)
gamma

In [None]:
# 5-fold cross-validated search over `params`, ranking candidates by R^2 and
# keeping the training-fold scores for later inspection.
grid = GridSearchCV(
    estimator=svr_rbf,
    param_grid=params,
    cv=5,
    scoring='r2',
    verbose=1,
    return_train_score=True,
)

In [None]:
# Run the cross-validated search using the training split only, so the test split stays unseen.
grid.fit(X_train,y_train)

### Check which was deemed best estimator by the grid search

In [None]:
# Display the estimator configured with the best-scoring parameter combination found by `grid`.
grid.best_estimator_

### Fit that estimator to the data and see

In [None]:
# Use the estimator the grid search actually selected.
# The previous hard-coded SVR(C=5.0, gamma='scale', epsilon=1) did not correspond to
# any candidate in the searched grid (C was log-spaced, gamma ranged over 0.1-0.9 and
# epsilon was never searched), so it ignored the search result.
# GridSearchCV refits the winning configuration on the full training split by
# default, so `best_estimator_` is already fitted on X_train / y_train.
svr_best = grid.best_estimator_
svr_best

In [None]:
# Score of `svr_best` on the held-out split.
svr_best.score(X_test,y_test)

In [None]:
# Root-mean-squared error of `svr_best` on the held-out split.
rmse_best = np.sqrt(mean_squared_error(y_test, svr_best.predict(X_test)))
print("RMSE for RBF kernelized SVR:", rmse_best)

In [None]:
# Compare observed targets with the model's predictions, sample by sample.
# X holds one column per band, so it cannot be scattered directly against the 1-D
# target (matplotlib raises "x and y must be the same size"); both series are
# therefore drawn against the sample position instead.
sample_idx = np.arange(len(y))
plt.scatter(sample_idx, y, color='red', label='Observed')
plt.plot(sample_idx, svr_best.predict(X), color='blue', label='Predicted')
plt.title('Scatter plot(SVR RBF)')
plt.xlabel('Muestra')
plt.ylabel('CoC')
plt.legend()
plt.show()

### Support vector regressor with polynomial kernel

In [None]:
# Degree-2 polynomial-kernel support vector regressor, fitted on the training split.
svr_poly = SVR(
    kernel='poly',
    gamma='scale',
    C=1.0,
    degree=2,
)
svr_poly.fit(X=X_train, y=y_train)

In [None]:
# Score the polynomial-kernel SVR on the held-out split.
svr_poly.score(X_test,y_test)

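So, it is not yet clear ("no sé") whether the polynomial kernel improves on the previous models; the RMSE comparison below helps decide.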

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Root-mean-squared error on the held-out split for the three fitted SVR models.
# The third label used to read "RBF kernelized poly", which mislabelled the
# polynomial-kernel model as an RBF one.
print("RMSE for linear SVR:",np.sqrt(mean_squared_error(y_test,svr_linear.predict(X_test))))
print("RMSE for RBF kernelized SVR:",np.sqrt(mean_squared_error(y_test,svr_rbf.predict(X_test))))
print("RMSE for polynomial kernelized SVR:",np.sqrt(mean_squared_error(y_test,svr_poly.predict(X_test))))

### We can do a grid search of hyperparameters (with 5-fold cross-validation) to see if the test/validation score can be improved

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the polynomial SVR: regularisation strength and polynomial degree.
# Note that this rebinds `params`, replacing the RBF grid defined earlier.
params = {
    'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
    'degree': [2, 3, 4, 5],
}

In [None]:
# 5-fold cross-validated search over `params` for the polynomial SVR, ranked by R^2.
gridp = GridSearchCV(
    estimator=svr_poly,
    param_grid=params,
    cv=5,
    scoring='r2',
    verbose=1,
    return_train_score=True,
)

In [None]:
# Run the polynomial-kernel search using the training split only.
gridp.fit(X_train,y_train)

### Check which was deemed best estimator by the grid search

In [None]:
# Display the estimator configured with the best-scoring parameter combination found by `gridp`.
gridp.best_estimator_

### Fit that estimator to the data and see

In [None]:
# Use the estimator the grid search actually selected instead of a hand-copied
# parameter set, so this cell stays correct if the data or the grid changes.
# GridSearchCV refits the winning configuration on the full training split by
# default, so `best_estimator_` is already fitted on X_train / y_train.
svrp_best = gridp.best_estimator_
svrp_best

In [None]:
# Score of `svrp_best` on the held-out split.
svrp_best.score(X_test,y_test)

In [None]:
# Root-mean-squared error of `svrp_best` on the held-out split.
rmse_poly_best = np.sqrt(mean_squared_error(y_test, svrp_best.predict(X_test)))
print("RMSE for PoLY kernelized SVR:", rmse_poly_best)

In [None]:
# Compare observed targets with the model's predictions, sample by sample.
# X holds one column per band, so it cannot be scattered directly against the 1-D
# target (matplotlib raises "x and y must be the same size"); both series are
# therefore drawn against the sample position instead.
sample_idx = np.arange(len(y))
plt.scatter(sample_idx, y, color='red', label='Observed')
plt.plot(sample_idx, svrp_best.predict(X), color='blue', label='Predicted')
plt.title('Scatter plot(SVR poly)')
plt.xlabel('Muestra')
plt.ylabel('CoC')
plt.legend()
plt.show()