# Support Vector Regressor Model 

In this notebook, I trained a support vector machine on 2 different datasets. These datasets are made different due to their independent variables and the method of handling categorical columns.

In [3]:
import pandas as pd
from sklearn.svm import SVR
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

## First Dataset

In [15]:
# Load the data file into a DataFrame
df = pd.read_csv('../data/dataset_with_encoded_location.zip', compression='zip')
df.head()

Unnamed: 0,bath,balcony,price,House_size,new_total_sqft,L1,L2,L3,L4,L5,...,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16
0,2,3,62,3,1440,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,95,3,1521,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,51,2,1200,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,63,3,1310,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,2,70,3,1800,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X = df.drop('price', axis=1)
y= df['price']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [18]:
X_train.shape

(5596, 20)

### Iteration

In [8]:
reg = make_pipeline(StandardScaler(), SVR())

params = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': [0.01, 0.1, 1, 10, 100],
    'svr__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(reg, params, cv=5)

grid_search.fit(X_train, y_train)

In [9]:
# Best model parameters
grid_search.best_params_

{'svr__C': 100, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}

In [10]:
# Best estimator
best_model = grid_search.best_estimator_

In [11]:
# Evaluate model performance

predictions = best_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print(f'Mean Square Error: {mse:.5f}')

rmse = np.sqrt(mse)
print(f'Root Mean Square Error: {rmse:.5f}')

score = grid_search.best_score_
print(f'R_Squared Score: {score:.5f}')

Mean Square Error: 8.76006
Root Mean Square Error: 2.95974
R_Squared Score: 0.98935


## Second Dataset

In [14]:
# Load the data file into a DataFrame
data = pd.read_csv('../data/encoded_dataset.zip', compression='zip', index_col=0)
data.head()

Unnamed: 0,bath,balcony,price,House_size,new_total_sqft,location_Bannerghatta Road,location_Electronic City,location_Haralur Road,location_Hebbal,location_Hennur Road,location_Kanakpura Road,location_Marathahalli,location_Other,location_Raja Rajeshwari Nagar,location_Sarjapur Road,location_Thanisandra,location_Uttarahalli,location_Whitefield,location_Yelahanka
2,2.0,3.0,62.0,3.0,1440.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,3.0,1.0,95.0,3.0,1521.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,2.0,1.0,51.0,2.0,1200.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
8,3.0,1.0,63.25,3.0,1310.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
10,2.0,2.0,70.0,3.0,1800.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [19]:
# split into features and target
features = data.drop('price', axis=1)
target = data['price']

In [20]:
# split the data into training and testing set
train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.2, random_state=42)

### Iterate

In [24]:
reg_2 = make_pipeline(StandardScaler(), SVR())

params = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': [0.01, 0.1, 1, 10, 100],
    'svr__gamma': ['scale', 'auto']
}

grid_search_2 = GridSearchCV(reg_2, params, cv=5)

grid_search_2.fit(train_x, train_y)

In [25]:
# Best model parameters
grid_search_2.best_params_

{'svr__C': 100, 'svr__gamma': 'scale', 'svr__kernel': 'rbf'}

In [27]:
# Best estimator
best_estimator = grid_search_2.best_estimator_

In [30]:
# Evaluate model performance

pred_y = best_estimator.predict(test_x)

mse_2 = mean_squared_error(test_y, pred_y)
print(f'Mean Square Error: {mse_2:.5f}')

rmse_2 = np.sqrt(mse_2)
print(f'Root Mean Square Error: {rmse_2:.5f}')

score_2 = grid_search_2.best_score_
print(f'R_Squared Score: {score_2:.5f}')

Mean Square Error: 501.73082
Root Mean Square Error: 22.39935
R_Squared Score: 0.33205


From the performance of the two models, it is obvious that the Model that is built with `dataset_with_encoded_location.zip` dataset. 