# Support Vector Regressor Model 

In this notebook, I trained a support vector machine on 2 different datasets. These datasets are made different due to their independent variables and the method of handling categorical columns.

In [37]:
import pandas as pd
from sklearn.svm import SVR
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

## First Dataset

In [6]:
# Load the data file into a DataFrame
df = pd.read_csv('../data/dataset_with_encoded_location.zip', compression='zip')
df.head()

Unnamed: 0,bath,balcony,price,House_size,new_total_sqft,L1,L2,L3,L4,L5,...,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16
0,2,3,62,3,1440,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,95,3,1521,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,51,2,1200,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,63,3,1310,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,2,70,3,1800,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X = df.drop('price', axis=1)
y= df['price']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [18]:
X_train.shape

(5596, 20)

### Iteration

In [19]:
reg = make_pipeline(StandardScaler(), SVR())

params = {
    'svr__kernel': ['linear', 'poly', 'rbf'],
    'svr__C': [0.01, 0.1, 1, 10, 100],
    'svr__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(reg, params, cv=5)

grid_search.fit(X_train, y_train)

In [21]:
# Best model parameters
grid_search.best_params_

{'svr__C': 100, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}

In [23]:
# Best estimator
best_model = grid_search.best_estimator_

In [35]:
# Evaluate model performance

predictions = best_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print(f'Mean Square Error: {mse:.5f}')

rmse = np.sqrt(mse)
print(f'Root Mean Square Error: {rmse:.5f}')

score = grid_search.best_score_
print(f'R_Squared Score: {score:.5f}')

Mean Square Error: 8.76006
Root Mean Square Error: 2.95974
R_Squared Score: 0.98935


## Second Dataset

In [44]:
# Load the data file into a DataFrame
# df = pd.read_csv('../data/building_dataset.zip', compression='zip', index_col=0)
# df.head()