In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error, 
    mean_squared_error,
)

In [2]:
dataset_path = './Content/auto_insurance.csv'
df = pd.read_csv(
    dataset_path,
    names=[
        'n_claims',
        'total_payment'
    ]
)

In [3]:
df.head()

Unnamed: 0,n_claims,total_payment
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   n_claims       63 non-null     int64  
 1   total_payment  63 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 1.1 KB


In [6]:
df.describe()

Unnamed: 0,n_claims,total_payment
count,63.0,63.0
mean,22.904762,98.187302
std,23.351946,87.327553
min,0.0,0.0
25%,7.5,38.85
50%,14.0,73.4
75%,29.0,140.0
max,124.0,422.2


In [7]:
normalizer = StandardScaler()
df_normalized = normalizer.fit_transform(df)

In [8]:
df_normalized

array([[ 3.67330185e+00,  3.39728625e+00],
       [-1.68556660e-01, -6.00095564e-01],
       [-4.27558357e-01, -9.52160668e-01],
       [ 4.36397304e+00,  3.74011686e+00],
       [ 7.37949280e-01,  2.44860684e-01],
       [ 1.47178742e+00,  8.39331269e-01],
       [ 4.11113805e-03, -4.76584200e-01],
       [-3.84391408e-01, -2.38795966e-01],
       [ 9.53784027e-01,  1.33683966e+00],
       [-5.57059206e-01, -3.79622008e-01],
       [-7.72893953e-01, -8.92136453e-01],
       [ 1.08328488e+00,  1.73045999e+00],
       [-5.13892256e-01, -8.62124346e-01],
       [ 4.11113805e-03, -6.76280144e-01],
       [-6.86560054e-01, -5.70083457e-01],
       [-9.02394802e-01, -1.05720304e+00],
       [ 4.72780876e-02,  4.23779015e-01],
       [-7.29727004e-01, -5.45842909e-01],
       [-8.59227852e-01, -1.08259790e+00],
       [ 4.11113805e-03,  1.70984728e-01],
       [-7.29727004e-01, -9.62549474e-01],
       [-6.00226155e-01, -5.71237769e-01],
       [-6.00226155e-01, -5.31991167e-01],
       [-8.

In [9]:
X, y = df_normalized[:, 0], df_normalized[:, 1]
X = X.reshape(-1, 1)

In [10]:
val_size = 0.2
text_size = 0.125
random_state = 0
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=val_size,
    random_state=random_state,
    shuffle=is_shuffle
)

X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=text_size,
    random_state=random_state,
    shuffle=is_shuffle
)

In [11]:
print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of validation samples: {X_val.shape[0]}')
print(f'Number of test samples: {X_test.shape[0]}')

Number of training samples: 43
Number of validation samples: 13
Number of test samples: 7


In [12]:
regressor = SVR()
regressor.fit(X_train, y_train)

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [13]:
y_val_pred = regressor.predict(X_val)
y_test_pred = regressor.predict(X_test)
val_mae = mean_absolute_error(y_val, y_val_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print('Evaluation results on validation and test set:')
print(f'Mean Absolute Error\tVal: {val_mae}\tTest: {test_mae}')
print(f'Mean Squared Error\tVal: {val_mse}\tTest: {test_mse}')

Evaluation results on validation and test set:
Mean Absolute Error	Val: 0.40454347318473977	Test: 0.328201662068015
Mean Squared Error	Val: 0.22296817206232716	Test: 0.14872161499412404
