### Exercise project 3 – Support Vector Machines
### In this project I will use SVR (Support Vector Regression), a type of
### SVM for regression. I will reuse my previous regression dataset, "California Housing Prices" 
### (https://www.kaggle.com/datasets/camnugent/california-housing-prices). 
### The cleaning/optimisation phase is copied from Project_1, so you can scroll straight down to the
### "Scale the regression target for SVR" section.
## Target variable: "median_house_value". The cleaning phase is copied from Project_1.

In [53]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

In [54]:
 # load data: read "housing.csv" (path relative to the working directory) into df
df = pd.read_csv("housing.csv")

In [55]:
# preview the first rows to check that the CSV was parsed as expected
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [56]:
# summary statistics of the numeric columns in the raw data
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


### Cleaning up the dataset (the same steps were implemented in Project_1, so I will just include the cleaning code here without comments, as it was already explained earlier)

In [57]:
# count the missing values in each column
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [58]:
# remove every row that contains a missing value
# (per the count above, only total_bedrooms has any)
df = df.dropna(axis="index")

In [59]:
# the coordinate columns are not used as features in this project
df = df.drop(columns=["latitude", "longitude"])

In [60]:
# One-hot encode the categorical column(s).
# OneHotEncoder is imported in the notebook's import cell at the top, so that
# all dependencies are declared in one place and a fresh "Run All" works.
variables = ["ocean_proximity"]

# sparse_output=False together with set_output(transform="pandas") makes the
# encoder return a dense DataFrame (one 0/1 column per category) that keeps
# df's index, which is what allows the column-wise concat below to line up.
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)

# append the indicator columns and remove the original categorical column
df = pd.concat([df, one_hot_encoded], axis=1).drop(columns=variables)

In [61]:
# summary statistics after encoding, to inspect the new indicator columns
df.describe()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155,0.442128,0.317917,0.000245,0.111095,0.128615
std,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099,0.496652,0.465678,0.015641,0.314257,0.334782
min,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0,0.0,0.0,0.0,0.0,0.0
50%,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0,1.0,1.0,0.0,0.0,0.0
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0,1.0,1.0,1.0,1.0,1.0


In [62]:
# drop the ISLAND indicator column
# NOTE(review): presumably removed because it is almost always 0
# (mean of about 0.000245 in the summary above) - confirm against Project_1
df = df.drop(columns="ocean_proximity_ISLAND")

In [63]:
# Trim the upper tail of four columns to get closer to a normal distribution.
# The cut-offs are applied one after another, in this order, so each quantile
# is computed on the rows that survived the previous filter.
upper_quantile_cutoffs = {
    "median_house_value": 0.93,
    "median_income": 0.99,
    "total_rooms": 0.97,
    "total_bedrooms": 0.98,
}
for column_name, cutoff in upper_quantile_cutoffs.items():
    # keep only the rows strictly below the column's current quantile value
    df = df[df[column_name] < df[column_name].quantile(cutoff)]

In [64]:
# The same number of rows as in Project_1 after cleaning the data
# (see the "count" row of the summary).
df.describe()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
count,17879.0,17879.0,17879.0,17879.0,17879.0,17879.0,17879.0,17879.0,17879.0,17879.0,17879.0
mean,29.004419,2262.543207,470.680799,1283.719224,440.067174,3.55239,182881.419319,0.432854,0.337882,0.104536,0.12456
std,12.267511,1254.59576,251.947257,731.908489,233.98798,1.419586,87606.683877,0.495485,0.473001,0.305963,0.330228
min,1.0,2.0,2.0,3.0,2.0,0.4999,14999.0,0.0,0.0,0.0,0.0
25%,19.0,1400.5,291.0,782.0,276.0,2.48125,112900.0,0.0,0.0,0.0,0.0
50%,29.0,2024.0,422.0,1148.0,397.0,3.375,168200.0,0.0,0.0,0.0,0.0
75%,37.0,2892.5,607.0,1643.0,567.0,4.45375,237500.0,1.0,1.0,0.0,0.0
max,52.0,7436.0,1311.0,8733.0,1295.0,7.8774,431600.0,1.0,1.0,1.0,1.0


#### Scale the regression target for SVR:

In [65]:
# name of the target variable and the column list handed to the scaler
target_variable = "median_house_value"
num_vars = [target_variable]

# remember the target's min/max in original units *before* scaling,
# for better metrics later
min_y, max_y = df[target_variable].min(), df[target_variable].max()

# fit a min-max scaler on the target column, then overwrite the column with
# its scaled values
# NOTE(review): this overwrites df in place, so re-running the cell would
# rescale already-scaled values and replace min_y/max_y with the scaled
# extremes - run it once per fresh kernel.
scalerY = MinMaxScaler().fit(df[num_vars])
df[num_vars] = scalerY.transform(df[num_vars])

####  Train/test split

In [66]:
# Separate the dataset into the support variables (X), which the model uses
# as inputs, and the target variable (y), which it predicts.

# y: the target column on its own
y = df.loc[:, "median_house_value"]

# X: every remaining column - simply "everything except the target"
X = df.drop(columns=["median_house_value"])

In [67]:
# hold out 30 % of the rows as the test set; the fixed random_state makes the
# shuffled split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)