### Exercise project 3 – Support Vector Machines
### In this project I will use SVR (Support Vector Regression), the regression
### variant of SVM. I will reuse my previous regression dataset, "California Housing Prices"
### (https://www.kaggle.com/datasets/camnugent/california-housing-prices).
### The same dataset optimizations as in Project_1 are applied here.
## Target variable: "median_house_value"

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics, svm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [13]:
 # Read the California Housing Prices CSV into a DataFrame
csv_path = "housing.csv"
df = pd.read_csv(csv_path)

In [14]:
# Preview the first rows to confirm the CSV loaded with the expected columns
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


### Cleaning up the dataset (the same steps were implemented in Project_1, so I will just include the cleaning code here without comments, as it was already explained earlier)

In [15]:
# Remove the geographic coordinate columns; they are not used as features here
df = df.drop(columns=["latitude", "longitude"])

In [None]:
# One-hot encode the categorical column(s) and swap them for indicator columns.
# NOTE(review): this import would normally live in the notebook's top import cell.
from sklearn.preprocessing import OneHotEncoder

variables = ["ocean_proximity"]

# Dense output returned as a pandas DataFrame, so the generated columns carry names
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(df[variables])
one_hot_encoded = one_hot_encoded.astype(int)

# Append the integer indicator columns, then remove the original categorical column(s)
df = pd.concat([df, one_hot_encoded], axis=1)
df = df.drop(columns=variables)

In [None]:
# Remove the ISLAND indicator column produced by the one-hot encoding
df = df.drop(columns=["ocean_proximity_ISLAND"])

In [18]:
# Trim the top percentages of selected columns to get closer to a normal distribution.
# The cut-offs are applied one after another, so each quantile is computed on the
# rows that survived the previous filters.
upper_quantiles = {
    "median_house_value": 0.93,
    "median_income": 0.99,
    "total_rooms": 0.97,
    "total_bedrooms": 0.98,
}
for trim_col, trim_q in upper_quantiles.items():
    df = df.query(f"{trim_col} < {trim_col}.quantile({trim_q})")

In [None]:
# Summary statistics of the cleaned data; the row count should match
# the number of rows obtained in Project_1 after the same cleaning steps.
df.describe()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
count,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0,17878.0
mean,29.006544,2261.76004,470.512026,1283.287057,439.904464,3.552939,182902.142074,0.432934,0.337789,0.104542,0.124567
std,12.268555,1253.360037,251.723612,731.430934,233.756031,1.420325,87622.843126,0.495496,0.47297,0.305971,0.330236
min,1.0,2.0,2.0,3.0,2.0,0.4999,14999.0,0.0,0.0,0.0,0.0
25%,19.0,1400.25,291.0,782.0,276.0,2.481475,112925.0,0.0,0.0,0.0,0.0
50%,29.0,2024.0,422.0,1148.0,397.0,3.375,168300.0,0.0,0.0,0.0,0.0
75%,37.0,2892.0,607.0,1642.0,567.0,4.45435,237500.0,1.0,1.0,0.0,0.0
max,52.0,7430.0,1307.0,8733.0,1274.0,7.898,431600.0,1.0,1.0,1.0,1.0


#### Scale the regression target for SVR:

In [20]:
# Name of the column the model will predict
target_variable = "median_house_value"

# Remember the unscaled minimum and maximum of the target
# so they can be used for the metrics later
min_y = df[target_variable].min()
max_y = df[target_variable].max()

# Fit a min-max scaler on the target column and overwrite that column with the scaled values.
# NOTE(review): df is modified in place, so re-running this cell rescales the already
# scaled column and overwrites min_y / max_y with the scaled range.
scalerY = MinMaxScaler()
num_vars = [target_variable]
scaled_target = scalerY.fit_transform(df[num_vars])
df[num_vars] = scaled_target

####  Train/test split

In [23]:
# Split the dataset into the target variable (y) and the support variables (X).

# y: the column the model should predict
y = df["median_house_value"]

# X: all support variables the model may use for the prediction.
# Shortcut: keep every column except the target.
X = df.drop(columns=["median_house_value"])

In [24]:
# Hold out 30 % of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)