----------------------------------------------------------------------------------------------------------------------

# Problem Number 3 - California Housing Dataset- SVM Regression

In [3]:
#Collate all import statements here
import os
import tarfile
import urllib
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

from zlib import crc32
from sklearn.model_selection import StratifiedShuffleSplit,train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV,cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [26]:
#Fetch the Data
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
X = housing['data']
y=housing['target']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

print(pd.DataFrame(X).info())

(16512, 8) (4128, 8) (16512,) (4128,)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       20640 non-null  float64
 1   1       20640 non-null  float64
 2   2       20640 non-null  float64
 3   3       20640 non-null  float64
 4   4       20640 non-null  float64
 5   5       20640 non-null  float64
 6   6       20640 non-null  float64
 7   7       20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB
None


Note:- Looking at the Data it is very clear that it is a cleaned data without any null values. So I will not be performing any Imputations unlike the exercises on the same dataset in Chapter 2. Also, in the chapter 2 dataset we had an Ocean Proximity column which took categorical values. That seems to be missing here as well.

In [30]:
num_pipeline = Pipeline([
        ('scaler', StandardScaler())
])
transformed_data = num_pipeline.fit_transform(X_train)
svregressor = SVR()
print(transformed_data.shape)
print(y_train.shape)
svregressor.fit(transformed_data,y_train)
predictions = svregressor.predict(transformed_data)
mse = mean_squared_error(predictions, y_train)
print('Training Error is ', mse)

scores = cross_val_score(svregressor, transformed_data, y_train,scoring="neg_mean_squared_error",cv=10)
sqrt_scores = np.sqrt(-scores)
print(scores)

(16512, 8)
(16512,)
Training Error is  0.31526557267334276
[-0.28756067 -0.31107866 -0.34830193 -0.3389931  -0.35599795 -0.33327574
 -0.30949294 -0.34356807 -0.31222674 -0.34812683]


In [31]:
transformed_test_data = num_pipeline.fit_transform(X_test)
print(transformed_test_data.shape)
predictions = svregressor.predict(transformed_test_data)
mse = mean_squared_error(predictions, y_test)
print('Testing Error is ', mse)

(4128, 8)
Testing Error is  0.4297331594746025


Note:- We will do a Grid SearchCV to see which kernel performs best and then finetune hyperparameters on the selected kernel.

In [20]:
from sklearn.model_selection import GridSearchCV

#Optimise and find the best hyper params

param_grid = [
    {},
]
grid_search_cv = GridSearchCV(svregressor,param_grid,scoring="neg_mean_squared_error",cv=10,return_train_score=True)
grid_search_cv.fit(transformed_data,y_train)

#Exploring results from Grid Search CV
print(grid_search_cv.best_params_)
print(grid_search_cv.best_score_)


{'kernel': 'linear'}
-1.41528661571761


Note:- Looking at the above output, it is clear that Linear kernel gives us a good accuracy. So let us do a randomisedsearchCV with different hyper parameters

In [None]:
from sklearn.model_selection import RandomizedSearchCV

#Optimise and find the best hyper params

param_grid = [
    {"C" :[1,10,50,75,100], "epsilon":[0.001,0.01,0.1,1],"kernel":["linear", "poly", "rbf", "sigmoid"]},
]
rand_search_cv = RandomizedSearchCV(svregressor,param_grid,scoring="neg_mean_squared_error",cv=10,return_train_score=True)
rand_search_cv.fit(transformed_data,y_train)

#Exploring results from Grid Search CV
print(rand_search_cv.best_params_)
print(rand_search_cv.best_score_)


With the Randomised Search and Grid Search we can conclude that the {'kernel': 'linear'}, {'epsilon': 1, 'C': 10} give the best values. So it will be good idea to train the model with those parameters

In [28]:
num_pipeline = Pipeline([
        ('scaler', StandardScaler())
])
transformed_data = num_pipeline.fit_transform(X_train)
svregressor = SVR(kernel='linear',C=10, epsilon=1)
print(transformed_data.shape)
print(y_train.shape)
svregressor.fit(transformed_data,y_train)
predictions = svregressor.predict(transformed_data)
mse = mean_squared_error(predictions, y_train)
print('Training Error is ', mse)


(16512, 8)
(16512,)
Training Error is  0.5520495011879758


In [29]:
transformed_test_data = num_pipeline.fit_transform(X_test)
print(transformed_test_data.shape)
predictions = svregressor.predict(transformed_test_data)
mse = mean_squared_error(predictions, y_test)
print('Testing Error is ', mse)

(4128, 8)
Testing Error is  0.6050316603651753


But when we look at the Training and Test results we can conclude that marginally plain SVR (kernel = rbf, c=1.0 and epsilon=0.01) gives better results and we will use that as the final model

---------------------------------------------------------------------------------------------------------------------

# Problem 2 - MNIST Problem 

In [1]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()
X, y = mnist["data"], mnist["target"]
X.shape


(70000, 784)

In [4]:
#Plotting and saving the digits
import matplotlib.pyplot as plt
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size,size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap = mpl.cm.binary, **options)
    plt.axis("off")
    
plt.figure(figsize=(9,9))
example_images = X[:100]
plot_digits(example_images, images_per_row=10)
save_fig("more_digits_plot")
plt.show()

NameError: name 'PROJECT_ROOT_DIR' is not defined