# Linear Regression

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score, cross_validate, learning_curve, ShuffleSplit

from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler


In [4]:
# Seed NumPy's global (legacy) random generator so that any code drawing from
# `np.random` gives the same numbers on every Restart & Run All.
np.random.seed(seed=306)

The global NumPy random seed is set to 306 so that results are reproducible from run to run.

In [16]:
# Use the "ggplot" matplotlib style sheet for the figures drawn after this cell.
plt.style.use("ggplot")

# Cross-validation splitter: 10 splits, each holding out 20% of the samples
# as the test part; random_state is fixed so the splits are repeatable.
shuffle_split = ShuffleSplit(
    test_size=0.2,
    n_splits=10,
    random_state=0,
)

`shuffle_split` creates 10 folds using shuffle-split cross-validation: in each fold the data is shuffled and 20% of the examples are held out as the test set.

# STEP 1 Load the dataset

In [11]:
import ssl
import urllib.request

# WORKAROUND: the dataset download is performed with TLS certificate
# verification turned off (presumably because this machine's Python has no
# usable CA bundle -- confirm whether this is still needed).
#
# Disabling verification is a security risk, so the bypass is confined to this
# single fetch: the original default-context factory and a default urllib
# opener are put back in `finally`, which means later HTTPS requests made from
# this kernel are verified again even if the download fails.
_verified_https_context_factory = ssl._create_default_https_context
try:
    ssl._create_default_https_context = ssl._create_unverified_context
    urllib.request.install_opener(
        urllib.request.build_opener(
            urllib.request.HTTPSHandler(context=ssl._create_unverified_context())
        )
    )
    # `fetch_california_housing` is imported in the notebook's top import cell.
    # as_frame=True + return_X_y=True yields a (features, labels) pair of
    # pandas objects, as the printed output below shows.
    features, labels = fetch_california_housing(as_frame=True, return_X_y=True)
finally:
    ssl._create_default_https_context = _verified_https_context_factory
    urllib.request.install_opener(urllib.request.build_opener())

print(features.head())
print(labels.head())



   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  
0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64


In [12]:
# Report the dimensions of the feature matrix and of the label vector.
for caption, dataset in (
    ("Shape of feature matrix", features),
    ("Shape of label vector", labels),
):
    print(caption, dataset.shape)

Shape of feature matrix (20640, 8)
Shape of label vector (20640,)


# Preprocessing

In [17]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test parts. No test_size is passed,
# so the library default applies (the recorded run gives 15480 / 5160 rows);
# random_state=42 keeps the split repeatable.
split_parts = train_test_split(features, labels, random_state=42)
train_features, test_features, train_labels, test_labels = split_parts

In [18]:
# Number of rows that ended up in each side of the split.
n_train_samples = train_features.shape[0]
n_test_samples = test_features.shape[0]

print("# training samples:", n_train_samples)
print("# test samples: ", n_test_samples)

# training samples: 15480
# test samples:  5160


In [19]:
# Sanity check: every feature row must have exactly one matching label,
# in both the training part and the test part.
n_train_rows, n_train_targets = train_features.shape[0], train_labels.shape[0]
assert n_train_rows == n_train_targets

n_test_rows, n_test_targets = test_features.shape[0], test_labels.shape[0]
assert n_test_rows == n_test_targets

In [None]:
# Two-stage pipeline: features are passed through StandardScaler first,
# then a LinearRegression estimator is applied to the scaled features.
pipeline_steps = [
    ("feature_scaling", StandardScaler()),
    ("lin_reg", LinearRegression()),
]
lin_reg_pipeline = Pipeline(pipeline_steps)