In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## 1.2 讀取 .csv

In [8]:
data = pd.read_csv('housing.csv')
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [9]:
data.info() #data.isnull().any()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [10]:
data.dropna(inplace=True)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [12]:
x = np.array(data)[:, :8].astype(np.float32)
x

array([[-1.2223e+02,  3.7880e+01,  4.1000e+01, ...,  3.2200e+02,
         1.2600e+02,  8.3252e+00],
       [-1.2222e+02,  3.7860e+01,  2.1000e+01, ...,  2.4010e+03,
         1.1380e+03,  8.3014e+00],
       [-1.2224e+02,  3.7850e+01,  5.2000e+01, ...,  4.9600e+02,
         1.7700e+02,  7.2574e+00],
       ...,
       [-1.2122e+02,  3.9430e+01,  1.7000e+01, ...,  1.0070e+03,
         4.3300e+02,  1.7000e+00],
       [-1.2132e+02,  3.9430e+01,  1.8000e+01, ...,  7.4100e+02,
         3.4900e+02,  1.8672e+00],
       [-1.2124e+02,  3.9370e+01,  1.6000e+01, ...,  1.3870e+03,
         5.3000e+02,  2.3886e+00]], dtype=float32)

In [13]:
y = np.array(data)[:, 8].astype(np.float32)
y

array([452600., 358500., 352100., ...,  92300.,  84700.,  89400.],
      dtype=float32)

In [14]:
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=1)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, SGDRegressor, Lasso, ElasticNet
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [16]:
paramGrid = [{
    'poly__degree':[2,3,4],
    'sgd__alpha':[0.1, 1, 2, 3, 4, 5],
    'sgd__penalty':['l1', 'l2', 'elasticnet'],
    'sgd__l1_ratio':[0,.3,.7,1]
}]

In [17]:
ridgeSgd = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),
    ('scal', StandardScaler()),
    ('sgd', SGDRegressor(random_state=1))
])

In [18]:
gridSearch = GridSearchCV(ridgeSgd, paramGrid, scoring='r2', cv=3)

In [19]:
gridSearch.fit(xTrain,yTrain.ravel())

In [20]:
gridSearch.best_score_

0.6163560602168356

In [21]:
gridSearch.best_params_

{'poly__degree': 2,
 'sgd__alpha': 0.1,
 'sgd__l1_ratio': 0,
 'sgd__penalty': 'l2'}

In [22]:
gridModel = gridSearch.best_estimator_

gridModel.score(xTest,yTest), gridModel.score(xTrain,yTrain)

(0.5701524549353258, 0.4078933311075412)

In [23]:
#用ridge regression, degree為2, alpha為2的模型為最佳參數