# California House Price Dataset - Linear Regression

In [4]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV

ModuleNotFoundError: No module named 'numpy'

## 1) Load Data 

In [3]:
# Read housing.csv (path relative to the working directory) into a DataFrame
housing = pd.read_csv("housing.csv")
housing.head()   # Quick look at the first few rows

NameError: name 'pd' is not defined

In [None]:
# Quick description of the data: print the DataFrame summary
housing.info()

In [None]:
# Number of rows in each category of the ocean_proximity column
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics of each numerical attribute
housing.describe()

## 2) Doing some Exploratory Data Analysis 

### 1) Looking for correlations & Scatter plots

In [None]:
# Correlation of every numeric attribute with the target, sorted ascending.
# Numeric columns are picked by dtype rather than by position, so the result
# does not depend on the text column (ocean_proximity) being the last one.
housing.select_dtypes(include="number").corr()["median_house_value"].sort_values()

In [None]:
# Select five columns by position (indices 1, 2, 3 and the third- and
# second-to-last columns) and draw the pairwise scatter-plot matrix for them.
# NOTE(review): presumably the last two selected columns are median_income and
# median_house_value - confirm against the CSV column order.
scatter_data = housing.iloc[:, [1, 2, 3, -3, -2]]
pd.plotting.scatter_matrix(scatter_data, figsize=(20, 17)) 
plt.show()

Note 

The scatter plots above show a very strong correlation between `median_income` and `median_house_value`.

### 2) Experimenting with attribute combinations 

In [None]:
# Candidate attribute: total_rooms divided by households (row-wise), followed
# by its correlation with the target median_house_value
rooms_per_household = housing["total_rooms"]/ housing["households"]
housing["median_house_value"].corr(rooms_per_household)

In [None]:
# Candidate attribute: total_bedrooms divided by total_rooms (row-wise),
# followed by its correlation with the target median_house_value
bedrooms_per_room = housing["total_bedrooms"]/ housing["total_rooms"]
housing["median_house_value"].corr(bedrooms_per_room)

In [None]:
# Candidate attribute: population divided by households (row-wise),
# followed by its correlation with the target median_house_value
population_per_household = housing["population"]/ housing["households"]
housing["median_house_value"].corr(population_per_household)

Note 

We now have two more informative derived series: `rooms_per_household` and `bedrooms_per_room`.

### 3) Looking at histograms 

In [None]:
# Plot a 50-bin histogram per column in a single 15x10 figure
housing.hist(bins=50, figsize=(15, 10))      
plt.show()

Note

1) The `median_income` attribute is capped, as can be seen from the histogram.

2) `housing_median_age` and `median_house_value` are also capped; we need to collect proper labels for the capped values.

## 3) Prepare Data for Machine learning 

### 1) Train test split

In [None]:
# Bucket median_income into five income categories (labelled 1-5) and use
# those buckets to stratify the train/test split, so that both sets follow
# the same income distribution.
income_cat = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=list(range(1, 6)),
)

# 20% of the rows go to the test set; random_state is fixed so the split is
# reproducible from run to run.
train_set, test_set = train_test_split(
    housing, test_size=0.2, random_state=42, stratify=income_cat
)

In [None]:
# Peek at the first rows of the training split
train_set.head()

In [None]:
# Peek at the first rows of the test split
test_set.head()

### 2) Separate labels & features from training set

In [None]:
# Target column, copied so that it is independent of train_set
train_labels = train_set["median_house_value"].copy()
# Everything except the target column is a feature
train_features = train_set.drop(columns="median_house_value")
train_features.head()

In [None]:
# Peek at the first training labels (median_house_value)
train_labels.head()

### 3) Pipelines, column transformer, and preparing the training set

In [None]:
# Transformer that appends ratio columns to a numeric feature matrix

# Column positions, within the matrix passed to transform(), of the raw
# attributes the ratios are computed from.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features as extra columns on the right of the input.

    Added columns, in order:
      * rooms per household       (column rooms_ix / column households_ix)
      * population per household  (column population_ix / column households_ix)
      * bedrooms per room         (column bedrooms_ix / column rooms_ix),
        only when ``add_bedrooms_per_room`` is true.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to append the bedrooms-per-room column.
    """

    # Explicit keyword parameters only (no *args / **kwargs).
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended.

        ``X`` must be a 2-D array-like with at least ``households_ix + 1``
        columns. ``y`` is ignored.
        """
        # Positional column indexing below needs an ndarray; coercing here
        # also lets DataFrames and nested lists through.
        X = np.asarray(X)
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # column_stack turns each 1-D ratio into a column next to the 2-D X.
        return np.column_stack([X, *extra_columns])


# Column groups: ocean_proximity is the only categorical column; every other
# feature column is treated as numeric.
cat_attribs = ["ocean_proximity"]
num_attribs = train_features.columns.drop(cat_attribs)

# Numeric branch: fill missing values with the column median, append the
# combined ratio attributes, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Complete transformation of the training features: numeric branch on the
# numeric columns, one-hot encoding on the categorical column.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [None]:
# Fit the preprocessing pipeline on the training features and transform them
train_prepared = full_pipeline.fit_transform(train_features)
train_prepared

### 4) Training and evaluating on the training set with linear regression

In [None]:
# Fit a linear regression model on the prepared training data
lin_reg = LinearRegression()
lin_reg.fit(train_prepared, train_labels)

In [None]:
# Sanity check: push the first five training rows through the already-fitted
# preprocessing pipeline and look at the model's predictions for them.
some_data = train_features.iloc[:5]
some_labels = train_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
lin_reg.predict(some_data_prepared)

In [None]:
# Actual labels of the sampled training rows, for comparison with the predictions
some_labels

In [None]:
# RMSE of the linear model over the whole training set. This is a training
# error: the model is scored on the same data it was fitted on.
train_predictions = lin_reg.predict(train_prepared)
lin_mse = mean_squared_error(y_true=train_labels, y_pred=train_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse