# California Housing Price prediction - Regression analysis
- Tabular data preprocessing
- Visualizing for insight 
- Data transformations and feature scaling
- Linear Regression model

### Importing all necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Read the housing data CSV file

In [None]:
# Load the raw housing table; the path is relative to the notebook's working directory.
housing = pd.read_csv("data/housing.csv")

In [None]:
# Preview the first five rows of the table
housing.head(n=5)

### Brief overview of the contents

In [None]:
# Per-column summary: dtype and non-null count (useful for spotting missing values).
housing.info()

### Important inferences

1. __total_bedrooms__ contains only 20433 non-null values, which means 207 values are missing.

    _This should be taken care of later_
    
    
2. __ocean_proximity__ is not numeric, and we assume it is string.

### Brief overview of various statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default only numeric columns are included.
housing.describe()

### Note:
_ocean proximity_ is not displayed because it is a __categorical attribute__.

In [None]:
# List the column labels of the DataFrame.
housing.columns

### Analyzing the _ocean proximity_ attribute

In [None]:
# Count how many rows fall into each distinct ocean_proximity value.
housing['ocean_proximity'].value_counts()

### Histogram plot of various attributes

In [None]:
# One histogram per numeric column, 50 bins each, drawn on a 20x15-inch figure.
housing.hist(bins=50, figsize=(20,15))

### Standard deviation $\sigma$

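1. About 68% of normally distributed data falls within $1\sigma$ of the mean
2. About 95% of normally distributed data falls within $2\sigma$ of the mean
3. About 99.7% of normally distributed data falls within $3\sigma$ of the mean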

### Splitting your data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Purely random (non-stratified) split: 20% of the rows are held out for testing.
# random_state fixes the shuffle so the same split is produced on every run.
# train_test_split accepts a single array/DataFrame as well as several aligned ones.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

In [None]:
# Sanity-check the (rows, columns) shape of each subset.
print(train_set.shape, test_set.shape)

In [None]:
# Preview the training subset; the row labels are the original row positions, now shuffled.
train_set.head()

In [None]:
# Preview the held-out subset.
test_set.head()

### Stratified splitting in regression problem

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

### Median income seems to be a very useful attribute. 
__When we split the data, the split should be stratified so that every income group is represented proportionally in both the training and the test set__

In [None]:
# Bucket median_income into five ordered income categories labelled 1-5.
# pd.cut bins are right-inclusive by default: (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6], (6, inf].
income_bin_edges = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing['income_category'] = pd.cut(
    housing['median_income'],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [None]:
# The table now carries an extra 'income_category' column.
housing.head()

In [None]:
# Distribution of rows across the income categories.
housing['income_category'].hist()

### Preserve the proportion of each income category when splitting into training and testing sets

In [None]:
# A single stratified shuffle: 20% held out for testing, fixed seed for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
# StratifiedShuffleSplit.split is a generator; with n_splits=1 it yields exactly
# one (train, test) pair of index arrays, so next() retrieves it without a loop.
train_idx, test_idx = next(split.split(housing, housing['income_category']))

# The yielded indices are row positions, not labels, so select with .iloc;
# .loc only happens to agree while the index is the default 0..n-1 range.
# .copy() makes both subsets independent frames, so later in-place edits to
# them are unambiguous and never touch `housing`.
stratified_train_set = housing.iloc[train_idx].copy()
stratified_test_set = housing.iloc[test_idx].copy()

In [None]:
# Fraction of training rows in each income category (counts divided by the number of training rows).
stratified_train_set['income_category'].value_counts()/len(stratified_train_set)

In [None]:
# Preview the stratified training subset (still includes the helper column).
stratified_train_set.head()

### After performing stratified split, we no longer need the 'income_category' attribute.

In [None]:
stratified_train_set.drop('income_category', axis=1, inplace=True)
stratified_test_set.drop('income_category', axis=1, inplace=True)

In [None]:
stratified_train_set.head()

### Visualize the data

__Taking a copy of the train set and analyzing it__

In [None]:
h_data = stratified_train_set.copy()

In [None]:
h_data.plot(kind='scatter', x='longitude', y='latitude')

__Use alpha parameter. This helps to understand high density area__

In [None]:
# Semi-transparent markers: overlapping points appear darker, revealing dense areas.
h_data.plot.scatter(x='longitude', y='latitude', alpha=0.2)

__Also possible to obtain colored scatter__

In [None]:
# Marker area scales with population; marker colour encodes median_house_value.
marker_sizes = h_data['population'] / 100
jet_cmap = plt.get_cmap('jet')
h_data.plot.scatter(
    x='longitude', y='latitude', alpha=0.4,
    s=marker_sizes, label='population', figsize=(20, 10),
    c='median_house_value', cmap=jet_cmap, colorbar=True,
)
plt.show()

### Looking at correlations

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# 'ocean_proximity' holds strings: older pandas silently skipped such columns,
# but pandas >= 2.0 raises on them, so select the numeric columns explicitly
# (select_dtypes works on every pandas version and gives the same matrix).
corr_matrix = h_data.select_dtypes(include=[np.number]).corr()

In [None]:
# print() renders the matrix as plain text (not the notebook's rich table).
print(corr_matrix)

In [None]:
# Leaving the object as the last expression lets the notebook render it as a formatted table.
corr_matrix

### Viewing the correlation values relevant to 'median_house_value'

In [None]:
# Correlation of every column with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# scatter_matrix lives in pandas.plotting; the former pandas.tools.plotting
# location was deprecated in pandas 0.20 and has since been removed.
from pandas.plotting import scatter_matrix

In [None]:
attribs = ['median_house_value', 'median_income', 'total_rooms', 'housing_median_age']
scatter_matrix(h_data[attribs], figsize=(12,8))

In [None]:
h_data.plot(kind='scatter', x='median_income', y='median_house_value', alpha=0.2)

### Data cleaning

In [None]:
X_train = stratified_train_set.drop('median_house_value', axis=1)
y_train = stratified_train_set['median_house_value'].copy()

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
X_test = stratified_test_set.drop('median_house_value', axis=1)
y_test = stratified_test_set['median_house_value'].copy()
print(X_test.shape, y_test.shape)

In [None]:
# Re-inspect the full table: nothing has been imputed on `housing` itself, so the non-null counts still expose the gaps.
housing.info()

__Handle missing features (total_bedrooms)__

Options
1. Remove the row corresponding to missing values
2. Remove the entire attribute
3. Replace the missing values with: zero, mean or median

In [None]:
# Option 1 (remove the row)
# dropna returns a new frame without the rows whose total_bedrooms is NaN; X_train is unchanged.
op1 = X_train.dropna(subset=['total_bedrooms'])

In [None]:
# Every column of op1 should now report the same non-null count.
op1.info()

In [None]:
# X_train keeps its original row count, for comparison with op1.
X_train.shape

In [None]:
# Number of missing values per column in the untouched training features.
X_train.isna().sum()

In [None]:
#option 2 (remove the entire attribute)
op2 = X_train.drop('total_bedrooms', axis=1)

In [None]:
op2.info()

In [None]:
# Option 3: fill the missing values with the median.
# The median comes from the training features only, so the same value can be
# reused later for the test set without leaking test information.
median_val = X_train['total_bedrooms'].median()
op3 = X_train.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that chained in-place call
# operates on a temporary and leaves op3 unchanged.
op3['total_bedrooms'] = op3['total_bedrooms'].fillna(median_val)

In [None]:
# Preview the imputed copy.
op3.head()

In [None]:
# Inspect the non-null counts of the imputed copy.
op3.info()

### Data Transformation - Categorical attributes (handling 'ocean_proximity')

In [None]:
# State of the training features before imputation.
X_train.info()

In [None]:
# Impute missing total_bedrooms with the training-set median (median_val).
# The filled column is assigned back rather than using fillna(inplace=True) on
# the selected column: under pandas Copy-on-Write that chained in-place call
# modifies a temporary and leaves X_train unchanged.
X_train['total_bedrooms'] = X_train['total_bedrooms'].fillna(median_val)

In [None]:
# Re-check the non-null counts after the imputation step.
X_train.info()

In [None]:
# Pull out the categorical column on its own for the encoder experiments below.
oprox_train = X_train['ocean_proximity']
oprox_train.head(15)

### Display different types of categorical values inside ocean_proximity

In [None]:
# Sorted array of the distinct category values.
np.unique(oprox_train)

### Converting categorical to numeric
- OrdinalEncoder
- OneHotEncoder
- LabelEncoder
- LabelBinarizer

### OrdinalEncoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
ordEncoder = OrdinalEncoder()

In [None]:
oprox_ordEncoder = ordEncoder.fit_transform(oprox_train.values.reshape(-1,1))

In [None]:
oprox_ordEncoder

### OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
oneHot = OneHotEncoder()

In [None]:
oprox_OneHot = oneHot.fit_transform(oprox_train.values.reshape(-1,1))

In [None]:
oprox_OneHot

In [None]:
oprox_OneHot.toarray()

### LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# NOTE: scikit-learn documents LabelEncoder as a tool for encoding target labels (y);
# it is applied to a feature column here only to compare its output with OrdinalEncoder.
lEncoder = LabelEncoder()

In [None]:
# LabelEncoder takes the 1-D Series directly (no reshape needed).
oprox_lEncoder = lEncoder.fit_transform(oprox_train)

In [None]:
oprox_lEncoder

In [None]:
# Shape of the LabelEncoder output ...
oprox_lEncoder.shape

In [None]:
# ... compared with the shape of the OrdinalEncoder output.
oprox_ordEncoder.shape

### LabelBinarizer

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
# NOTE: like LabelEncoder, LabelBinarizer is documented by scikit-learn as a label (target) transformer;
# it is used here only to compare its output with OneHotEncoder.
lBinarizer = LabelBinarizer()

In [None]:
# LabelBinarizer also accepts the 1-D Series directly.
oprox_lBinarizer = lBinarizer.fit_transform(oprox_train)

In [None]:
oprox_lBinarizer

In [None]:
# Shape of the LabelBinarizer output ...
oprox_lBinarizer.shape

In [None]:
# ... compared with the shape of the OneHotEncoder output.
oprox_OneHot.shape

### Here we choose the OrdinalEncoder based transformation.

In [None]:
X_train['ocean_proximity'] = oprox_ordEncoder

In [None]:
X_train.info()

### To transform test set, use the same object used to transform the train set 

In [None]:
oprox_test = X_test['ocean_proximity']
oprox_test_ordEncoder = ordEncoder.transform(oprox_test.values.reshape(-1,1))

In [None]:
X_test.info()

In [None]:
X_test['ocean_proximity'] = oprox_test_ordEncoder

In [None]:
X_test.info()

In [None]:
# Replace NaN with the median of the *training* set (median_val), so no
# statistic is computed from the test data.
# The filled column is assigned back rather than using fillna(inplace=True) on
# the selected column: under pandas Copy-on-Write that chained in-place call
# modifies a temporary and leaves X_test unchanged.
X_test['total_bedrooms'] = X_test['total_bedrooms'].fillna(median_val)

In [None]:
# Re-check the non-null counts of the test features after the imputation step.
X_test.info()

In [None]:
# Final check before modelling: feature and target row counts must match within each split.
print(X_train.shape, y_train.shape)

In [None]:
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline model with default settings.
model_linearRegression = LinearRegression()

In [None]:
# Fit on the cleaned, ordinal-encoded (but unscaled) training features.
model_linearRegression.fit(X_train, y_train)

In [None]:
# For regressors, score() returns the coefficient of determination (R^2) on the given data.
r2Score = model_linearRegression.score(X_test, y_test)
print(r2Score)

### Feature scaling - Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardizes each feature to zero mean and unit variance.
sc = StandardScaler()

In [None]:
# Learn the per-feature mean and standard deviation from the training set only.
sc.fit(X_train)

In [None]:
# Apply the training-set statistics to both splits so no test information leaks into the scaling.
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Refit the same model on standardized features and score it on the held-out set.
# NOTE: ordinary least squares with an intercept is invariant to per-feature
# affine rescaling, so this R^2 is expected to essentially match the unscaled one.
model_linearRegression_std = LinearRegression().fit(X_train_std, y_train)
r2Score_std = model_linearRegression_std.score(X_test_std, y_test)
print(r2Score_std)