In [51]:
#Master
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
import joblib
#1)Get the Big Picture
#We need to build a model that predicts the median house value of a block in California from a set of metrics
#median_house_value is capped at about $500,000 and median_age is capped as well; median_income is scaled down and capped at 15
#This is a Regression Task, and not a Classification Task
#2)Get The Data and Split it
#default location of the dataset, relative to the notebook's working directory
HOUSING_PATH='housing.csv'
def load_housing_data(housing_path=HOUSING_PATH):
    """Read the housing CSV into a DataFrame.

    Parameters
    ----------
    housing_path : path of the CSV file to read; defaults to HOUSING_PATH.

    Returns
    -------
    pandas.DataFrame with the parsed contents of the file.
    """
    housing_frame=pd.read_csv(housing_path)
    return housing_frame
housing=load_housing_data()
#Income is an important predictor, so the train and test sets should each mirror the income distribution of the full dataset (stratified sampling)
#To stratify, first bucket median_income into five income categories with pd.cut()
housing['income_category']=pd.cut(housing['median_income'],bins=[0.,1.5,3.0,4.5,6.,np.inf],labels=[1,2,3,4,5])
#n_splits is the number of train/test splits to generate; random_state fixes the shuffle so that every run produces the same 'random' split
instance=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
#split() is a generator of (train, test) index arrays; with n_splits=1 it yields exactly one pair, so take it with next() instead of looping
#splitting on income_category keeps the share of each income category the same in the overall dataset and in the two subsets
train_index,test_index=next(instance.split(housing,housing['income_category']))
#the arrays returned by split() are row positions, not index labels, so select with .iloc
#(.loc picks the same rows only while the frame still has the default 0..n-1 index)
#the helper column is no longer needed once the split is made; drop() returns new frames, which avoids
#in-place mutation and the loop variable that used to shadow the builtin `set`
train_set=housing.iloc[train_index].drop(columns=['income_category'])
test_set=housing.iloc[test_index].drop(columns=['income_category'])


In [52]:
#esc+r to deactivate esc+y to activate 
#4)Prepare the data for Machine Learning Algorithms
#First let's split the training set into predictors and labels
#separate the predictors from the label column of the training set
housing=train_set.drop("median_house_value",axis=1)
housing_labels=train_set['median_house_value'].copy()
#total_bedrooms has some missing values; sklearn's SimpleImputer fills missing entries with a per-column statistic (here the median)
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy='median')
#the median only exists for numerical columns, so leave the text column ocean_proximity out
housing_num=housing.drop('ocean_proximity',axis=1)
#fit() learns the median of every column (stored in imputer.statistics_); transform() then fills the gaps and returns a plain NumPy array
imputer.fit(housing_num)
X=imputer.transform(housing_num)
#rebuild a DataFrame around the array; passing index=housing_num.index keeps the original (shuffled) row labels,
#otherwise the frame gets a fresh 0..n-1 index and no longer lines up with housing / housing_labels
housing_tr=pd.DataFrame(X,columns=housing_num.columns,index=housing_num.index)


In [53]:
#esc+r to deactivate esc+y to activate 
#however, encoding each category as a plain integer opens up another potential problem: by attaching ordered numerical values to the attribute, the algorithm may assume that nearby numbers mean similar categories and find patterns that are not really there
#for example 1 is closer to 2 than it is to 4, which works fine for ordered categories (such as Good, Great, Best etc.)
#however inland(1) is much closer to near bay(3) than it is to in ocean(0), which might cause a problem
#One way to resolve this is to add one column per category, reducing each to a simple yes(Hot) or no(Cold) question, with yes indicated by the number 1 and no by 0
from sklearn.preprocessing import OneHotEncoder
#double brackets keep the selection a one-column DataFrame (2D) rather than a Series
housing_cat=housing[['ocean_proximity']]
cat_encoder=OneHotEncoder()
#learn the categories first, then encode the same column
cat_encoder.fit(housing_cat)
housing_cat_1hot=cat_encoder.transform(housing_cat)
#print(type(housing_cat_1hot))
#the result is a sparse matrix rather than an ndarray, which saves memory when there are thousands of category columns; to get a dense ndarray simply call toarray()
#print(housing_cat_1hot.toarray())


In [54]:
#esc+r to deactivate esc+y to activate 
#Custom Transformers
#Although Scikit-Learn gives us some useful transformers, you will need to write your own for tasks such as custom cleanup operations or combining specific attributes
#We need our new transformer to work with Scikit-Learn functionality (such as pipelines); luckily, since Scikit-Learn relies on duck typing and not inheritance, all we need to do is implement three methods (fit, transform, fit_transform) in our new transformer class
#You get fit_transform() for free if you include TransformerMixin as a parent class. Also, if you add BaseEstimator as a parent (and avoid *args and **kwargs in your constructor) you will get two extra methods
#(get_params() and set_params()) which will aid in hyperparameter tuning
from sklearn.base import BaseEstimator, TransformerMixin

#column positions of the raw attributes inside the array handed to the transformer
rooms_index=3
bedrooms_index=4
population_index=5
households_index=6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio attributes derived from the rooms, bedrooms, population and households columns.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When True a third derived column, bedrooms per room, is appended as well.
    """
    def __init__(self,add_bedrooms_per_room=True):
        self.add_bedrooms_per_room=add_bedrooms_per_room
    def fit(self,X,y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self
    def transform(self, X, y=None):
        """Return X with the derived columns appended on the right.

        X is indexed as X[:, column], so it must be a 2D array.
        # NOTE(review): presumably the columns follow the order of housing_num - confirm against the caller
        The appended columns are, in order: rooms per household, population per
        household and (optionally) bedrooms per room.
        """
        households=X[:,households_index]
        rooms=X[:,rooms_index]
        derived=[rooms/households,X[:,population_index]/households]
        if self.add_bedrooms_per_room:
            derived.append(X[:,bedrooms_index]/rooms)
        #np.c_ concatenates along the second axis, turning each 1D ratio into a column
        return np.c_[(X,*derived)]

In [55]:
#esc+r to deactivate esc+y to activate
#Perhaps one of the most useful techniques in Machine Learning is Feature Scaling: with very few exceptions, machine learning models do not perform well when the attributes have vastly different magnitudes from each other
#Two of the means sklearn provides are MinMax scaling and Standardisation
#MinMax scaling is the simpler of the two: values are rescaled to range from 0 to 1 by subtracting the min value and dividing by the max minus the min; this method is also called normalisation
#however, this method is very susceptible to outliers: case in point, if the max value of a dataset is 100 while everything else falls below 15, then most of the dataset will fall within 0-0.15, which is not ideal
#for this purpose sklearn also hands us the StandardScaler, which is much less affected by outliers but does not bound values to any specific range, which can be a problem for algorithms that expect inputs within a fixed range
#first it subtracts the mean value (so standardised values always have zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance; this method is called Standardisation
#Transformation Pipelines
#we often have to apply sequential transformations to our dataset, to streamline the process sklearn provides us with pipelines that could apply all transformations required in a sequential manner
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#numeric preprocessing, applied in order: fill missing values with the median, append the ratio attributes, then standardise
numeric_steps=[('imputer',SimpleImputer(strategy='median')),
               ('attribs-adder',CombinedAttributesAdder()),
               ('std_scaler',StandardScaler())]
num_pipeline=Pipeline(numeric_steps)
#every step except the last must be a transformer; the step names can be anything as long as they do not contain double underscores
#calling the pipeline's fit() runs fit_transform() on each transformer in turn and finishes with a plain fit() on the last step; the pipeline's own transform() and fit_transform() end with the corresponding method of the last step
#the dataset mixes numerical and categorical columns that need different treatment; ColumnTransformer applies each transformer to its own list of columns so the whole frame is handled in one call
from sklearn.compose import ColumnTransformer
num_attribs=housing_num.columns.tolist()
cat_attribs=['ocean_proximity']

column_steps=[('num',num_pipeline,num_attribs),
              ('cat',OneHotEncoder(),cat_attribs)]
full_pipeline=ColumnTransformer(column_steps)
housing_final=full_pipeline.fit_transform(housing)
#each transformer runs separately on its columns and the outputs are concatenated along the second axis; when sparse and dense outputs are mixed, ColumnTransformer only returns a sparse matrix if the density of the combined result falls below a threshold, which by default is 0.3
#in this case the pipeline returns a dense matrix (i.e. a 2D array)

In [56]:
#esc+r to deactivate esc+y to activate
#5) Select and Train a Model
#Sklearn comes with it's own inbuilt regression models, for starters let's train a linear regression model
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(housing_final,housing_labels)
#..and done; sanity-check the model on the first few rows of the training set
some_data=housing.iloc[:5]
some_labels=housing_labels.iloc[:5]
#transform() only: reuse the preprocessing already fitted on the training data
some_data_prepped=full_pipeline.transform(some_data)
print('Predictions: ',lin_reg.predict(some_data_prepped))
print('Labels: ', list(some_labels))
#it works, though the predictions are not very accurate; to quantify the error, compute the RMSE over the whole training set
from sklearn.metrics import mean_squared_error
housing_predictions=lin_reg.predict(housing_final)
lin_mse=mean_squared_error(housing_labels,housing_predictions)
lin_rmse=np.sqrt(lin_mse)
print(lin_rmse)
#better than nothing but still not good: this is a classic example of a model underfitting the data. There are three main ways to fix this:
#  - reduce the constraints on the model, which is ruled out since the model is not regularised in the first place
#  - feed it better attributes, which is far too difficult
#  - try a more powerful model: let's train a DecisionTreeRegressor, which is capable of finding non-linear relationships
from sklearn.tree import DecisionTreeRegressor

#random_state pins the tree's internal randomness so that re-running the notebook builds the same tree
#(42 matches the seed used for the train/test split)
tree_reg=DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_final,housing_labels)
housing_predictions=tree_reg.predict(housing_final)
tree_mse=mean_squared_error(housing_labels,housing_predictions)
tree_rmse=np.sqrt(tree_mse)
print(tree_rmse)
#an RMSE of zero on the training data does not mean the model is perfect; it is far more likely that the tree has drastically overfit the data

Predictions:  [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]
Labels:  [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]
68627.87390018745
0.0


In [58]:
##esc r to deactivate and esc y to activate
#last part of the project: evaluate the chosen model on the held-out test set
#NOTE(review): 'my_model.pkl' is not written by any of the cells shown here - presumably it was saved by an earlier cell or session; confirm the file exists before running
#NOTE(review): joblib.load unpickles the file, so only load model files that come from a trusted source
final_model=joblib.load('my_model.pkl')
Y_test=test_set['median_house_value'].copy()
X_test=test_set.drop('median_house_value',axis=1)
#transform() only, never fit: the test set must be prepared with the preprocessing already fitted on the training data
X_test_prepared=full_pipeline.transform(X_test)
final_predictions=final_model.predict(X_test_prepared)
final_mse=mean_squared_error(Y_test,final_predictions)
final_rmse=np.sqrt(final_mse)
print(final_rmse)#about 48,109
#A point estimate of the generalisation error may not be enough to convince you to launch: what if the new model is just 0.1% better than the model currently in production?
#In such cases the precision of the estimate matters, so it is useful to compute a confidence interval (usually 95%) around it
from scipy import stats
confidence=0.95
squared_errors=(final_predictions - Y_test)**2
#t-interval for the mean squared error, centred on the sample mean with the standard error of the mean as its scale
mse_interval=stats.t.interval(confidence,
                              len(squared_errors)-1,
                              loc=squared_errors.mean(),
                              scale=stats.sem(squared_errors))
#square-root the bounds to express the interval on the RMSE scale
print(np.sqrt(mse_interval))
#Interpretation: this is a 95% confidence interval for the generalisation RMSE - intervals built this way from fresh test sets would contain the true error about 95% of the time

48109.13002677952
[46111.59170126 50026.97167172]
