In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, so the dataset location can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 

**Load the data**

In [None]:
# Read the California housing CSV into a DataFrame and preview the first rows.
housing=pd.read_csv('../input/california-housing-prices/housing.csv')
housing.head()

In [None]:
# Column names, dtypes and non-null counts of the DataFrame.
housing.info()

The dtype of "ocean_proximity" is object; since the data was loaded from a CSV file, it must be a text attribute.

In [None]:
# Inspect the raw values of the text column.
housing['ocean_proximity']

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
housing.describe()


In [None]:
# One histogram per numeric column (50 bins each) on a single 20x15 figure.
housing.hist(bins=50,figsize=(20,15))
plt.show()

In [None]:
def split_train_test(data,test_ratio,seed=42):
    """Randomly split ``data`` into a training set and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Any object supporting ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of the rows to place in the test set; must lie in [0, 1].
    seed : int, default 42
        Seed of the private random generator used for shuffling. The default
        reproduces the split this function has always produced.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` as two row subsets of ``data``.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside [0, 1].
    """
    if not 0<=test_ratio<=1:
        raise ValueError("test_ratio must be between 0 and 1, got %r" % (test_ratio,))
    # A private RandomState yields the same permutation as
    # np.random.seed(seed) + np.random.permutation(...), but does not reset
    # the global NumPy RNG that the rest of the notebook may rely on.
    rng=np.random.RandomState(seed)
    shuffled_indices=rng.permutation(len(data))
    test_set_size=int(len(data)*test_ratio)
    test_indices=shuffled_indices[:test_set_size]
    train_indices=shuffled_indices[test_set_size:]
    return data.iloc[train_indices],data.iloc[test_indices]

In [None]:
# Hold out 20% of the rows as a test set with split_train_test, then preview
# the training part.
train_set,test_set=split_train_test(housing,0.2)
train_set.head()


In [None]:
# Sanity check: report the sizes of the two subsets.
print(len(train_set),'train +',len(test_set),'test')

In [None]:
import seaborn as sns
# Correlation is only defined for numeric columns. Restrict the frame
# explicitly: recent pandas versions raise on the text column
# 'ocean_proximity' instead of silently dropping it.
sns.heatmap(housing.select_dtypes(include=[np.number]).corr(),vmin=-1,vmax=1,annot=True)

As we can see, there is a strong relationship between median_house_value and median_income.

So, let's create an income category attribute by dividing the median income by 1.5 and rounding up.

In [None]:
# Histogram of the median_income column (30 bins).
housing['median_income'].hist(bins=30,figsize=(20,15))

Most median income values are clustered around 2-5, but some incomes go far beyond 6.

In [None]:
# Bucket median_income into categories: divide by 1.5 and round up.
housing['income_cat']=np.ceil(housing['median_income']/1.5)

In [None]:
# Display the new income_cat column.
housing.income_cat


Let's merge all the categories greater than 5 into category 5.


In [None]:
# Keep categories below 5 and replace everything else with 5.0.
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: the chained in-place form is deprecated and leaves
# `housing` unchanged when pandas copy-on-write is active.
housing['income_cat']=housing['income_cat'].where(housing['income_cat']<5,5.0)

In [None]:
# Display income_cat again to check the capping step.
housing.income_cat



In [None]:
# List the column names currently in the DataFrame.
housing.columns



> Visualize our data

In [None]:
# Scatter plot of longitude vs. latitude; the low alpha makes overlapping
# (dense) points appear darker.
housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1)

===> It looks like California: we can see the high-density areas, namely the Bay Area and the areas around Los Angeles and San Diego.

In [None]:
# Same geographic scatter plot, with marker size taken from population/100
# and marker colour from median_house_value (jet colormap, with colorbar).
housing.plot(kind='scatter',x='longitude',y='latitude',s=housing['population']/100,c="median_house_value",label='population',cmap=plt.get_cmap("jet"),colorbar=True,alpha=0.1)
plt.legend()

The housing prices are very much related to the location and to the population density.

In [None]:
# Pairwise correlations between the numeric columns. The frame is restricted
# explicitly because recent pandas versions raise on the text column
# 'ocean_proximity' instead of silently dropping it.
corr_matrix=housing.select_dtypes(include=[np.number]).corr()
print(corr_matrix)


Let's see how much each attribute correlates with the median house value.

In [None]:
# Correlation of every attribute with median_house_value, largest first.
corr_matrix['median_house_value'].sort_values(ascending=False)

As we see, the most promising attribute for predicting the median house value is the median income (0.688), so let's zoom in on their correlation scatter plot.

In [None]:
# Scatter plot of median_income against median_house_value.
housing.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.1)

this correlation is very strong, the points are not too dispersed 

Now, after discovering our data , we want to clean it !

In [None]:
# Preview the first rows again, now including the income_cat column.
housing.head()

The total number of rooms doesn't tell us much on its own, but I think the number of rooms per household will help us!

In [None]:
# Derive per-household / per-room ratios from the raw district totals.
housing['rooms_per_household']=housing['total_rooms']/housing['households']
# Share of rooms that are bedrooms: the denominator must be total_rooms
# (dividing total_bedrooms by itself only ever yields 1.0 or NaN).
housing['bedrooms_per_room']=housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household']=housing['population']/housing['households']

Let's look at the correlation matrix again

In [None]:
# Recompute the correlations now that the ratio features exist. Only numeric
# columns are used because recent pandas versions raise on the text column
# 'ocean_proximity' instead of silently dropping it.
corr_matrix=housing.select_dtypes(include=[np.number]).corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

Not bad ! , as we see the new rooms_per_household  is much more correlated with the median house value

Data Cleaning:

In [None]:
# Separate the features (X) from the prediction target (Y).
X=housing.drop('median_house_value',axis=1)
Y=housing['median_house_value']

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split; random_state=42 fixes the shuffle.
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:
# Column dtypes and non-null counts of the training features.
x_train.info()

In [None]:
from sklearn.impute import SimpleImputer
# Imputer that fills missing values with each column's median.
imputer = SimpleImputer(strategy='median')

In [None]:
# The median can only be computed on numerical attributes,
# so the text column ocean_proximity is dropped before fitting the imputer.
housing_num=x_train.drop('ocean_proximity',axis=1)
imputer.fit(housing_num)

In [None]:
# Per-column medians learned by the imputer.
imputer.statistics_

In [None]:
# The same medians computed directly with pandas, for comparison.
housing_num.median().values

Now , we will use the trained imputer to transform the training set by replacing the missing values

In [None]:
# Replace the missing values in the numeric training features with the
# learned medians.
housing_imputed=imputer.transform(housing_num)

In [None]:
# Display the imputed values.
housing_imputed

This returns a NumPy array; let's put it back into a pandas DataFrame.

In [None]:
# Rebuild a DataFrame from the imputed array. Reuse the original row index
# as well as the column names so the rows stay aligned with x_train/y_train
# (the default RangeIndex would silently break label-based alignment).
housing_trainig=pd.DataFrame(housing_imputed,columns=housing_num.columns,index=housing_num.index)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Instantiate the encoder.
encoder=LabelEncoder()
# Extract the column that is going to be encoded.
housing_cat=x_train['ocean_proximity']
# Fit the encoder on the category strings and convert them to integer codes.
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_cat_encoded

In [None]:
# Categories learned by the label encoder.
print(encoder.classes_)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer codes. The codes are reshaped into a single
# column (n_samples, 1) before being passed to the encoder.
# NOTE: this rebinds `encoder`, replacing the LabelEncoder from the cell above.
encoder=OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

In [None]:
# Convert the encoded result to a dense array for display.
housing_cat_1hot.toarray()

In [None]:
from sklearn.preprocessing import LabelBinarizer
# LabelBinarizer is applied directly to the text categories, without the
# intermediate integer-code step. This rebinds both `encoder` and
# `housing_cat_1hot`.
encoder=LabelBinarizer()
housing_cat_1hot=encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
from sklearn.base import BaseEstimator,TransformerMixin
# Column positions of the raw count attributes inside the feature array
# passed to CombinedAttributesAdder.transform.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    """Append ratio features computed from the raw count columns.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as a last column when ``add_bedrooms_per_room`` is true.
    The input is indexed positionally (``X[:, i]``), so it must be a 2-D array
    whose columns follow the ``*_ix`` positions defined above.
    """

    def __init__(self,add_bedrooms_per_room=True):
        self.add_bedrooms_per_room=add_bedrooms_per_room

    def fit(self,X,y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households=X[:,household_ix]
        extra_columns=[
            X[:,rooms_ix]/households,
            X[:,population_ix]/households,
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:,bedrooms_ix]/X[:,rooms_ix])
        # np.c_ turns each 1-D ratio into a column and stacks it next to X.
        return np.c_[(X,*extra_columns)]
        
# Try the transformer on the full dataset, without the bedrooms_per_room
# column. fit() only returns self for this transformer, so transform() is
# called directly.
attr_adder=CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attributes=attr_adder.transform(housing.values)


> **Feature Scaling**

In [None]:
# Preview the numeric training features to compare their value ranges.
housing_num.head()

As we see, the numerical attributes have different scales; this will cause a problem when we use Machine Learning algorithms.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Numeric preprocessing chain: fill missing values with the column median,
# append the ratio features, then standardize every column.
num_pipeline=Pipeline([('imputer',SimpleImputer(strategy='median')),
                  
                        ('attribs_addr',CombinedAttributesAdder()),
                        ('std_scaler',StandardScaler())
                       ])

In [None]:
# Fit the numeric pipeline on the numeric training features and transform them.
housing_num_tr=num_pipeline.fit_transform(housing_num)

Now , let's come back to categorical columns

In [None]:
from sklearn.pipeline import FeatureUnion

In [None]:
class DataFrameSelector(BaseEstimator,TransformerMixin):
    """Select a fixed set of columns from a DataFrame and return their values.

    Lets a pipeline that works on arrays start from a DataFrame: the chosen
    columns are extracted with ``X[attribute_names]`` and handed on as the
    underlying ``.values`` array.
    """

    def __init__(self,attribute_names):
        self.attribute_names=attribute_names

    def fit(self,X,y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self,X):
        """Return the values of the configured columns of ``X``."""
        selected=X[self.attribute_names]
        return selected.values
    

In [None]:
class SupervisionFriendlyLabelBinarizer(LabelBinarizer):
    """LabelBinarizer whose ``fit_transform`` accepts an optional ``y``.

    The extra ``y`` argument is accepted and ignored, so the binarizer can be
    called with the ``(X, y)`` signature; only ``X`` is forwarded to the
    parent implementation.
    """

    def fit_transform(self, X, y=None):
        """Binarize ``X``; ``y`` is ignored."""
        return super().fit_transform(X)


In [None]:
# Column names routed to the numeric and to the categorical branch.
num_attribs=list(housing_num)
cat_attribs=['ocean_proximity']
# Numeric branch: select the numeric columns from the DataFrame, impute,
# append the ratio features and standardize.
num_pipeline=Pipeline([
    ('selector',DataFrameSelector(num_attribs)),
    # Use the median explicitly, consistent with the imputer fitted earlier
    # in the notebook; a bare SimpleImputer() falls back to the mean.
    ('imputer',SimpleImputer(strategy='median')),
    ('attribs_adder',CombinedAttributesAdder()),
    ('std_scaler',StandardScaler()),
])

In [None]:
# Categorical branch: select the text column, then binarize its labels.
cat_pipeline=Pipeline([
    ('selector',DataFrameSelector(cat_attribs)),
    ('label_binarizer',SupervisionFriendlyLabelBinarizer()),
    
])

In [None]:
# Run both branches on the same input and concatenate their outputs
# side by side.
full_pipeline=FeatureUnion(transformer_list=[
    ('num_pipeline',num_pipeline),
    ('cat_pipeline',cat_pipeline)
])

In [None]:
#Let's run the whole pipeline

# Fit the complete preprocessing pipeline on the training features and
# transform them in one step.
housing_prepared=full_pipeline.fit_transform(x_train)

In [None]:
# Display the prepared feature matrix.
housing_prepared

In [None]:
from sklearn.linear_model import LinearRegression
# Instantiate the model and fit it on the prepared training data.
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,y_train)

In [None]:
# Spot check on the first five training rows: push them through the already
# fitted pipeline (transform only) and print the model's predictions.
some_data=x_train.iloc[:5]
some_labels=y_train.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)
print('Predictions :\t',lin_reg.predict(some_data_prepared))

In [None]:
# True target values for the same five rows, for comparison with the
# predictions (label typo "Labes" corrected).
print('Labels :\t',list(some_labels))

Let's evaluate:

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE of the linear model measured on the training set itself.
housing_predictions=lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(y_train,housing_predictions)
print(np.sqrt(lin_mse))

Let's select a more powerful model: "DecisionTreeRegressor".

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit a decision tree on the prepared training data. random_state is fixed so
# that the tree, and every score derived from it, is reproducible across runs.
tree_reg=DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared,y_train)

In [None]:
#Let's Predict
# Predict on the training data with the fitted tree.
housing_predictions=tree_reg.predict(housing_prepared)

In [None]:
#Let's evaluate
# Mean squared error of the tree on the training set.
tree_mse=mean_squared_error(y_train,housing_predictions)

In [None]:
# Training RMSE of the tree.
print(np.sqrt(tree_mse))

==> This is not good at all: it means we are suffering from overfitting. To evaluate the model more reliably, we are going to use scikit-learn's cross-validation.

In [None]:
from sklearn.model_selection import cross_val_score


display the scores

In [None]:
def display_scores(scores):
    """Print an array of scores followed by its mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    report=(
        ("Scores",scores),
        ("Mean",scores.mean()),
        ("Standard Deviation :",scores.std()),
    )
    for label,value in report:
        print(label,value)

In [None]:
# 10-fold cross-validation of the decision tree. The scoring is *negative*
# MSE, so the sign is flipped before taking the square root to get RMSE.
mse_tree_scores=cross_val_score(tree_reg,housing_prepared,y_train,scoring='neg_mean_squared_error',cv=10)
rmse_tree_scores=np.sqrt(-mse_tree_scores)
display_scores(rmse_tree_scores)

In [None]:
# Same 10-fold cross-validation for the linear model (negative MSE -> RMSE).
mse_lin_scores=cross_val_score(lin_reg,housing_prepared,y_train,scoring='neg_mean_squared_error',cv=10)
rmse_lin_scores=np.sqrt((-mse_lin_scores))
display_scores(rmse_lin_scores)


Both models perform poorly, so let's try another one: RandomForestRegressor.

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest on the prepared training data. random_state is fixed so
# that the forest, and every score derived from it, is reproducible across runs.
forest_reg=RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared,y_train)

In [None]:
#Let's predict
housing_predictions=forest_reg.predict(housing_prepared)
mse_forest=mean_squared_error(y_train,housing_predictions)
print(np.sqrt(mse_forest))

In [None]:
# 10-fold cross-validation of the random forest (negative MSE -> RMSE).
mse_forest_scores=cross_val_score(forest_reg,housing_prepared,y_train,scoring='neg_mean_squared_error',cv=10)
rmse_forest_scores=np.sqrt((-mse_forest_scores))
display_scores(rmse_forest_scores)

Fine tune our model

In [None]:
from sklearn.model_selection import GridSearchCV
# Two sub-grids are searched: 3x4 combinations with the default bootstrap
# setting, then 2x3 combinations with bootstrap disabled.
param_grid=[{'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
            {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]},]
# random_state is fixed so the selected hyper-parameters do not change from
# one run of the notebook to the next.
forest_reg=RandomForestRegressor(random_state=42)
# 5-fold cross-validation for every combination, scored by negative MSE.
grid_search=GridSearchCV(forest_reg,param_grid,cv=5,scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared,y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated score.
grid_search.best_params_

In [None]:
# The estimator configured with the best hyper-parameters.
grid_search.best_estimator_

In [None]:
# Print the cross-validated RMSE of every combination tried. The stored
# scores are negative MSE, hence the sign flip before the square root.
scores=grid_search.cv_results_
for mean_score,params in zip(scores['mean_test_score'],scores['params']):
    print(np.sqrt(-mean_score),params)

===> As we see, the best score is obtained with max_features set to 6 and n_estimators set to 30.

In [None]:
# Keep the best estimator found by the grid search as the final model.
final_model=grid_search.best_estimator_

In [None]:
# Apply the already fitted pipeline to the test features (transform only,
# no refitting on test data).
x_test_prepared=full_pipeline.transform(x_test)

In [None]:
#Now, let's predict 
final_predictions=final_model.predict(x_test_prepared)
#Let's evaluate our model
final_mse=mean_squared_error(y_test,final_predictions)
print(np.sqrt(final_mse))

Let's save our model

In [None]:
import joblib
# Persist the tuned model to disk with joblib.
# The variable was previously misspelled `filname`, so the dump call used
# whatever `filename` happened to hold (a leftover loop variable from the
# first cell) or raised a NameError.
# NOTE(review): the file is a joblib pickle, not CSV; the '.csv' extension is
# kept unchanged here but is misleading.
filename='finalized_model.csv'
joblib.dump(final_model, filename)