In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
housing_df=pd.read_csv(r"C:\Users\joseb\Documents\GitHub\Machine-Learning-Python\California Housing Prices\housing.csv")
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [2]:
#Info about the variables in the dataset
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  int64  
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [None]:
#How many distinct values exist in each category of ocean proximity 
housing_df["ocean_proximity"].value_counts()

In [4]:
#Summary of numerical features
housing_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [None]:
#Histogram of numerical variables
housing_df.hist()
plt.show()

# Summary of issues

- The median house value , median income and housing median age have been pre capped . Since house value is the target feature this might present a problem 

- All the distributions are skewed which could present a problem for machine learning algorithms

- Features have different scales.


In [5]:
#Creation of a test set
def split_train_test(data,test_ratio):
    shuffled_indices=np.random.permutation(len(data))#randomizes the order in the dataset
    test_set_size=int(len(data)*test_ratio)#estimates the size of the test set
    test_indices=shuffled_indices[:test_set_size]#Paritions the test set
    train_indices=shuffled_indices[test_set_size:]#Partitions the training set
    return data.iloc[train_indices],data.iloc[test_indices]
train_set,test_set=split_train_test(housing_df,0.2)
print(len(train_set), "train +", len(test_set),"test")

train_set.head()

16512 train + 4128 test


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
4734,-118.38,34.05,52,2053,480.0,900,417,3.0707,417900,<1H OCEAN
7328,-118.17,33.97,33,2410,641.0,2106,593,2.2422,168200,<1H OCEAN
16989,-122.26,37.54,16,2118,333.0,770,318,7.2477,376000,NEAR BAY
3286,-122.53,39.09,11,1264,271.0,370,177,1.3,69700,INLAND
423,-122.26,37.88,52,2551,417.0,894,404,6.2425,391800,NEAR BAY


# Stratified test set sampling

For the excercise , the median income continous variable will be divided ("binned") into 5 distinct categories 

In [17]:
#Creates an income category attribute by dividing the median income by 1.5, rounds up using ceil to have discrete categories
housing_df["income_cat"]=np.ceil(housing_df["median_income"]/1.5)

In [18]:
#Merges all the categories greater than 5 into category 5 
housing_df["income_cat"].where(housing_df["income_cat"]<5,5.0,inplace=True)

The idea behind the stratified shuffling is split the dataset in a non-random way to avoid inducing a sampling bias. 

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit

In [19]:
split =StratifiedShuffleSplit(n_splits=1, test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing_df,housing_df["income_cat"]):
    strat_train_set=housing_df.loc[train_index]
    strat_test_set=housing_df.loc[test_index]

The for loop in the previous code block is traversing the housing_df "income_cat" variable in order to generate two indizes , train_index and test_index, then , it localizes the observations of each index to be added to new data frames (training and test)

In [20]:
housing_df["income_cat"].value_counts()/len(housing_df)


3.0    0.350581
2.0    0.318847
4.0    0.176308
5.0    0.114438
1.0    0.039826
Name: income_cat, dtype: float64

In [21]:
#Income cat is removed so the data is back to its original state
for set in (strat_train_set,strat_test_set):
    set.drop(["income_cat"],axis=1,inplace=True)

# Visualization

In the visualization part we galnce the data to get a further general understanding of the data that we are manipulating , for this stage the test set should be put aside and only the training set should be explored

In [22]:
housing_training=strat_train_set.copy()
housing_labels=strat_train_set["median_house_value"].copy()

In [None]:
housing_training.plot(kind="scatter",x="longitude",y="latitude",alpha=0.1)

For the next plot we are going to observe the density of registered district with a shade that illustrates the price scale

In [None]:
housing_training.plot(kind="scatter",x="longitude",y="latitude",alpha=0.4,
                     s=housing_training["population"]/100,label="population",
                     c="median_house_value",cmap=plt.get_cmap("jet"),colorbar=True,)
plt.legend()

In [None]:
corr_matrix=housing_training.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
attributes=["median_house_value","median_income","total_rooms","housing_median_age"]
scatter_matrix(housing_training[attributes],figsize=(12,8))


The most promising  feature to predict the median house value is the median income so an individual graphich is produced for this relationship

In [None]:
housing_training.plot(kind="scatter",x="median_income",y="median_house_value",alpha=1)

In [7]:
#Cleaning missing data using Imputer
from sklearn.impute import SimpleImputer as Imputer
imputer=Imputer(strategy="median") #Creates instance of imputer class
housing_num=housing_df.drop("ocean_proximity",axis=1)# Drops categorical feature
imputer.fit(housing_num)
imputer.statistics_

array([-1.1849e+02,  3.4260e+01,  2.9000e+01,  2.1270e+03,  4.3500e+02,
        1.1660e+03,  4.0900e+02,  3.5348e+00,  1.7970e+05])

In [None]:
housing_num.median().values

In [None]:
#Handling Text and Categorical Attributes 
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
housing_cat=housing_df["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_cat_encoded
print(encoder.classes_)

In [None]:
#One hot encoding for categorical variables
from sklearn.preprocessing import OneHotEncoder
encoder= OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot=housing_cat_1hot.toarray()
housing_cat_1hot

In [None]:
#Implementation of a data pipeline to perform transformations sequentially 
#Custom transformations 
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix,bedrooms_ix, population_ix,household_ix=3,4,5,6
class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room=add_bedrooms_per_room
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        rooms_per_household=X[:,rooms_ix]/X[:,household_ix]
        population_per_household=X[:,population_ix]/X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room=X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
        else:
            return np.c[X,rooms_per_household,population_per_household]
#Pipeline implementation 
attr_adder=CombinedAttributesAdder(add_bedrooms_per_room=False)

# Pipeline Implementation

In [9]:
"""
The numeric pipeline executes all the previously defined transformations in a sequential maner 
Selector - Converts a dataframe into a series by selecting key by key 
Imputer- Fills values according to predifined parameter
Attribs_adder is the custom transformation class to estimate parameters from the existing data
StandardScaler , standardizes the  the data
"""
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
#Custom transformer that selects desired attributes and drops the rest converting the DataFrame to a 
#NumPy array 
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names=attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values
rooms_ix,bedrooms_ix, population_ix,household_ix=3,4,5,6
class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room=add_bedrooms_per_room
    def fit(self,X,y=None):
        return self
    def transform(self,X,y=None):
        rooms_per_household=X[:,rooms_ix]/X[:,household_ix]
        population_per_household=X[:,population_ix]/X[:,household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room=X[:,bedrooms_ix]/X[:,rooms_ix]
            return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
        else:
            return np.c[X,rooms_per_household,population_per_household]
#Custom LabelSerializer class to overcome version bug 
class MyLabelBinarizer(TransformerMixin):
    def __init__(self,*args,**kwargs):
        self.encoder=LabelBinarizer(*args,**kwargs)
    def fit(self,x,y=0):
        self.encoder.fit(x)
        return self
    def transform(self,x,y=0):
        return self.encoder.transform(x)

attr_adder=CombinedAttributesAdder(add_bedrooms_per_room=False)
num_attributes=list(housing_num)
cat_attributes=["ocean_proximity"]
num_pipeline=Pipeline([
    ('selector',DataFrameSelector(num_attributes)),
    ('imputer',Imputer(strategy="median")),
    ('attribs_adder',CombinedAttributesAdder()),
    ('std_scaler',StandardScaler()),
])
"""
Categorical pipeline applies the LaberBinarizer which applies 1hot encoding to the categorical variables
"""
cat_pipeline=Pipeline([
    ("selector",DataFrameSelector(cat_attributes)),
    ('LabelBinarizer',MyLabelBinarizer()),
])

#Combined pipeline
full_pipeline=FeatureUnion(transformer_list=[
    ("num_pipeline",num_pipeline),
    ("cat_pipeline",cat_pipeline),
])
train_set.head()
housing_prepared=full_pipeline.fit_transform(train_set)
housing_prepared.shape

(16512, 17)

# Model Implementation 

In [23]:
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)

LinearRegression()

In [26]:
#Model Validation 
some_data= housing_df.iloc[:5]
some_labels=housing_labels.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)#Runs the transformation on the selected data
print("Predictions:\t",lin_reg.predict(some_data_prepared))
print("Labels:\t\t",list(some_labels))


Predictions:	 [205992.58045701 211149.8202649  204215.64311517 205603.48371149
 203744.80439842]
Labels:		 [286600, 340600, 196900, 46300, 254500]


In [27]:
#Model Evaluation 
from sklearn.metrics import mean_squared_error
housing_predictions =lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_labels,housing_predictions)
lin_rmse=np.sqrt(lin_mse)
lin_rmse

115629.83217387853

A more powerful model is implemented in order to improve the accuracy of the predictions. In general it is better to try an enhance the predictor quality rather than increasing the complexity of the algorithm 

In [30]:
from sklearn.tree import DecisionTreeRegressor
tree_reg=DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)


DecisionTreeRegressor()

In [35]:
#Model evaluation 
housing_predictions=tree_reg.predict(housing_prepared)
tree_mse=mean_squared_error(housing_labels,housing_predictions)
tree_rmse=np.sqrt(tree_mse)
tree_rmse

0.0

The absence of error suggests the model has badly overfitted the data so , a cross validation procedure will be implemented. The cross validation will further split the training data to use a portion of it for model validation 

In [36]:
from sklearn.model_selection import cross_val_score
scores=cross_val_score(tree_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
rmse_scores=np.sqrt(-scores)

In [37]:
def display_scores(scores):
    print("Scores: ",scores)
    print("Mean:",scores.mean())
    print("Standard deviation: ",scores.std())
    

In [38]:
display_scores(rmse_scores)

Scores:  [163999.47370024 165689.07874822 166412.71474574 171652.63066097
 167724.16611095 167579.71255299 174419.8622819  167103.93498818
 166087.97426183 166989.54470216]
Mean: 167765.90927531756
Standard deviation:  2891.4515504479136


The results of the cross validation indicate that the data is ,in fact, being overfitted , a second run of the linear model indicates that it has a better performance than the decision tree regressor

In [39]:
lin_scores=cross_val_score(lin_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=18)
lin_rmse_scores=np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores:  [114578.22377601 106763.49230455 118607.9496812  114090.16788428
 116346.01056322 115613.6577395  115880.9484492  119305.74412959
 113817.72437643 119671.38331522 115567.41408669 114934.88779225
 116142.25585424 116160.57116635 120869.09084229 113837.03012224
 116802.16653555 113769.31971082]
Mean: 115708.77990720146
Standard deviation:  2991.495405353437


A new model is tried , a random forest regressor, by ensemble learning 

In [43]:
from sklearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
forest_reg.fit(housing_prepared,housing_labels)
forest_scores=cross_val_score(forest_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=18)


  forest_rmse=np.sqrt(forest_scores)


array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan])

In [44]:
forest_rmse=np.sqrt(-forest_scores)
forest_rmse

array([117451.90085111, 109168.39811208, 119883.11534572, 116825.80651889,
       119862.74489969, 117980.72839661, 118745.22933382, 121289.26975856,
       117225.4492176 , 121344.3813781 , 118457.90872732, 118821.58581348,
       117390.90955896, 117796.99254541, 122340.0032819 , 116604.80310776,
       118818.43104326, 115786.24731127])

In [45]:
display_scores(forest_rmse)

Scores:  [117451.90085111 109168.39811208 119883.11534572 116825.80651889
 119862.74489969 117980.72839661 118745.22933382 121289.26975856
 117225.4492176  121344.3813781  118457.90872732 118821.58581348
 117390.90955896 117796.99254541 122340.0032819  116604.80310776
 118818.43104326 115786.24731127]
Mean: 118099.66140008534
Standard deviation:  2762.074132396007
