In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the housing CSV from the Kaggle input directory into a DataFrame.
housing = pd.read_csv("../input/california-housing-prices/housing.csv")

In [None]:
# Preview the first few rows of the dataset.
housing.head()

All the columns are numerical except ocean_proximity, which is a categorical feature.

In [None]:
# Column dtypes and non-null counts; a non-null count below the row count reveals missing values.
housing.info()

We can see that total_bedrooms is the only column with missing (null) values; every other column is fully populated.

In [None]:
# Count how many rows fall into each ocean_proximity category.
housing["ocean_proximity"].value_counts()

As ocean_proximity is a categorical feature, we look at the number of districts in each category.
We can see that districts less than an hour from the ocean (`<1H OCEAN`) are the most common, followed by `INLAND`, while `ISLAND` districts are by far the rarest. This makes sense, as most people in California live close to the coast and only a few (presumably wealthy) people live on islands.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns.
housing.describe()

* We can see that 25% of the districts have a median housing age below 18 years
* 50% of the districts have a median housing age below 29 years
* 75% of the districts have a median housing age below 37 years

This tells us that most districts (about 75%) have a median housing age of 18 years or more.

In [None]:
import matplotlib.pyplot as plt
# One histogram per numerical column, 50 bins each, on a single 20x15 figure.
housing.hist(bins=50, figsize=(20,15))
plt.show()

From the above histograms we can see that:
* Most of the features are tail-heavy (they extend much farther to one side of the median than the other), so we will need to transform them to get more bell-shaped distributions.
* The median income is not expressed in US dollars: the data collector rescaled it into smaller units, so a value of 4 means roughly 40,000 USD.
* housing_median_age also looks like it has been capped.
* All the features have very different scales

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; random_state pins the shuffle so the split is reproducible.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# The test set was originally bound to the misspelled name `teat_set`;
# keep that name as an alias so any cell still referring to it keeps working.
teat_set = test_set

In [None]:
# Bucket median_income into five ordered categories (1-5); the last bin is open-ended.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)


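We stratify the split on income_cat so that the train and test sets keep the same income distribution as the full dataset, which avoids sampling bias.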

In [None]:
# Histogram of how many rows fall into each income category.
housing["income_cat"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 split keyed on income_cat, with a fixed seed for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row numbers, so select with .iloc. Label-based .loc
# only gives the same rows while the frame still has its default 0..n-1 index.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# Share of test-set rows in each income category (category counts divided by the test-set size).
strat_test_set["income_cat"].value_counts() / len(strat_test_set)


In [None]:
# income_cat was only a helper for stratifying the split, so remove it from both sets.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second run no longer raises KeyError on the missing column.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
# Explore on a copy so that changes made to `housing` do not modify strat_train_set.
housing = strat_train_set.copy()

In [None]:
# Plot every row by its coordinates (longitude on x, latitude on y).
housing.plot.scatter(x="longitude", y="latitude")


This is the distribution of houses around California, but we cannot tell which areas have more houses. Let's make a small tweak.

In [None]:
# Same geographic scatter, with mostly transparent markers so overlapping points show up darker.
housing.plot.scatter(x="longitude", y="latitude", alpha=0.1)


Now that’s much better: you can clearly see the high-density areas, namely the Bay
Area and around Los Angeles and San Diego, plus a long line of fairly high density in
the Central Valley, in particular around Sacramento and Fresno.

In [None]:
# Geographic scatter where marker size follows the population column (divided by 100)
# and marker colour follows median_house_value on the "jet" colour map.
marker_sizes = housing["population"] / 100
housing.plot.scatter(
    x="longitude",
    y="latitude",
    s=marker_sizes,
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="population",
    figsize=(10, 7),
)
plt.legend()


The plot clearly shows that house prices depend strongly on location: most of the red and yellow patches (the expensive districts) are close to the ocean.

In [None]:
# Correlate numeric columns only: ocean_proximity holds text, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions silently skipped it).
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# Rank every attribute by its correlation with median_house_value, largest value first.
corr_matrix["median_house_value"].sort_values(ascending = False)

From the correlation matrix, we can see that median_house_value is strongly and positively correlated with median_income, and negatively correlated with latitude, which means prices tend to decrease slightly as we go north.

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots for the target and three other attributes.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
scatter_matrix(housing[attributes], figsize=(12, 8))


We can see that most of the plots show very little correlation with median_house_value; the exception is median_income, so we'll explore it further.

In [None]:
# Median income against median house value, with transparent markers to reveal dense regions.
housing.plot.scatter(
    x="median_income",
    y="median_house_value",
    alpha=0.1,
)


The plot shows a strong correlation, but there are still some inconsistencies in it: there are horizontal lines at around 500,000 and 350,000, and a few more further down. These will affect the performance of the model, so we'll try to deal with this problem.

In [None]:
# Derived ratio columns: new column name -> (numerator column, denominator column).
ratio_features = {
    "rooms_per_household": ("total_rooms", "households"),
    "bedrooms_per_room": ("total_bedrooms", "total_rooms"),
    "population_per_household": ("population", "households"),
}
for new_column, (numerator, denominator) in ratio_features.items():
    housing[new_column] = housing[numerator] / housing[denominator]


I created these custom attributes because some of the original attributes are not very informative on their own.
For example, the total_rooms of a whole district does not help much, but dividing it by the number of households gives the average number of rooms per household in that district.
bedrooms_per_room and population_per_household were created in the same way, to make the raw totals easier to interpret.


In [None]:
# Recompute the correlations now that the ratio columns exist. Only numeric columns
# are used: ocean_proximity holds text, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 (older versions silently skipped it).
corr_matrix = housing.select_dtypes(include="number").corr()
corr_matrix['median_house_value'].sort_values(ascending = False)

So the custom attributes work! bedrooms_per_room is negatively correlated with median_house_value. This means that the lower the bedroom-to-room ratio of the houses in a district, the more expensive they tend to be. Interesting!

# Data Cleaning
                                               

In [None]:
# Separate the target from the predictors: labels go in `housing_labels`,
# everything else stays in `housing`. strat_train_set itself is left untouched.
target_column = "median_house_value"
housing_labels = strat_train_set[target_column].copy()
housing = strat_train_set.drop(columns=target_column)

Let's first separate the target attribute (median_house_value) from the predictors before cleaning.

In [None]:
from sklearn.impute import SimpleImputer
# Imputer configured to fill missing values with the median of each column.
imputer = SimpleImputer(strategy="median")

We use a SimpleImputer to fill in the missing values with the median of each column.

In [None]:
# Numerical-only view of the predictors: everything except the text column ocean_proximity.
housing_num = housing.drop(columns=['ocean_proximity'])

We drop the non-numerical attribute, because the median can only be computed for numerical values.

In [None]:
# Fit the imputer on the numerical columns (it was configured with strategy="median").
imputer.fit(housing_num)

In [None]:
# Inspect the fill value the fitted imputer stored for each column.
imputer.statistics_

In [None]:
# Apply the fitted imputer to housing_num; the transformed data is stored in X.
X = imputer.transform(housing_num)

In [None]:
# Display the transformed data.
X

In [None]:
# Wrap the imputed values back into a DataFrame. Passing housing_num's index matters:
# without it the new frame gets a fresh 0..n-1 index and no longer lines up
# row-for-row with `housing` / `housing_labels`, whose index comes from the shuffled split.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

Converting the transformed array back into a DataFrame.

In [None]:
# Display the imputed DataFrame.
housing_tr

In [None]:
# Check the dtypes and non-null counts of the imputed DataFrame.
housing_tr.info()

In [None]:
# Double brackets select ocean_proximity as a one-column DataFrame rather than a Series.
housing_cat = housing[['ocean_proximity']]

In [None]:
# Preview the first 10 category values.
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Encoder with default settings, used to turn the category column into numeric codes.
ordinal_encoder = OrdinalEncoder()


In [None]:

# Fit the ordinal encoder on the category column and encode it in one step.
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

In [None]:
# Show the first 10 encoded values.
housing_cat_encoded[:10]

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode ocean_proximity: learn the categories first, then encode the column.
cat_encoder = OneHotEncoder()
cat_encoder.fit(housing_cat)
housing_cat_1hot = cat_encoder.transform(housing_cat)
housing_cat_1hot

In [None]:
# Convert the encoded result to a dense array for display.
housing_cat_1hot.toarray()