In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns

# Default text sizes applied to every matplotlib figure created below.
for rc_group, rc_settings in (
    ("font", {"size": 14}),
    ("axes", {"labelsize": 14, "titlesize": 14}),
    ("legend", {"fontsize": 14}),
    ("xtick", {"labelsize": 10}),
    ("ytick", {"labelsize": 10}),
):
    plt.rc(rc_group, **rc_settings)

# The CSV is read from data/housing/housing.csv, relative to the working directory.
datapath = Path("data") / "housing"
housing = pd.read_csv(datapath / "housing.csv")

# Number of rows per ocean_proximity value (this is the cell's displayed output).
# housing.describe()
housing["ocean_proximity"].value_counts()

In [None]:
# Histogram of every numeric column. Calling plt.show() renders the figure and
# keeps the array-of-Axes return value of DataFrame.hist out of the cell output.
housing.hist(bins=50, figsize=(12, 8))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Bucket median_income into five labelled categories (1-5); the final bin is
# open-ended so every income above 6 lands in category 5.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)
housing["income_cat"].value_counts()

In [None]:
# Bar chart of the number of districts per income category, ordered by category label.
housing["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Kategoria dochodów")
# The y-axis label previously had its two words run together ("Liczbadystryktów").
plt.ylabel("Liczba dystryktów")
plt.show()

In [None]:
# Both splits hold out 20% of the rows and share one seed, so they differ only
# in the sampling scheme.
split_options = {"test_size": 0.2, "random_state": 42}

# Purely random split.
df_train_rand_samp, df_test_rand_samp = train_test_split(housing, **split_options)

# Split stratified on the income category.
df_train, df_test = train_test_split(
    housing, stratify=housing["income_cat"], **split_options
)

In [None]:

def income_cat_proportions(data):
    """Return the share of rows in ``data`` for each ``income_cat`` value.

    The per-category counts from ``value_counts()`` are divided by
    ``len(data)``, i.e. by the total number of rows in ``data``.
    """
    category_counts = data["income_cat"].value_counts()
    return category_counts / len(data)

# NOTE(review): this repeats the random split that already produced
# df_train_rand_samp / df_test_rand_samp (same arguments and seed); train_set and
# test_set are not used in the cells visible here - confirm before removing.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Income-category shares in the full data set and in each of the two test sets.
compare_props = pd.DataFrame(
    {
        "Overall %": income_cat_proportions(housing),
        "Stratified %": income_cat_proportions(df_test),
        "Random %": income_cat_proportions(df_test_rand_samp),
    }
).sort_index()
compare_props.index.name = "Income Category"

# Relative deviation of each test set's share from the overall share.
for error_column, share_column in (
    ("Strat. Error %", "Stratified %"),
    ("Rand. Error %", "Random %"),
):
    compare_props[error_column] = (
        compare_props[share_column] / compare_props["Overall %"] - 1
    )

# Shown as percentages rounded to two decimals.
(compare_props * 100).round(2)

In [None]:
# NOTE(review): the drop of income_cat below is disabled, so the column stays in
# both df_train and df_test.
# for set_ in (df_train, df_test):
#     set_.drop("income_cat", axis=1, inplace=True)

# Work on a copy of the training split for exploration.
df_train_eda = df_train.copy()
df_train_eda.plot.scatter(x="longitude", y="latitude", grid=True)

In [None]:
# Geographic scatter plot: marker size follows population, color follows the
# median house value. Drawn from the training-set EDA copy (df_train_eda), like
# the other exploration cells, so that test rows are not inspected.
df_train_eda.plot(kind="scatter", x="longitude", y="latitude", grid=True,
                  s=df_train_eda["population"] / 100, label="population",
                  c="median_house_value", cmap="jet", colorbar=True,
                  legend=True, sharex=False, figsize=(10, 7))
plt.show()

In [None]:
# Pairwise Pearson correlations between the numeric columns of the EDA copy.
corr_matrix = df_train_eda.select_dtypes(include=np.number).corr(method="pearson")

In [None]:
# Correlation of every numeric column with the target, largest value first.
corr_matrix.loc[:, "median_house_value"].sort_values(ascending=False)

In [None]:
# Display the full correlation table.
corr_matrix

In [None]:
# Annotated heatmap of the Pearson correlations. Both ends of the color scale are
# pinned to the theoretical correlation range [-1, 1], matching the Spearman
# heatmap, so the colors do not depend on the data's own extrema.
sns.heatmap(corr_matrix, annot=True, annot_kws={"size": 8}, cmap='coolwarm',
            vmin=-1, vmax=1)
plt.show()

In [None]:
# Spearman correlations of the numeric columns, drawn as an annotated heatmap
# on a fixed [-1, 1] color scale.
corr_matrix_spearman = (
    df_train_eda
    .select_dtypes(include=np.number)
    .corr(method="spearman")
)
sns.heatmap(
    corr_matrix_spearman,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
    annot_kws={"size": 8},
)
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the target and three selected attributes. The data
# comes from the training-set EDA copy (df_train_eda), like the other
# exploration cells, so that test rows are not inspected.
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(df_train_eda[attributes], figsize=(12, 8))

plt.show()

In [None]:
# Median income against median house value; the low alpha makes dense regions
# stand out.
df_train_eda.plot.scatter(
    x="median_income",
    y="median_house_value",
    alpha=0.1,
    grid=True,
)

plt.show()

In [None]:
# Derived ratio features. They are added to df_train_eda (not to `housing`)
# because the correlation matrix in the next cell is computed from df_train_eda;
# columns added to `housing` after the copy was taken would never show up there.
df_train_eda["rooms_per_house"] = df_train_eda["total_rooms"] / df_train_eda["households"]
df_train_eda["bedrooms_ratio"] = df_train_eda["total_bedrooms"] / df_train_eda["total_rooms"]
df_train_eda["people_per_house"] = df_train_eda["population"] / df_train_eda["households"]

In [None]:
# Recompute the Pearson correlations of the numeric columns of the EDA copy and
# rank every column by its correlation with the target, largest value first.
corr_matrix = df_train_eda.select_dtypes(include=np.number).corr(method="pearson")
corr_matrix.loc[:, "median_house_value"].sort_values(ascending=False)

In [None]:
# Separate the target from the predictors of the training split.
y_train = df_train["median_house_value"].copy()
X_train = df_train.drop(columns="median_house_value")

In [None]:
# Number of missing values per column of the training split.
df_train.isnull().sum()

In [None]:
# Boolean mask marking the rows of X_train that have at least one missing value.
null_rows_idx = X_train.isna().any(axis="columns")
X_train[null_rows_idx].head()

In [None]:
# Option 1: discard the rows whose total_bedrooms value is missing.
X_train_option1 = X_train.dropna(subset=["total_bedrooms"])
# Look up the previously flagged rows among the rows that remain.
X_train_option1.loc[null_rows_idx].head()

In [None]:
# Option 2: discard the total_bedrooms column altogether.
X_train_option2 = X_train.drop(columns="total_bedrooms")
# Show the previously flagged rows, now without the total_bedrooms column.
X_train_option2.loc[null_rows_idx].head()