In [None]:
import pandas as pd
# Location of the raw housing CSV (read over HTTPS from GitHub).
url = "https://raw.githubusercontent.com/ageron/data/main/housing/housing.csv"

# Parse the CSV into the DataFrame that every later cell works from.
housing = pd.read_csv(url)

# Show the first rows as the cell output.
housing.head()

In [None]:
# Print the frame summary (column names, dtypes, non-null counts) for `housing`.
housing.info()

In [None]:
# Count how many rows fall under each distinct ocean_proximity value.
housing["ocean_proximity"].value_counts()

In [None]:
# Show descriptive statistics for the columns of `housing`.
housing.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise scatter/density grid for three selected columns, with points
# colored by their ocean_proximity value.
sns.pairplot(
    data=housing,
    vars=["housing_median_age", "median_income", "median_house_value"],
    hue="ocean_proximity",
)
# y=1.02 places the figure-level title slightly above the top of the grid.
plt.suptitle("Pairplot of Selected Features by Ocean Proximity", y=1.02)
plt.show()

In [None]:
import seaborn as sns
target = "median_house_value"

# Select the numeric feature columns here so the cell works on a fresh kernel.
# `numeric_features` is otherwise first assigned by the later preprocessing
# cell, so running the notebook top to bottom raised a NameError at this loop.
# The selection matches that cell: numeric dtypes minus the coordinates and
# the target column.
numeric_features = housing.select_dtypes(include="number").columns.drop(
    ["latitude", "longitude", target]
)

# One figure per numeric feature: row index on the x-axis, raw value on the
# y-axis.
for col in numeric_features:
    fig = plt.figure(figsize=(10, 5))
    plt.scatter(x=housing.index, y=housing[col], alpha=0.2, c='orange', edgecolor='k')
    plt.title(f"{col} Index vs Value")
    plt.xlabel("Index")
    plt.ylabel(col)
    plt.grid(True)
    plt.show()
    # Release the figure explicitly so the loop does not accumulate open
    # figures on backends that keep them alive after show().
    plt.close(fig)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# total_rooms against total_bedrooms with marginal distributions;
# kind="reg" overlays a fitted regression line on the joint axes.
sns.jointplot(
    data=housing,
    x="total_rooms",
    y="total_bedrooms",
    kind="reg",
)
# y=1.02 places the figure-level title slightly above the top marginal plot.
plt.suptitle("Joint Plot of Total Rooms and Total Bedrooms", y=1.02)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of rows per ocean_proximity value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=housing, x="ocean_proximity", ax=ax)
ax.set_title("Count of Houses by Ocean Proximity")
ax.set_xlabel("Ocean Proximity")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# population on x against households on y; alpha=0.3 keeps dense regions
# readable where many points overlap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=housing, x="population", y="households", alpha=0.3, ax=ax)
ax.set_title("Population vs Households")
ax.set_xlabel("Population")
ax.set_ylabel("Households")
ax.grid(True)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of median_income within each ocean_proximity group.
#
# A ridgeline plot via joypy is one option (requires: pip install joypy);
# it is left disabled here:
# import joypy as jp
# fig, axes = jp.joyplot(housing, by="ocean_proximity", column="median_income", figsize=(10, 6), alpha=0.5)
# plt.title("Ridgeline Plot of Median Income by Ocean Proximity")
# plt.show()

# Seaborn-only alternative: one filled KDE per ocean_proximity value,
# stacked as rows of a FacetGrid (FacetGrid.map returns the grid itself).
g = sns.FacetGrid(
    data=housing,
    row="ocean_proximity",
    height=2,
    aspect=4,
).map(sns.kdeplot, "median_income", fill=True)
# y=1.02 places the figure-level title slightly above the first row.
g.fig.suptitle("Density Plot of Median Income by Ocean Proximity", y=1.02)
plt.show()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# 1. Feature groups
#    Numeric: every numeric column except the coordinates and the target.
numeric_features = (
    housing.select_dtypes(include="number")
    .columns
    .drop(["latitude", "longitude", "median_house_value"])
)
#    Categorical: every object-dtype column.
categorical_features = housing.select_dtypes(include="object").columns

# 2 + 3. Manual integer encoding
#    Each distinct non-null value gets the position at which it first appears
#    in its column. The mapping is applied to a copy so `housing` itself is
#    left untouched; values absent from the mapping (e.g. NaN) stay missing.
value_to_int_maps = {}
housing_encoded = housing.copy()
for col in categorical_features:
    value_to_int_maps[col] = {
        val: i for i, val in enumerate(housing[col].dropna().unique())
    }
    housing_encoded[col] = housing[col].map(value_to_int_maps[col])

# 4. Per-group transformers
#    Numeric columns: fill gaps with the column median, then min-max scale.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", MinMaxScaler()),
])

#    Categorical columns: fill gaps with the most frequent code only; no
#    encoder step because the columns were already mapped to integers above.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

# 5. Route each column group through its transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# 6. Fit on the encoded frame and wrap the result in a DataFrame whose
#    columns follow the transformer order: numeric first, then categorical.
X_preprocessed = pd.DataFrame(
    preprocessor.fit_transform(housing_encoded),
    columns=[*numeric_features, *categorical_features],
)

In [None]:
# Print the frame summary (column names, dtypes, non-null counts) for the preprocessed features.
X_preprocessed.info()

In [None]:
# 7. Column names: one output column per input feature (no one-hot expansion)
all_columns = list(numeric_features) + list(categorical_features)

# 8. Final DataFrame of preprocessed features
data = pd.DataFrame(X_preprocessed, columns=all_columns)

# 9. Append the untransformed latitude, longitude AND the target column
#    (median_house_value), taken straight from `housing`.
lat_long = housing[["latitude", "longitude", "median_house_value"]].reset_index(drop=True)

# concat(axis=1) aligns on the index and silently pads with NaN when the two
# sides differ in length, so fail loudly instead of producing shifted rows.
assert len(data) == len(lat_long), "preprocessed features and housing have different row counts"
data = pd.concat([data.reset_index(drop=True), lat_long], axis=1)

# Optional: decode the categorical codes back to their labels. The inverse
# mapping has to be built first -- `int_to_value_maps` is not defined in any
# cell shown here (TODO confirm it is not defined elsewhere):
# int_to_value_maps = {c: {i: v for v, i in m.items()} for c, m in value_to_int_maps.items()}
# for col in categorical_features:
#     data[col] = data[col].round().astype(int).map(int_to_value_maps[col])

# 10. Preview (last expression, so the notebook renders it as a rich table)
data.head()


In [None]:
# Name of the column to predict
target_column = "median_house_value"

# Feature matrix: every column of `data` other than the target
X = data.drop(target_column, axis=1)

# Label vector: the target column on its own
y = data.loc[:, target_column]
