In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
## numerical_attributes -> numerical categories of X_train

In [None]:
# Location of the housing CSV. Set the HOUSING_CSV environment variable to
# point at your own copy; the original author's local path is kept as the
# fallback so the notebook keeps working unchanged on that machine.
HOUSING_CSV = os.environ.get(
    "HOUSING_CSV", r"C:\Users\Ganesh P\Desktop\JUPYTER LAB\housing.csv"
)
housing = pd.read_csv(HOUSING_CSV)
# Preview the first rows only instead of rendering the whole frame.
housing.head()

In [None]:
# Target column taken from the full (unsplit) frame.
# NOTE(review): `y` is not referenced again in the visible notebook; the
# per-split targets used for modelling are Y_train / Y_test.
y = housing["median_house_value"]

## Stratified shuffle splitting of the data for an equal distribution across the train and test splits

In [None]:
# Column index as it looks before the helper column below is added.
attributes = housing.columns

# Bucket median_income into five ordinal bands labelled 1-5; the band is the
# stratification key for the train/test split further down.
housing["income_cat"] = pd.cut(
    x=housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=list(range(1, 6)),
)

In [None]:
# Feature frame: every column of `housing` except the target.
# NOTE(review): `X` is not referenced again in the visible notebook, and it
# still carries the income_cat helper column if the cell above has been run.
X = housing.drop("median_house_value", axis=1)

In [None]:
# Stratify on income_cat so that the train and test sets share the same
# income distribution; random_state pins the shuffle for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields *positional* row indices, so rows must be selected with
# .iloc; .loc only gave the same rows while `housing` had a default
# RangeIndex. The explicit copy keeps both subsets independent of `housing`
# before they are modified in place later on.
train_set = housing.iloc[train_index].copy()
test_set = housing.iloc[test_index].copy()

In [None]:
# The stratification key has done its job: strip it from both subsets in place.
for subset in (train_set, test_set):
    subset.drop(columns=["income_cat"], inplace=True)

In [None]:
# Separate predictors from the target for each subset. The target Series are
# copied so they are independent of train_set / test_set.
X_train, Y_train = (
    train_set.drop(columns="median_house_value"),
    train_set["median_house_value"].copy(),
)
X_test, Y_test = (
    test_set.drop(columns="median_house_value"),
    test_set["median_house_value"].copy(),
)

## Exploring and Visualizing the Data

In [None]:
# Display the training feature frame.
X_train

In [None]:
# Histogram (25 bins) with a KDE overlay of the housing_median_age feature.
plt.figure(figsize=(10, 5))
sns.histplot(data=X_train["housing_median_age"], kde=True, bins=25, alpha=1)
plt.title("Distribution of median house age")

In [None]:
# One 50-bin histogram per numeric column of the training features.
X_train.hist(bins=50, figsize=(10,8))

In [None]:
# Training subset (target included) without ocean_proximity — presumably
# dropped so that .corr() in the next cell only receives numeric columns.
corr_set = train_set.drop("ocean_proximity", axis=1)

In [None]:
# Pairwise correlation between all columns kept in corr_set.
corr_matrix = corr_set.corr()

In [None]:
# Correlation of every column with the target, smallest (most negative) first.
corr_matrix["median_house_value"].sort_values(ascending=True)

In [None]:
# Geographic scatter of the training districts, coloured by house value.
# - Longitude is on x and latitude on y so the point cloud has the usual map
#   orientation (west-east horizontal, south-north vertical).
# - figsize is handed to DataFrame.plot itself: without an explicit `ax` it
#   opens its own figure, so a preceding plt.figure(...) call only left an
#   empty figure behind.
# - grid=True already enables the grid on the axes that are actually drawn.
train_set.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    c="median_house_value",
    cmap="magma",
    colorbar=True,
    alpha=0.2,
    grid=True,
    figsize=(10, 8),
)

In [None]:
# Annotated heatmap of the correlation matrix (two decimals per cell).
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr_matrix, cmap="coolwarm", annot=True, fmt=".2f", cbar=True, alpha=1)

## Imputation part

In [None]:
# Imputer that replaces missing values with the column median learned at fit time.
simpleimputer = SimpleImputer(strategy="median")

In [None]:
# Numeric-only views of both feature frames (the median imputer cannot handle
# the text column).
X_train_num, X_test_num = (
    frame.select_dtypes(include="number") for frame in (X_train, X_test)
)

In [None]:
# Learn the per-column medians from the training set only; the same fitted
# imputer is then used to transform both the train and the test set.
simpleimputer.fit(X_train_num)

In [None]:
# Fill the missing values of the numeric training columns.
X_train_imputed = simpleimputer.transform(X_train_num)

In [None]:
# Fill the missing values of the numeric test columns with the medians that
# were learned from the training set.
X_test_imputed = simpleimputer.transform(X_test_num)

In [None]:
# Re-attach the original row index and column labels to the imputed test values.
X_test_imputed_df = pd.DataFrame(
    data=X_test_imputed, index=X_test_num.index, columns=X_test_num.columns
)

In [None]:
# Write the imputed numeric columns back into X_test; the non-numeric column
# is left untouched.
X_test.loc[:,X_test_imputed_df.columns]= X_test_imputed_df

In [None]:
# Check dtypes and non-null counts of the test features after imputation.
X_test.info()

In [None]:
# Sanity check: (rows, numeric columns) of the imputed training data.
X_train_imputed.shape

In [None]:
# Re-attach the original row index and column labels to the imputed training values.
X_train_imputed_df = pd.DataFrame(
    data=X_train_imputed, index=X_train_num.index, columns=X_train_num.columns
)

In [None]:
# Write the imputed numeric columns back into X_train; the non-numeric column
# is left untouched.
X_train.loc[:,X_train_imputed_df.columns] = X_train_imputed_df

## Feature Engineering

#### Creating columns for the distance to the nearest coastal point and to the nearest major city

In [None]:
# Hand-picked (latitude, longitude) reference points used by the distance
# features computed below.
# NOTE(review): the coastline is represented by only five points — confirm
# that this resolution is adequate for the intended feature.
coastal_points = [
    (42.0 , -124.4),  # Near Oregon border
    (38.0, -123.0),   # Northern California
    (36.6, -121.9),   # Central California
    (34.0, -118.5),   # Los Angeles area
    (32.5, -117.0)    # San Diego
]
# Reference points for major cities, again as (latitude, longitude).
# NOTE(review): the city names below are inferred from the coordinates — confirm.
city_points = [
    (34.0522, -118.2437),  # presumably Los Angeles
    (37.7749, -122.4194),  # presumably San Francisco
    (32.7157, -117.1611),  # presumably San Diego
    (38.5816, -121.4944),  # presumably Sacramento
    (37.3382, -121.8863)   # presumably San Jose
]

def distance_to_coast(lat, lon, coastal_points):
    """Return the distance from (lat, lon) to the nearest coastal reference point.

    The distance is the plain Euclidean distance in coordinate space, i.e. it
    is expressed in the same units as ``lat``/``lon`` and is not a geodesic
    distance.

    Parameters
    ----------
    lat, lon : float or array-like
        Coordinate(s) to measure from. Scalars give a scalar result; arrays
        (e.g. whole DataFrame columns) are handled in one vectorised call and
        give one distance per element.
    coastal_points : sequence of (lat, lon) pairs
        Reference points to measure against.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Minimum distance over ``coastal_points`` for each input coordinate.

    Raises
    ------
    ValueError
        If ``coastal_points`` is empty.
    """
    points = np.asarray(coastal_points, dtype=float).reshape(-1, 2)
    if points.shape[0] == 0:
        raise ValueError("coastal_points must contain at least one (lat, lon) pair")
    lat = np.asarray(lat, dtype=float)
    lon = np.asarray(lon, dtype=float)
    # A trailing axis is added so every input coordinate is broadcast against
    # every reference point; the minimum is then taken over that axis.
    d_lat = lat[..., np.newaxis] - points[:, 0]
    d_lon = lon[..., np.newaxis] - points[:, 1]
    return np.sqrt(d_lat**2 + d_lon**2).min(axis=-1)
def distance_to_majorcities(lat, lon, city_points):
    """Return the distance from (lat, lon) to the nearest major-city reference point.

    The distance is the plain Euclidean distance in coordinate space, i.e. it
    is expressed in the same units as ``lat``/``lon`` and is not a geodesic
    distance.

    Parameters
    ----------
    lat, lon : float or array-like
        Coordinate(s) to measure from. Scalars give a scalar result; arrays
        (e.g. whole DataFrame columns) are handled in one vectorised call and
        give one distance per element.
    city_points : sequence of (lat, lon) pairs
        Reference points to measure against.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Minimum distance over ``city_points`` for each input coordinate.

    Raises
    ------
    ValueError
        If ``city_points`` is empty.
    """
    points = np.asarray(city_points, dtype=float).reshape(-1, 2)
    if points.shape[0] == 0:
        raise ValueError("city_points must contain at least one (lat, lon) pair")
    lat = np.asarray(lat, dtype=float)
    lon = np.asarray(lon, dtype=float)
    # A trailing axis is added so every input coordinate is broadcast against
    # every reference point; the minimum is then taken over that axis.
    d_lat = lat[..., np.newaxis] - points[:, 0]
    d_lon = lon[..., np.newaxis] - points[:, 1]
    return np.sqrt(d_lat**2 + d_lon**2).min(axis=-1)

# Add both engineered distance columns to the train and the test features,
# evaluating the helpers row by row.
for features in (X_train, X_test):
    features["dist_to_coast"] = features.apply(
        lambda row: distance_to_coast(row["latitude"], row["longitude"], coastal_points),
        axis=1,
    )
    features["dist_to_city"] = features.apply(
        lambda row: distance_to_majorcities(row["latitude"], row["longitude"], city_points),
        axis=1,
    )

### Log-transforming the features that are right-skewed

### We are going with the Yeo-Johnson transformation, since a log transformation can't handle negative values

In [None]:
# Names of all feature columns except ocean_proximity; these are the columns
# treated as numeric in the skewness check below.
numerical_cat = X_train.drop("ocean_proximity", axis=1).columns

In [None]:
# Per-column skewness of the training features; columns whose skewness
# exceeds 0.5 are selected for the power transform.
skewness = X_train[numerical_cat].skew()
right_skewed = [column for column, value in skewness.items() if value > 0.5]

In [None]:
# Yeo-Johnson power transform, fitted on the training set only.
# standardize=False: the transformer does not rescale its output; scaling is
# done separately by the StandardScaler step later in the notebook.
pt = PowerTransformer(method="yeo-johnson", standardize=False)
pt.fit(X_train[right_skewed])

In [None]:
# Apply the fitted transform to both sets, keeping each frame's row index so
# the results can be assigned back by label.
X_train_pt, X_test_pt = (
    pd.DataFrame(pt.transform(frame[right_skewed]), index=frame.index, columns=right_skewed)
    for frame in (X_train, X_test)
)

In [None]:
# Overwrite the skewed columns with their power-transformed versions.
X_train[right_skewed] = X_train_pt
X_test[right_skewed] = X_test_pt

## One-Hot Encoding for Categorical Variables

In [None]:
# ocean_proximity from the full frame (Series) and, as single-column frames,
# from the train and test features — the encoder is fed the 2-D versions.
housing_cat = housing.loc[:, "ocean_proximity"]
housing_cat_X_train = X_train.loc[:, ["ocean_proximity"]]
housing_cat_X_test = X_test.loc[:, ["ocean_proximity"]]

In [None]:
# Category values present in the training set, ordered by how often they occur.
housing_cat_X_train.value_counts().index

In [None]:
# Fit the encoder on the training categories only, then encode both sets.
# handle_unknown="ignore" makes categories unseen during fit encode as all zeros
# instead of raising.
onehotencoder = OneHotEncoder(handle_unknown="ignore").fit(housing_cat_X_train)
housing_cat_X_train, housing_cat_X_test = (
    onehotencoder.transform(block)
    for block in (housing_cat_X_train, housing_cat_X_test)
)

In [None]:
# Convert the encoder output to a dense array so it can be wrapped in a
# DataFrame, then check its (rows, categories) shape.
housing_cat_X_train = housing_cat_X_train.toarray()
housing_cat_X_train.shape

In [None]:
# Same dense conversion for the encoded test categories, followed by a preview.
housing_cat_X_test = housing_cat_X_test.toarray()
housing_cat_X_test

In [None]:
# Name(s) of the input column(s) the encoder was fitted on.
onehotencoder.feature_names_in_

In [None]:
# Names of the one-hot output columns, built from the "ocean_proximity" prefix.
feature_names = onehotencoder.get_feature_names_out(input_features=["ocean_proximity"])

In [None]:
# Wrap the dense one-hot arrays in DataFrames that carry the encoder's column
# names and the row index of the matching feature frame.
housing_cat_X_train_df, housing_cat_X_test_df = (
    pd.DataFrame(data=dense, index=frame.index, columns=feature_names)
    for dense, frame in ((housing_cat_X_train, X_train), (housing_cat_X_test, X_test))
)

In [None]:
# Append the one-hot columns to the training features (joined on the row index).
X_train = X_train.join(housing_cat_X_train_df)

In [None]:
# Append the one-hot columns to the test features (joined on the row index).
X_test = X_test.join(housing_cat_X_test_df)

In [None]:
# The raw categorical column is replaced by its one-hot columns; remove it in place.
X_test.drop(columns=["ocean_proximity"], inplace=True)

In [None]:
# The raw categorical column is replaced by its one-hot columns; remove it in place.
X_train.drop(columns=["ocean_proximity"], inplace=True)

## Scaling the data

In [None]:
# Scaler with default settings; it is fitted on the training features below.
standardscaler = StandardScaler()

In [None]:
# One-hot column names; these columns are excluded from scaling below.
feature_names

In [None]:
# Training features without the one-hot columns, i.e. only the numeric
# columns that should be scaled.
X_train_ss = X_train.drop(list(feature_names),axis=1)

In [None]:
# Test features without the one-hot columns (same column selection as for training).
X_test_ss = X_test.drop(list(feature_names),axis=1)

In [None]:
# Learn the scaling statistics from the training set only.
standardscaler.fit(X_train_ss)

In [None]:
# Scale both sets with the statistics learned from the training set.
X_test_ss_tf = standardscaler.transform(X_test_ss)
X_train_ss_tf = standardscaler.transform(X_train_ss)

In [None]:
# Re-wrap the scaled test values with the original column labels and row index.
X_test_ss = pd.DataFrame(X_test_ss_tf, columns = X_test_ss.columns, index=X_test_ss.index)

In [None]:
# Re-wrap the scaled training values with the original column labels and row index.
X_train_ss = pd.DataFrame(X_train_ss_tf, columns = X_train_ss.columns, index=X_train_ss.index)

In [None]:
# Overwrite the numeric training columns with their scaled versions; the
# one-hot columns stay as they are.
X_train[X_train_ss.columns] = X_train_ss[X_train_ss.columns]

In [None]:
# Overwrite the numeric test columns with their scaled versions; the one-hot
# columns stay as they are.
X_test[X_test_ss.columns] = X_test_ss[X_test_ss.columns]

In [None]:
# Display the fully prepared training features.
X_train

In [None]:
# Histograms of every training column after transformation and scaling.
X_train.hist(bins=50, figsize=(10,8))

In [None]:
# Distribution of the engineered dist_to_coast feature, with a KDE overlay.
sns.histplot(X_train["dist_to_coast"],kde=True)

In [None]:
# Distribution of the engineered dist_to_city feature, with a KDE overlay
# (passed as a one-column DataFrame rather than a Series).
sns.histplot(X_train[["dist_to_city"]], kde=True)

In [None]:
# Distribution of housing_median_age in the prepared training features (50 bins).
sns.histplot(X_train["housing_median_age"], kde=True, bins=50)

## Transforming target variable

In [None]:
# Distribution of the untransformed training target.
sns.histplot(Y_train, bins=50, kde=True)

In [None]:
# log1p-transform both targets; np.expm1 is the matching inverse used when
# predictions are mapped back later.
Y_train_log, Y_test_log = (np.log1p(target) for target in (Y_train, Y_test))

In [None]:
# Distribution of the target *after* the log1p transform. Plotting Y_train
# here would only repeat the pre-transform histogram shown earlier.
sns.histplot(Y_train_log, bins=50, kde=True)

## Performing Linear Regression

In [None]:
# Linear regression fitted on the prepared features against the
# log1p-transformed target.
lin_reg = LinearRegression()
lin_reg.fit(X_train,Y_train_log)

In [None]:
# The model predicts in log space; expm1 undoes the log1p applied to the target.
y_pred_log = lin_reg.predict(X_test)
y_pred = np.expm1(y_pred_log)

## Evaluate the performance both in log space (R2) and as RMSE/MAE in dollars

In [None]:
# Errors on the original scale: the stored log target and the predictions are
# both mapped back with expm1 before scoring.
rmse = np.sqrt(mean_squared_error(y_true=np.expm1(Y_test_log), y_pred=y_pred))
mae = mean_absolute_error(y_true=np.expm1(Y_test_log), y_pred=y_pred)
print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}")

# Goodness of fit in log space, the space the model was trained in.
r2 = r2_score(y_true=Y_test_log, y_pred=y_pred_log)
print(f"R2 (log-space) : {r2:.3f}")

## There is a huge error. Instead of Linear Regression, use a Decision Tree Regressor

In [None]:
# Regularised decision tree trained on the log1p target; random_state makes
# the fit repeatable.
tree_reg = DecisionTreeRegressor(
    random_state=42,
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=20,
).fit(X_train, Y_train_log)

# Predict in log space, then map back with expm1 for the original-scale errors.
y_pred_log = tree_reg.predict(X_test)
y_pred = np.expm1(y_pred_log)

rmse = np.sqrt(mean_squared_error(y_true=np.expm1(Y_test_log), y_pred=y_pred))
mae = mean_absolute_error(y_true=np.expm1(Y_test_log), y_pred=y_pred)
print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}")

# Goodness of fit in log space.
r2 = r2_score(y_true=Y_test_log, y_pred=y_pred_log)
print(f"R2 (log-space) : {r2:.3f}")

In [None]:
# 5-fold cross-validated RMSE of the tree, measured on the original target scale.
# The feature matrix is passed as prepared: only the *target* was ever
# log1p-transformed, so applying np.expm1 to X_train distorted every
# (standardised / one-hot) feature instead of undoing anything.
# NOTE: unlike the hold-out evaluation above, the estimator is fitted on the
# untransformed target inside each fold.
tree_rmses = -cross_val_score(
    tree_reg,
    X_train,
    Y_train,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
pd.Series(tree_rmses).describe()

## Fine-tuning the model

In [None]:
# Exhaustive search space for the random forest (2 * 3 * 2 * 2 * 2 = 48 candidates).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=["sqrt", "log2"],
)
# Metrics recorded for every candidate; the keys become the suffixes of the
# mean_test_* entries in cv_results_.
scoring = dict(
    R2="r2",
    MAE="neg_mean_absolute_error",
    MSE="neg_mean_squared_error",
)

In [None]:
# Grid search over the random forest, refitting the candidate with the best R2.
# random_state is fixed on the forest so that repeated runs of the notebook
# select the same parameters and report the same scores.
grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    refit="R2",
    scoring=scoring,
    n_jobs=-1,
)
# Fit on the original-scale target directly; this avoids the
# expm1(log1p(y)) round trip and the floating-point noise it introduces.
grid.fit(X_train, Y_train)

print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Per-candidate cross-validation results of the grid search.
results = grid.cv_results_

In [None]:
# The scorers are negated errors, so flip the sign (and take the root of the
# mean MSE) to get positive error values per candidate.
mean_test_RMSE, mean_test_MAE = (
    np.sqrt(-results["mean_test_MSE"]),
    -results["mean_test_MAE"],
)

# Index of the candidate with the lowest error under each metric.
best_rmse_idx = np.argmin(mean_test_RMSE)
best_mae_idx = np.argmin(mean_test_MAE)

print("Best RMSE: " , mean_test_RMSE[best_rmse_idx])
print("parameters for best RMSE:", results["params"][best_rmse_idx])
print("Best MAE: ", mean_test_MAE[best_mae_idx])
print("parameters for best MAE:", results["params"][best_mae_idx])

## Let's use Randomized Search CV to see if it gives better parameters

In [None]:
# Candidate values sampled by the randomized search.
param_dist = dict(
    n_estimators=np.arange(50, 500, 50),
    max_depth=[None, *np.arange(5, 30, 5)],
    min_samples_split=np.arange(2, 11),
    min_samples_leaf=np.arange(1, 11),
    max_features=[None, "sqrt", "log2"],
)
# Same metric set as for the grid search; the keys become the suffixes of the
# mean_test_* entries in cv_results_.
scoring = dict(
    R2="r2",
    MAE="neg_mean_absolute_error",
    MSE="neg_mean_squared_error",
)

In [None]:
# Randomized search: 50 sampled candidates, refitting the one with the best R2.
# random_state is fixed on both the forest and the search so the sampled
# candidates — and therefore the reported best parameters — are reproducible.
random = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_dist,
    n_iter=50,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
    refit="R2",
    random_state=42,
)
# Fit on the original-scale target directly; this avoids the
# expm1(log1p(y)) round trip and the floating-point noise it introduces.
random.fit(X_train, Y_train)
print(f"Best Paraeters: {random.best_params_}")
print(f"Best Score : {random.best_score_}")

In [None]:
# Per-candidate cross-validation results of the randomized search
# (replaces the grid-search results held in `results`).
results = random.cv_results_

In [None]:
# The scorers are negated errors, so flip the sign (and take the root of the
# mean MSE) to get positive error values per candidate.
mean_test_RMSE = np.sqrt(-results["mean_test_MSE"])
mean_test_MAE = -results["mean_test_MAE"]

# Single quotes are used inside the f-strings: reusing the enclosing double
# quote is a SyntaxError on Python versions before 3.12.
RMSE_index = np.argmin(mean_test_RMSE)
print(f"Best RMSE : {mean_test_RMSE[RMSE_index]}")
print(f"Best Parameters for Best RMSE : {results['params'][RMSE_index]}")
MAE_index = np.argmin(mean_test_MAE)
print(f"Best MAE : {mean_test_MAE[MAE_index]}")
print(f"Best Parameters for Best MAE : {results['params'][MAE_index]}")

## Grid Search Cross-Validation gave better results than Randomized Search Cross-Validation

### But with just a 1% difference, there is no strong reason to prefer one over the other.

In [None]:
# Hold-out RMSE of the refitted grid-search winner; the search object
# delegates predict() to its best estimator.
y_pred = grid.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_true=np.expm1(Y_test_log), y_pred=y_pred))
print("Test RMSE:", rmse_test)

In [None]:
# Hold-out RMSE of the refitted randomized-search winner; the search object
# delegates predict() to its best estimator.
y_pred_rand = random.predict(X_test)
rmse_test_rand = np.sqrt(
    mean_squared_error(y_true=np.expm1(Y_test_log), y_pred=y_pred_rand)
)
print("Test RMSE:", rmse_test_rand)

## Final Visualization

### Predicted vs Actual Values

In [None]:
# Actual vs predicted values on the test set; the dashed red diagonal marks
# perfect predictions.
plt.figure(figsize=(8, 6))
plt.scatter(x=Y_test, y=y_pred, alpha=0.6)
plt.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'r--')
plt.gca().set(
    xlabel="Actual Values",
    ylabel="Predicted Values",
    title="Actual vs Predicted Values",
)
plt.show()

## Residual Plot

In [None]:
# Residual = actual - predicted. Plotted against the prediction, the points
# should scatter evenly around the zero line if the errors are unbiased.
residuals = Y_test - y_pred
plt.scatter(y_pred, residuals, alpha=0.6)
plt.axhline(y=0, color="r", linestyle="--")
# The labels describe what is actually drawn: predictions on x, residuals on y
# (previously they were copied from the actual-vs-predicted plot).
plt.xlabel("Predicted Values")
plt.ylabel("Residuals (Actual - Predicted)")
plt.title("Residuals vs Predicted Values")
plt.show()

In [None]:
# Histogram of the residuals (30 bins).
plt.figure(figsize=(8, 6))
plt.hist(x=residuals, bins=30, alpha=0.7, edgecolor='k')
plt.gca().set(xlabel="Residual", ylabel="Frequency", title="Residual Distribution")
plt.show()