## Importing Dependencies

In [1]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy import stats
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

  if entities is not ():


## Loading our CSV

In [2]:
# Root of the project's data folder. Set the HOUSING_DATA_DIR environment variable to
# point at your own copy; the fallback is the location this notebook was written against.
data_dir = os.environ.get(
    "HOUSING_DATA_DIR",
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data",
)

# Read the preprocessed housing table and preview its first rows.
final_housing_df = pd.read_csv(os.path.join(data_dir, "02-preprocessed", "preprocessed.csv"))
final_housing_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data/02-preprocessed/preprocessed.csv'

## Train and Test Split

In [None]:
# The target is the median house value; every other column is a feature.
X = final_housing_df.drop(columns="median_house_value")
y = final_housing_df["median_house_value"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=50,
    test_size=0.2,
)

In [None]:
# Row counts of each split, in the order X_train, X_test, y_train, y_test.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

## Preprocessing to change the scale of the data

In [None]:
# Preview the first five training rows before any scaling is applied.
X_train.head(5)

In [None]:
# Columns to standardise: the first eight columns of the training frame.
# NOTE(review): this presumes the column order puts the numeric features first — confirm.
numeric_col = X_train.columns[:8]

# Fit the scaler on the training rows only, then reuse it for every split.
scaler = StandardScaler().fit(X_train[numeric_col])


def preprocessor(X):
    """Return a copy of ``X`` with the ``numeric_col`` columns standardised.

    The module-level ``scaler`` (already fitted on the training split) is applied to
    the columns named in ``numeric_col``; all other columns are passed through
    unchanged. The input frame itself is not modified.
    """
    scaled = X.copy()
    scaled[numeric_col] = scaler.transform(scaled[numeric_col])
    return scaled


X_train_pre = preprocessor(X_train)
X_test_pre = preprocessor(X_test)

In [None]:
# X_train_pre is already a DataFrame, so it can be displayed directly.
X_train_pre

In [None]:
# One histogram per column of the scaled training frame.
X_train_pre.hist()

In [None]:
# (rows, columns) of the scaled train and test frames.
tuple(frame.shape for frame in (X_train_pre, X_test_pre))

## Saving the train and Test dataframes in the 03-features data folder

In [None]:
# Root of the project's data folder. Set the HOUSING_DATA_DIR environment variable to
# point at your own copy; the fallback is the location this notebook was written against.
data_dir = os.environ.get(
    "HOUSING_DATA_DIR",
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data",
)

# Feature files live in the 03-features sub-folder; create it on first use.
folder_path = os.path.join(data_dir, "03-features")
os.makedirs(folder_path, exist_ok=True)

# Destination files for the scaled train and test feature frames.
train_file = os.path.join(folder_path, "train_preprocessed.csv")
test_file = os.path.join(folder_path, "test_preprocessed.csv")

# Write the frames without their row index.
X_train_pre.to_csv(train_file, index=False)
X_test_pre.to_csv(test_file, index=False)

print("Train and test sets saved successfully!")

## LinearRegression

In [None]:
# Baseline: ordinary least squares on the scaled training features.
lm = LinearRegression().fit(X_train_pre, y_train)

# In-sample predictions, used only to measure the training error.
y_pred_train = lm.predict(X_train_pre)

# RMSE is the square root of the mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train))
print("Train RMSE:", rmse_train)

## k-fold for Linear Regression

#### Step 1 Define RMSE Scorer

In [None]:
# RMSE is an error (lower is better), so the scorer is declared with
# greater_is_better=False; scores produced with it are therefore negated RMSE values.
rmse_scorer = make_scorer(
    score_func=root_mean_squared_error,
    greater_is_better=False,
)

#### Step 2 Create K-Fold splitter

In [None]:
# Five shuffled folds; the fixed seed makes the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

#### Step 3 Evaluate model with CV

In [None]:
# Cross-validate a fresh linear model on the scaled training data with the K-fold splitter.
lm = LinearRegression()
cv_scores = cross_val_score(lm, X_train_pre, y_train, cv=kf, scoring=rmse_scorer)

# rmse_scorer was built with greater_is_better=False, so cv_scores holds *negated*
# RMSE values. Flip the sign before reporting, as the other models' CV cells do.
lm_rmse_scores = -cv_scores

print("RMSE for each fold:", lm_rmse_scores)
print("Mean RMSE:", lm_rmse_scores.mean())
print("Std RMSE:", lm_rmse_scores.std())

## K-Nearest Neighbor

In [None]:
# k-nearest-neighbours regressor averaging over the 15 closest training rows.
knn = KNeighborsRegressor(n_neighbors=15).fit(X_train_pre, y_train)

# In-sample predictions, used only to measure the training error.
y_pred_train = knn.predict(X_train_pre)

# RMSE is the square root of the mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train))
print("Train RMSE:", rmse_train)

#### Running cross validation

In [None]:
# 10-fold cross-validation of the KNN model, scored with the custom rmse_scorer.
knn_scores = cross_val_score(
    estimator=knn,
    X=X_train_pre,
    y=y_train,
    cv=10,
    scoring=rmse_scorer,
)

# The scorer returns negated RMSE; negate again to get positive error values.
knn_rmse_scores = np.negative(knn_scores)

print("KNN Cross-Validation Mean RMSE:", knn_rmse_scores.mean())
print("KNN Cross-Validation Std:", knn_rmse_scores.std())

#### Run a GridSearchCV to automatically find the best number of neighbors

In [None]:
# Candidate settings: neighbourhood size crossed with the two weighting schemes.
param_grid = [
    {
        "n_neighbors": [2, 5, 10, 25, 35],
        "weights": ["uniform", "distance"],
    }
]

# Exhaustive 5-fold search scored with rmse_scorer; train-fold scores are kept too.
grid_search = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train_pre, y_train)

# best_score_ is a negated RMSE, so flip its sign when reporting.
print("Best Params:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

## Random Forest Regressor

In [None]:
# Random forest limited to depth-6 trees. random_state is fixed (matching the seed used
# by the forest grid search) so the training RMSE and the overfitting gap computed from
# it are reproducible across runs.
rfr = RandomForestRegressor(max_depth=6, random_state=42)
rfr.fit(X_train_pre, y_train)

# Predict on training set
y_pred_train = rfr.predict(X_train_pre)

# RMSE is the square root of the mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE:", rmse_train)

#### Running cross validation

In [None]:
# 10-fold cross-validation gives an out-of-sample error estimate for the forest.
rfr_scores = cross_val_score(
    estimator=rfr,
    X=X_train_pre,
    y=y_train,
    cv=10,
    scoring="neg_root_mean_squared_error",
)

# The built-in scorer reports negated RMSE; negate to obtain positive errors.
rfr_rmse_scores = np.negative(rfr_scores)

print("Random Forest Cross-Val Mean RMSE:", rfr_rmse_scores.mean())
print("Random Forest Cross-Val Std:", rfr_rmse_scores.std())
# Difference between the cross-validated error and the training error computed earlier.
print("Gap (Overfitting):", rfr_rmse_scores.mean() - rmse_train)

#### Run a GridSearchCV to automatically find the best max depth

In [None]:
# Depth limits to compare (None means unlimited depth); the number of trees is
# held constant at 100 so that only the depth varies.
param_grid = [
    {
        "max_depth": [5, 10, 15, 20, 30, None],
        "n_estimators": [100],
    }
]

# 5-fold search over a seeded forest; n_jobs=-1 runs the fits on all CPU cores.
forest_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
forest_grid_search.fit(X_train_pre, y_train)

# best_score_ is a negated RMSE, so flip its sign when reporting.
print("Best Max Depth:", forest_grid_search.best_params_["max_depth"])
print("Best Cross-Val RMSE:", -forest_grid_search.best_score_)

## Gradient Boosting Regressor

In [None]:
# Gradient boosting with 30 trees. random_state is fixed (matching the seed used by the
# boosting grid search) so this model and the test-set score taken from it are
# reproducible across runs.
gbr = GradientBoostingRegressor(n_estimators=30, random_state=42)
gbr.fit(X_train_pre, y_train)

# Predict on training set
y_pred_train = gbr.predict(X_train_pre)

# RMSE is the square root of the mean squared error.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Train RMSE:", rmse_train)

#### Run with cross validation

In [None]:
# 10-fold cross-validation gives an out-of-sample error estimate for the booster.
gbr_scores = cross_val_score(
    estimator=gbr,
    X=X_train_pre,
    y=y_train,
    cv=10,
    scoring=rmse_scorer,
)

# rmse_scorer reports negated RMSE; negate to obtain positive errors.
gbr_rmse_scores = np.negative(gbr_scores)

print("Gradient Boosting Cross-Val Mean RMSE:", gbr_rmse_scores.mean())
print("Gradient Boosting Cross-Val Std:", gbr_rmse_scores.std())
# Difference between the cross-validated error and the training error computed earlier.
print("Gap (Overfitting):", gbr_rmse_scores.mean() - rmse_train)

#### Grid Search

In [None]:
# Grid contrasting small learning rates with many trees against large learning rates
# with few trees. Tree depth is pinned to 3 so only those two knobs vary.
param_grid = [
    {
        "n_estimators": [30, 100, 300, 500],
        "learning_rate": [0.01, 0.1, 0.3],
        "max_depth": [3],
    }
]

# 5-fold search over a seeded booster, scored with rmse_scorer on all CPU cores.
gb_grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
)

print("Running Grid Search... (this may take a minute)")
gb_grid_search.fit(X_train_pre, y_train)

# best_score_ is a negated RMSE, so flip its sign when reporting.
print("\n--- Results ---")
print("Best Params:", gb_grid_search.best_params_)
print("Best Cross-Val RMSE:", -gb_grid_search.best_score_)

## Neural Network

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Smallest network: one hidden layer of two ReLU units and a linear output unit.
# The input width is taken from the scaled training frame instead of being hard-coded,
# and `shape=` is used to match the other networks in this notebook.
simple_nn = Sequential()
simple_nn.add(InputLayer(shape=(X_train_pre.shape[1],)))
simple_nn.add(Dense(2, activation="relu"))
simple_nn.add(Dense(1, activation="linear"))

opt = Adam(learning_rate=0.1)

# Keep only the best epoch on disk. With save_best_only=True the callback tracks
# val_loss by default, so the fit below must supply validation data.
cp = ModelCheckpoint("models/simple_nn.keras", save_best_only=True)

simple_nn.compile(optimizer=opt, loss="mse", metrics=[RootMeanSquaredError()])

# validation_split holds out 20% of the training rows, as the medium and large networks
# do. Without it no val_loss exists, the checkpoint is never written, and the later
# load_model("models/simple_nn.keras") call has no file to read.
simple_nn.fit(x=X_train_pre, y=y_train, validation_split=0.2, callbacks=[cp], epochs=100)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import load_model

# Restore the checkpointed small network from its .keras file.
simple_nn = load_model("models/simple_nn.keras")

# Predict on the scaled training rows and report RMSE as the square root of the MSE.
train_pred = simple_nn.predict(X_train_pre)
train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_pred))

print(f"Train RMSE: {train_rmse}")

In [None]:
medium_nn = Sequential()
medium_nn.add(InputLayer(shape=(13,)))  # Explicit shape is safer
medium_nn.add(Dense(32, activation="relu"))
medium_nn.add(Dense(16, activation="relu"))
medium_nn.add(Dense(1, activation="linear"))

# Note: learning_rate=.1 is very high for NNs.
# If training is unstable, try changing this to 0.01 or 0.001
opt = Adam(learning_rate=0.1)

# FIX 1: Added .keras extension
cp = ModelCheckpoint("models/medium_nn.keras", save_best_only=True)

medium_nn.compile(optimizer=opt, loss="mse", metrics=[RootMeanSquaredError()])

# FIX 2: Switched to validation_split since you don't have X_val/y_val
medium_nn.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,  # Automatically uses 20% of training data for validation
    callbacks=[cp],
    epochs=100,
)

In [None]:
# 1. FIX: Load the correct file format
medium_nn = load_model("models/medium_nn.keras")

# 2. Predict on Training data
train_predictions = medium_nn.predict(X_train)

# 3. FIX: Calculate RMSE manually (Works on all versions)
rmse_train = np.sqrt(mean_squared_error(y_train, train_predictions))

print(f"Train RMSE: {rmse_train}")

In [None]:
# Large network: four hidden ReLU layers of decreasing width, then a linear output unit.
large_nn = Sequential()
large_nn.add(InputLayer(shape=(13,)))
for units in (256, 128, 64, 32):
    large_nn.add(Dense(units, activation="relu"))
large_nn.add(Dense(1, activation="linear"))

# A small learning rate (0.001) is used for this deeper network.
opt = Adam(learning_rate=0.001)

# Keep only the best epoch (by validation loss) on disk.
cp = ModelCheckpoint("models/large_nn.keras", save_best_only=True)

large_nn.compile(
    loss="mse",
    optimizer=opt,
    metrics=[RootMeanSquaredError()],
)

# Train on the scaled features; 20% of the training rows are held out for validation.
large_nn.fit(
    x=X_train_pre,
    y=y_train,
    epochs=100,
    callbacks=[cp],
    validation_split=0.2,
)

In [None]:
# Restore the checkpointed large network from its .keras file.
large_nn = load_model("models/large_nn.keras")

# Predict on the scaled training rows and report RMSE as the square root of the MSE.
train_pred = large_nn.predict(X_train_pre)
train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_pred))

print(f"Train RMSE: {train_rmse}")

# This notebook defines no separate validation set (there is no X_val), so validation
# performance is not recomputed here. Read it from the 'val_root_mean_squared_error'
# value printed for the last epoch of the .fit() output instead.

## Gradient Boosting Test Data

In [None]:
# Score the gradient boosting model on the held-out, scaled test features.
y_pred_test = gbr.predict(X_test_pre)

# RMSE is the square root of the mean squared error.
test_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))

print("Test RMSE:", test_rmse)

## Large neural network on test data

In [None]:
# Score the large network on the held-out, scaled test features.
y_pred_test = large_nn.predict(X_test_pre)

# RMSE is the square root of the mean squared error.
test_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))

print("Test RMSE:", test_rmse)

## Generate Predictions

In [None]:
# Root of the project's data folder. Set the HOUSING_DATA_DIR environment variable to
# point at your own copy; the fallback is the location this notebook was written against.
data_dir = os.environ.get(
    "HOUSING_DATA_DIR",
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data",
)

# Predictions live in the 04-predictions sub-folder; create it on first use.
output_folder = os.path.join(data_dir, "04-predictions")
os.makedirs(output_folder, exist_ok=True)

# Restore the checkpointed large network from its .keras file.
large_nn = load_model("models/large_nn.keras")

# Predict on the scaled test features; flatten() turns the (N, 1) output into (N,).
print("Generating predictions...")
predictions = large_nn.predict(X_test_pre).flatten()

# Put actual and predicted prices side by side, plus their difference.
results_df = pd.DataFrame({"Actual_Price": y_test, "Predicted_Price": predictions})
results_df["Error"] = results_df["Actual_Price"] - results_df["Predicted_Price"]

# Write the table without its row index.
output_path = os.path.join(output_folder, "large_nn_predictions.csv")
results_df.to_csv(output_path, index=False)

print(f"Success! Predictions saved to:\n{output_path}")
print("\nFirst 5 rows of the output:")
print(results_df.head())

## Predictions to submit to kaggle

In [None]:
# Root of the project's data folder. Set the HOUSING_DATA_DIR environment variable to
# point at your own copy; the fallback is the location this notebook was written against.
data_dir = os.environ.get(
    "HOUSING_DATA_DIR",
    "/home/kobey/Documents/DATASCIENCE/PROJECTS/CALIFORNIA HOUSING PRICES/data",
)

# Predictions live in the 04-predictions sub-folder; create it on first use.
output_folder = os.path.join(data_dir, "04-predictions")
os.makedirs(output_folder, exist_ok=True)

# Restore the checkpointed large network from its .keras file.
large_nn = load_model("models/large_nn.keras")

# The network was trained on the *scaled* features (X_train_pre), so it must be fed the
# scaled test features as well; predicting on the raw X_test gives meaningless prices.
print("Generating predictions...")
predictions = large_nn.predict(X_test_pre).flatten()

# X_test_pre is a DataFrame copy of X_test, so it carries the original row index,
# which serves as the submission Id.
ids = X_test_pre.index

submission_df = pd.DataFrame({"Id": ids, "median_house_value": predictions})

# Write the submission without the DataFrame's own row index.
output_path = os.path.join(output_folder, "submission_large_nn.csv")
submission_df.to_csv(output_path, index=False)

print(f"Kaggle-ready submission saved to:\n{output_path}")
print("\nPreview:")
print(submission_df.head())