In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import pandas as pd
import numpy as np
import re
import pickle
#from data_processing import DataProcessor

In [26]:
import sklearn
print("Scikit-learn version:", sklearn.__version__)

Scikit-learn version: 1.2.2


In [2]:
class DataProcessor:
    def __init__(self, load_encoder=False):
        self.load_encoder = load_encoder
        self.scaler = None
        self.encoder = None
        self.categorical_features = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']
        self.numerical_features = ['BHK', 'Size', 'Bathroom','CurrentFloor', 'TotalFloors']
        if load_encoder:
            self.load_encoders()
            
    def load_encoders(self):
        try:
            with open('../models/standard_scaler.pkl', 'rb') as f:
                self.scaler = pickle.load(f)
            with open('../models/onehot_encoder.pkl', 'rb') as f:
                self.encoder = pickle.load(f)
            print("Encoders loaded successfully.")
        except FileNotFoundError:
            print("Encoder files not found. Please fit and save them first.")
            
    def categorical_encoding(self, data):
        
        if self.encoder is None or self.load_encoder==False: 
            self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            self.encoder.fit(data[self.categorical_features])
            with open('../models/onehot_encoder.pkl', 'wb') as f:
                pickle.dump(self.encoder, f)
        # Print number of categories
        encoded_data = self.encoder.transform(data[self.categorical_features])
        return encoded_data

    def numerical_scaling(self, data):
        
        if self.scaler is None or self.load_encoder==False:
            self.scaler = StandardScaler()
            self.scaler.fit(data[self.numerical_features])
            with open('../models/standard_scaler.pkl', 'wb') as f:
                pickle.dump(self.scaler, f)
        
        scaled_data = self.scaler.transform(data[self.numerical_features])
        
        return scaled_data

    def remove_outliers_iqr(self, data, column, threshold=1.5):
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
    
    def extract_floor_info(self,df):
        """
        Extracts current floor and total floors from the 'Floor' column,
        replaces non-numeric labels, converts to int, and drops the original 'Floor' column.
        """
        pattern = r'(?P<CurrentFloor>\w+)\s*out\s*of\s*(?P<TotalFloors>\d+)'
        df[['CurrentFloor', 'TotalFloors']] = df['Floor'].str.extract(pattern)
        floor_replacements = {
            'Ground': 0,
            'Basement': -1
        }

        # Replace na with the value of 'Floor' column
        df['CurrentFloor'] = df['CurrentFloor'].fillna(df['Floor'])
        df['TotalFloors'] = df['TotalFloors'].fillna(df['Floor'])

        df['CurrentFloor'] = df['CurrentFloor'].replace(floor_replacements)
        df['TotalFloors'] = df['TotalFloors'].replace(floor_replacements)

        df['CurrentFloor'] = pd.to_numeric(df['CurrentFloor'], errors='coerce').astype('Int64')
        df['TotalFloors'] = pd.to_numeric(df['TotalFloors'], errors='coerce').astype('Int64')

        df = df.drop(columns=['Floor'])
        return df
    
    def data_process_train(self, data_path):
        """Load and preprocess data"""
        # Load data
        data = pd.read_csv(data_path)
        data = self.remove_outliers_iqr(data, 'Rent')
        data = self.remove_outliers_iqr(data, 'Size')
        
        # Split data into features and target
        X = data.drop('Rent', axis=1)
        y = data['Rent']
        y = np.array(y)
        
        # Drop unnecessary columns
        unused_columns = ['Posted On','Area Locality']
        X.drop(columns=unused_columns, inplace=True, errors='ignore')
        
        # Handle floor data to 'Floor' and 'ofFloor'
        X = self.extract_floor_info(X)
        
        # Split categorical and numerical features
        categorical_data = self.categorical_encoding(X[self.categorical_features])
        numerical_data = self.numerical_scaling(X[self.numerical_features])
        
        X = np.concatenate((categorical_data, numerical_data), axis=1)           
        
        
        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        return X_train, X_test, y_train, y_test
    
    def clean_input(self, input_data):
        # Convert to DataFrame if it's a dictionary
        if isinstance(input_data, dict):
            input_df = pd.DataFrame([input_data])
        else:
            input_df = input_data
        
        # Encode categorical features
        input_encoded = self.categorical_encoding(input_df[self.categorical_features])
        
        # Scale numerical features
        input_scaled = self.numerical_scaling(input_df[self.numerical_features])
        
        X = np.concatenate((input_encoded, input_scaled), axis=1)

        return X

In [3]:
preprocessor = DataProcessor()
# Load data
X_train, X_test, y_train, y_test = preprocessor.data_process_train('../data/House_Rent_Dataset.csv')


In [4]:
def train_model(model, X_train, y_train, X_test, y_test):
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Evaluate the model
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    
    print(f"Train MSE: {mse_train:.2f}, R2: {r2_train:.2f}, MAE: {mae_train:.2f}")
    print(f"Test MSE: {mse_test:.2f}, R2: {r2_test:.2f}, MAE: {mae_test:.2f}")
    
    # Save the model
    model_path = f'../models/{model.__class__.__name__}.pkl'
    joblib.dump(model, model_path)
    
    return model,{ 
        'mse_train': mse_train,
        'mse_test': mse_test,
        'r2_train': r2_train,
        'r2_test': r2_test,
        'mae_train': mae_train,
        'mae_test': mae_test
    }

# Hyperparameter

In [6]:
import mlflow

In [11]:
mlflow.start_run()

<ActiveRun: >

In [7]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'file:///d:/year%204%20work/MachineLearningOPS/CPE393-group_name/app/mlruns'


In [12]:
mlflow.set_experiment(experiment_id="0")

mlflow.autolog()

2025/05/28 16:36:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


## Random Forest

In [13]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [14]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring = 'neg_mean_absolute_error')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  warn(
2025/05/28 16:39:48 INFO mlflow.sklearn.utils: Logging the 5 best runs, 95 runs will be omitted.


In [15]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [17]:
mlflow.end_run()

## Ridge

In [31]:
mlflow.start_run()

<ActiveRun: >

In [34]:
# Alpha values
alpha_val = [0.001,0.01,0.1,1.0]
# Create the random grid
random_grid = {'alpha': alpha_val}
print(random_grid)

{'alpha': [0.001, 0.01, 0.1, 1.0]}


In [35]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
ridge = Ridge()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
ridge_random = RandomizedSearchCV(estimator = ridge, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, scoring = 'neg_mean_absolute_error')
# Fit the random search model
ridge_random.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=1.0; total time=   0.0s
[CV] END ..........................................alpha=1.0; total time=   0.0s
[CV] END ........................................

2025/05/28 17:20:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


In [38]:
mlflow.end_run()

# Neural Network

In [21]:
import tensorflow as tf

from tensorflow import keras
from keras import models
from keras import layers

2025/05/28 16:59:55 INFO mlflow.bedrock: Enabled auto-tracing for Bedrock. Note that MLflow can only trace boto3 service clients that are created after this call. If you have already created one, please recreate the client by calling `boto3.client`.
2025/05/28 16:59:55 INFO mlflow.tracking.fluent: Autologging successfully enabled for boto3.
2025/05/28 16:59:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2025/05/28 16:59:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.


In [50]:
import itertools

dnn_param_table = {
    'layer1_node': [64,128,256],
    'layer2_node': [64,128,256],
    'learning_rate': [0.0001,0.001,0.01]
}

param_combos = list(itertools.product(dnn_param_table['layer1_node'], dnn_param_table['layer2_node'], dnn_param_table['learning_rate']))

In [78]:
mlflow.start_run()

In [None]:
runcount = 1
for each in param_combos:
    
    mlflow.start_run(run_name='NeuralNetwork'+str(runcount), nested=True)

    dnn_model = models.Sequential()
    dnn_model.add(layers.Dense(each[0],input_shape=(23, )))
    dnn_model.add(layers.Dense(each[1]))
    dnn_model.add(layers.Dense(1))

    dnn_model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(learning_rate=each[2]))

    #dnn_model.summary()

    callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights = True)

    history = dnn_model.fit(X_train,y_train,epochs=300,callbacks=[callback], validation_split = 0.2)

    dnn_model, dnn_metrics = train_model(dnn_model, X_train, y_train, X_test, y_test)
    mlflow.log_param("layer1nodes", each[0])
    mlflow.log_param("layer2nodes", each[1])
    
    mlflow.end_run()
    runcount += 1


In [None]:
mlflow.end_run()

# Test

In [18]:
mlflow.autolog()

2025/05/27 23:20:49 INFO mlflow.bedrock: Enabled auto-tracing for Bedrock. Note that MLflow can only trace boto3 service clients that are created after this call. If you have already created one, please recreate the client by calling `boto3.client`.
2025/05/27 23:20:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for boto3.
2025/05/27 23:20:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2025/05/27 23:20:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/05/27 23:20:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.


In [54]:
mlflow.start_run()

<ActiveRun: >

In [55]:
mlflow.start_run(run_name='RandomForest', nested=True)
rf_model = mlflow.sklearn.load_model(f"models:/Random Forest/1")
_, rf_metrics = train_model(rf_model, X_train, y_train, X_test, y_test)
mlflow.end_run()


  warn(


Train MSE: 19469060.27, R2: 0.89, MAE: 3247.56
Test MSE: 41514612.62, R2: 0.77, MAE: 4452.25


In [56]:
mlflow.start_run(run_name='Ridge', nested=True)
rid_model = mlflow.sklearn.load_model(f"models:/Ridge/1")
_, rid_metrics = train_model(rf_model, X_train, y_train, X_test, y_test)
mlflow.end_run()


Train MSE: 54334118.84, R2: 0.69, MAE: 5256.04
Test MSE: 51513004.86, R2: 0.71, MAE: 5244.22


In [57]:
mlflow.start_run(run_name='NeuralNetwork', nested=True)
dnn_model = mlflow.tensorflow.load_model(f"models:/Neural Network/1")
_, dnn_metrics = train_model(dnn_model, X_train, y_train, X_test, y_test)
mlflow.end_run()










INFO:tensorflow:Assets written to: C:\Users\VICTUS\AppData\Local\Temp\tmpvxpda3is\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\VICTUS\AppData\Local\Temp\tmpvxpda3is\model\data\model\assets


Train MSE: 57514526.23, R2: 0.67, MAE: 5102.12
Test MSE: 53446249.89, R2: 0.70, MAE: 5070.60


In [53]:
mlflow.end_run()

In [59]:
with open('../models/RandomForestRegressor.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

with open('../models/Ridge.pkl', 'wb') as f:
    pickle.dump(rid_model, f)

with open('../models/NeuralNetwork.pkl', 'wb') as f:
    pickle.dump(dnn_model, f)