In [2]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import uniform, randint
from sklearn.linear_model import LinearRegression
# enable_halving_search_cv must be imported before HalvingGridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error, mean_squared_error


In [3]:
import mlflow
from mlflow.models import infer_signature

# Address of the MLflow tracking server that runs are logged to (loopback, port 8080)
MLFLOW_TRACKING_URI = "http://127.0.0.1:8080"
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

#### Read in Dataset

In [4]:
# Location of the feature-engineered CSV. The default is the original author's
# local path; set the APARTMENT_RENT_DATA_CSV environment variable to point at
# your own copy instead of editing this cell.
DEFAULT_DATA_CSV = 'C:/Users/sega9/Documents/GA Tech/2024.Spring/CSE 6242/Group Project/ApartmentRent/DataModified/feature_engineered_data.csv'
data_csv_path = os.environ.get('APARTMENT_RENT_DATA_CSV', DEFAULT_DATA_CSV)

# Load the dataset into the working DataFrame used by the cells below
df = pd.read_csv(data_csv_path, encoding='utf-8')

  df = pd.read_csv('C:/Users/sega9/Documents/GA Tech/2024.Spring/CSE 6242/Group Project/ApartmentRent/DataModified/feature_engineered_data.csv',


In [5]:
# Columns that are excluded from the modeling frame
columns_to_drop = [
    'title',
    'body',
    'address',
    'latitude',
    'longitude',
    'geometry',
]
# Remove them along the column axis; the result replaces df
df = df.drop(columns_to_drop, axis='columns')

#### Encode Data for Modeling

In [6]:
# Variables handled as categorical (factor) columns rather than numeric ones
cols = [
    "bathrooms",
    "bedrooms",
    "fee",
    "has_photo",
    "cityname",
    "state",
    "source",
]
# Convert every listed column to pandas' 'category' dtype in one astype call
df = df.astype(dict.fromkeys(cols, 'category'))

In [7]:
# One-hot encode the categorical columns listed in `cols`. drop_first=True omits
# the first level of each variable, so k levels become k-1 indicator columns.
data_encoded = pd.get_dummies(df, columns=cols, drop_first=True)

# Display the dimensions, column names, and structure of the DataFrame after encoding
print(data_encoded.shape)
print(data_encoded.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
data_encoded.info()

(87063, 475)
Index(['id', 'price', 'square_feet', 'zipcode', 'POPULATION', 'POP_SQMI',
       'SQMI', 'perc_sub25k', 'perc_25-50k', 'perc_50-75k',
       ...
       'source_RealRentals', 'source_RentDigs.com', 'source_RentFeeder',
       'source_RentLingo', 'source_Seattle Rentals', 'source_SpreadMyAd',
       'source_Z57', 'source_rentbits', 'source_tenantcloud', 'source_vFlyer'],
      dtype='object', length=475)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87063 entries, 0 to 87062
Columns: 475 entries, id to source_vFlyer
dtypes: float64(11), int64(32), uint8(432)
memory usage: 64.4 MB
None


In [8]:
# 'price' is the dependent variable; every other encoded column is a predictor.
# NOTE(review): the printed column index of data_encoded starts with 'id', and
# only 'price' is removed here, so 'id' is used as a feature -- confirm that is intended.
y = data_encoded['price']
X = data_encoded.drop(columns='price')

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

##### XGBoost Hyperparameter Tuning (Halving Grid Search)

Best Parameters (from an earlier run whose grid differed from the one below — it included `n_estimators=600`): {'alpha': 1, 'eta': 0.1, 'gamma': 0, 'lambda': 1, 'max_depth': 10, 'min_child_weight': 0.5, 'n_estimators': 600, 'subsample': 0.75}

In [9]:
# Search space for hyperparameter tuning. A single-element list pins that
# hyperparameter; the full grid has 1*2*3*2*3*1*2*2 = 144 combinations.
param_grid = {
    # 'learning_rate': step size shrinkage used in update to prevent overfitting
    'eta': [0.1],
    # Number of boosting rounds (trees) to be run
    'n_estimators': [250, 500],
    # Maximum depth of a tree
    'max_depth': [3, 5, 10],
    # Minimum loss reduction required to make a further partition on a leaf node
    'gamma': [0, 1],
    # Minimum sum of instance weight (hessian) needed in a child
    'min_child_weight': [0.5, 1, 2],
    # Subsample ratio of the training instances
    'subsample': [0.75],
    # L2 regularization term on weights
    'lambda': [0, 1],
    # L1 regularization term on weights
    'alpha': [0, 1],
}

In [10]:
# Base estimator for the search (CPU training). For GPU training, additionally
# pass tree_method='hist' and device='cuda' to XGBRegressor here and for
# final_model below.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, verbosity=2)

# Successive-halving grid search over param_grid, scored by R^2 with 5-fold CV.
# random_state is fixed so that the row subsampling the halving search performs
# at each iteration -- and therefore the selected best_params -- is reproducible
# across reruns, matching the seeded estimator.
grid_search = HalvingGridSearchCV(
    xgb_model,
    param_grid,
    cv=5,
    scoring='r2',
    verbose=2,
    random_state=42,
)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Train a fresh model with the best parameters on the full training set
final_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params, random_state=42)
final_model.fit(X_train, y_train)

# Save the final model in JSON format
final_model.save_model('xgboost_final_model2.json')

# Load the saved model back from JSON so that evaluation uses the persisted artifact
loaded_model = xgb.XGBRegressor()
loaded_model.load_model('xgboost_final_model2.json')

# Predict on the test set using the loaded model
y_pred_loaded = loaded_model.predict(X_test)

# Calculate R-squared using the loaded model
r_squared_loaded = r2_score(y_test, y_pred_loaded)
print("Best Parameters:", best_params)
print("R-squared (Loaded Model):", r_squared_loaded)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 859
max_resources_: 69650
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 144
n_resources: 859
Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END alpha=0, eta=0.1, gamma=0, lambda=0, max_depth=3, min_child_weight=0.5, n_estimators=250, subsample=0.75; total time=   0.6s
[CV] END alpha=0, eta=0.1, gamma=0, lambda=0, max_depth=3, min_child_weight=0.5, n_estimators=250, subsample=0.75; total time=   0.5s
[CV] END alpha=0, eta=0.1, gamma=0, lambda=0, max_depth=3, min_child_weight=0.5, n_estimators=250, subsample=0.75; total time=   0.6s
[CV] END alpha=0, eta=0.1, gamma=0, lambda=0, max_depth=3, min_child_weight=0.5, n_estimators=250, subsample=0.75; total time=   0.6s
[CV] END alpha=0, eta=0.1, gamma=0, lambda=0, max_depth=3, min_child_weight=0.5, n_estimators=250, subsample=0.75; total time=   0.6s
[CV] END alpha=0, eta=0.1, gamma=0, lambda=0, max_depth=3, min_ch

#### MLFLOW

In [17]:
# Score the reloaded model's test-set predictions with four regression metrics
r_squared = r2_score(y_true=y_test, y_pred=y_pred_loaded)
mean_absolute_err = mean_absolute_error(y_true=y_test, y_pred=y_pred_loaded)
median_absolute_err = median_absolute_error(y_true=y_test, y_pred=y_pred_loaded)
mean_squared_err = mean_squared_error(y_true=y_test, y_pred=y_pred_loaded)

In [18]:
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Select the MLflow experiment that this run is recorded under
mlflow.set_experiment("MLflow Quickstart")

# A handful of rows is enough to document the expected model input. Passing the
# whole of X_train as the example made MLflow store the entire training set
# alongside every logged model version.
model_input_example = X_train.head(5)

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters chosen by the search
    mlflow.log_params(best_params)

    # Log the test-set evaluation metrics
    mlflow.log_metric("r_squared", r_squared)
    mlflow.log_metric("mean_absolute_error", mean_absolute_err)
    mlflow.log_metric("median_absolute_error", median_absolute_err)
    mlflow.log_metric("mean_squared_error", mean_squared_err)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic XGBOOST model, cross validated with 5 folds")

    # Infer the model signature (input/output schema) from the training
    # features and the model's predictions on them
    signature = infer_signature(X_train, loaded_model.predict(X_train))

    # Log the model and register it as a new version of "XGBOOST"
    model_info = mlflow.sklearn.log_model(
        sk_model=loaded_model,
        artifact_path="XGBOOST_model",
        signature=signature,
        input_example=model_input_example,
        registered_model_name="XGBOOST",
    )

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Registered model 'XGBOOST' already exists. Creating a new version of this model...
2024/03/03 13:32:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBOOST, version 4
Created version '4' of model 'XGBOOST'.
