In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score
import xgboost as xgb
from scipy.stats import uniform, randint

#### Read in Dataset

In [2]:
df = pd.read_csv('C:/Users/sega9/Documents/GA Tech/2024.Spring/CSE 6242/Group Project/ApartmentRent/DataModified/engineered_data.csv', 
                 encoding='utf-8')

  df = pd.read_csv('C:/Users/sega9/Documents/GA Tech/2024.Spring/CSE 6242/Group Project/ApartmentRent/DataModified/engineered_data.csv',


In [3]:
# List of columns to drop
columns_to_drop = ['title','body','address','latitude','longitude','geometry']
# Drop the specified columns
df = df.drop(columns=columns_to_drop)

#### Encode Data for Modeling

In [4]:
# Columns to be treated as factor
cols = ["bathrooms", "bedrooms", "fee", "has_photo", "cityname", "state", "source"]
df[cols] = df[cols].astype('category')

In [5]:
# One-hot encode categorical variables
data_encoded = pd.get_dummies(df, columns=cols, drop_first=True)

# Display the dimensions, column names, and structure of the DataFrame after encoding
print(data_encoded.shape)
print(data_encoded.columns)
print(data_encoded.info())

(87063, 475)
Index(['id', 'price', 'square_feet', 'zipcode', 'POPULATION', 'POP_SQMI',
       'SQMI', 'perc_sub25k', 'perc_25-50k', 'perc_50-75k',
       ...
       'source_RealRentals', 'source_RentDigs.com', 'source_RentFeeder',
       'source_RentLingo', 'source_Seattle Rentals', 'source_SpreadMyAd',
       'source_Z57', 'source_rentbits', 'source_tenantcloud', 'source_vFlyer'],
      dtype='object', length=475)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87063 entries, 0 to 87062
Columns: 475 entries, id to source_vFlyer
dtypes: float64(11), int64(32), uint8(432)
memory usage: 64.4 MB
None


In [6]:
X = data_encoded.drop(['price'], axis=1)  # 'price' is the dependent variable
y = data_encoded['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Linear Regession

In [7]:
# Create and fit the linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)

R-squared: 0.6738011477226288


#### XGBoost

In [8]:
# Create and fit the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_model.predict(X_test)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)

R-squared: 0.7751607278439088


##### XGBOOST Hyperparam Tuning (GRID)

In [9]:
# Define hyperparameters for tuning
param_grid = {
    'eta': [0.1, 0.25],                     # 'learning_rate' Step size shrinkage used in update to prevent overfitting
    'n_estimators': [600],                  # Number of boosting rounds (trees) to be run
    'max_depth': [10,15],                   # Maximum depth of a tree
    'gamma': [0,1],                         # Minimum loss reduction required to make a further partition on a leaf node
    'min_child_weight': [0.5, 2],           # Minimum sum of instance weight (hessian) needed in a child
    'subsample': [0.75],                    # Subsample ratio of the training instances
    'lambda': [0,1],                        # L2 regularization term on weights
    'alpha': [0,1],                         # L1 regularization term on weights
}

In [11]:
# Create the XGBoost model with GPU support
# xgb_model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', random_state=42, verbosity=2)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, verbosity=2)

# Perform grid search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='r2',verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Train the model with the best parameters
# final_model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', **best_params, random_state=42)
final_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params, random_state=42)
final_model.fit(X_train, y_train)

# Save the final model in JSON format
final_model.save_model('xgboost_final_model2.json')

# Load the saved model in JSON format
loaded_model = xgb.XGBRegressor()
loaded_model.load_model('xgboost_final_model2.json')

# Predict on the test set using the loaded model
y_pred_loaded = loaded_model.predict(X_test)

# Calculate R-squared using the loaded model
r_squared_loaded = r2_score(y_test, y_pred_loaded)
print("Best Parameters:", best_params)
print("R-squared (Loaded Model):", r_squared_loaded)

R-squared (Loaded Model): 0.8569540299075296


Best Parameters: {'alpha': 1, 'eta': 0.1, 'gamma': 0, 'lambda': 1, 'max_depth': 10, 'min_child_weight': 0.5, 'n_estimators': 600, 'subsample': 0.75}

#### Use Model to Predict Prices

In [12]:
# Predict on the entire dataset
data_encoded['predicted_price'] = loaded_model.predict(X)
data_encoded['price_delta'] = data_encoded['price'] - data_encoded['predicted_price']

# Display the original dataset with the predicted prices
print(data_encoded[['id','price', 'predicted_price','price_delta']])

               id   price  predicted_price  price_delta
0      5668640009  2195.0      2159.190186    35.809814
1      5668639818  1250.0      1317.267944   -67.267944
2      5668639686  1395.0      1580.810547  -185.810547
3      5668639659  1600.0      1710.670776  -110.670776
4      5668639374   975.0       861.394897   113.605103
...           ...     ...              ...          ...
87058  5121219946   780.0       806.076233   -26.076233
87059  5121219696   813.0       851.545471   -38.545471
87060  5121219420  1325.0      1258.522339    66.477661
87061  5121218935   931.0       980.949707   -49.949707
87062  5121218844  1595.0      1680.164551   -85.164551

[87063 rows x 4 columns]


In [13]:
data_encoded[['id','price', 'predicted_price','price_delta']]

Unnamed: 0,id,price,predicted_price,price_delta
0,5668640009,2195.0,2159.190186,35.809814
1,5668639818,1250.0,1317.267944,-67.267944
2,5668639686,1395.0,1580.810547,-185.810547
3,5668639659,1600.0,1710.670776,-110.670776
4,5668639374,975.0,861.394897,113.605103
...,...,...,...,...
87058,5121219946,780.0,806.076233,-26.076233
87059,5121219696,813.0,851.545471,-38.545471
87060,5121219420,1325.0,1258.522339,66.477661
87061,5121218935,931.0,980.949707,-49.949707


#### Add Predicted Prices to Original Dataset

In [16]:
df_orig = pd.read_csv('C:/Users/sega9/Documents/GA Tech/2024.Spring/CSE 6242/Group Project/ApartmentRent/DataModified/engineered_data.csv', 
                 encoding='utf-8')

final = df_orig.merge(data_encoded[['id','predicted_price','price_delta']], on='id', how='left')

  df_orig = pd.read_csv('C:/Users/sega9/Documents/GA Tech/2024.Spring/CSE 6242/Group Project/ApartmentRent/DataModified/engineered_data.csv',


In [17]:
final.to_csv('C:/Users/sega9/Documents/GA Tech/2024.Spring/CSE 6242/Group Project/ApartmentRent/DataModified/predicted_data.csv', encoding='utf-8',index= False)
