### Spark mllib coding devotional

#### learning goal:
- Understand basic model building with the mllib library
- Understand key differences from Scikit-learn

#### Building an XGBoost Model: Spark vs Python

import necessary tools

In [2]:
#SCIKIT & other necessary libraries
!pip install scikit-learn seaborn matplotlib xgboost pandas numpy plotly
!pip install lets-plot

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# PYTHON - SCIKIT / Pandas

# import skikit.learn (sklearn) models, functions and preprocessing code
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier 

from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import train_test_split 

from sklearn import metrics 
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             mean_squared_error,
                             mean_absolute_error,
                             r2_score,
                             confusion_matrix,
                             accuracy_score,
                             precision_score)

# import the model and hyperparameter tuning functions
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import LeavePOut

# Visualization
!pip install lets-plot
from lets_plot import *
LetsPlot.setup_html()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


load dataset

In [4]:
# PYTHON - Pandas
df = pd.read_csv('https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv')
df.drop('date', axis=1, inplace=True) # remove this column, it messes with the model we are using
df = df.query('bedrooms <= 12') #there is a house that has an absurd number of rooms so we need to remove it as an outlier

In [5]:
df.columns

Index(['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'price'],
      dtype='object')

Create a regression model (reg1)

In [6]:
df2 = df.copy() # don't want to mess with the original dataset
y = df2['price'] # choose a target variable from the testing dataset
X = df2.drop('price', axis=1) # remove the target from the testing dataset


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create a baseline xgboost model
reg1 = XGBRegressor()
reg1.fit(X_train, y_train)

# run the data through the model and check tested vs predictions

y_pred = reg1.predict(X_test)

#check the classification report on how the model did
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))


0.8698035296154463
17579100346.97157
68997.35462109375


In [7]:
# Get feature importances
importances = reg1.feature_importances_
feature_names = X.columns

# Create a DataFrame for visualization
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='importance', ascending=False)

# print(importance_df) #(add this line to see exact values for the features)

features = px.bar(importance_df,
                  x='importance',
                  y='feature',
                  orientation='h',
                  title='Feature Importances in Base XGBoost Model',
                  height= 800)
features.show()

hyperparameter tuning / k-fold validation to prep for model2:

In [8]:
# PYTHON -SCIKIT

# tools used 
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [3, 4],
    'learning_rate': [0.1, 0.15],
    'subsample': [0.8, 0.9, 1.0],
}

# Set up the grid search
grid_search = GridSearchCV(
    estimator=reg1,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error', # For regression, to minimize RMSE
    cv=3, # Cross-validation (k-fold) splitting strategy
    verbose=1,
    n_jobs=-1 # Use all available cores
)

# Fit the grid search
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [9]:
# Best parameters and model performance
print("Best Hyperparameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)
print("R-squared:", grid_search.best_estimator_.score(X_test, y_test))

Best Hyperparameters: {'learning_rate': 0.15, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.8}
Best RMSE: 122088.59645082681
R-squared: 0.880816120135671


Regression model when using Hyperparameter tuning (reg2)

In [10]:
# build out the model based on the best parameters
reg2 = XGBRegressor(
                    learning_rate = 0.15,
                    max_depth = 4,
                    n_estimators = 300,
                    subsample = 1.0)

# fit the model 2 xgboost model
reg2.fit(X_train, y_train)

# run the data through the model and check tested vs predictions

y_pred = reg2.predict(X_test)

#check the classification report on how the model did
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.8696875377566029
17594761543.623188
68002.18127539063


In [11]:
# previous model scores
# 0.8772387477317761
# 16575198743.629103
# 69075.43608984374

In [12]:
import pickle

with open('xgboost_model.pkl', 'wb') as model_file:
    pickle.dump(reg2, model_file)

In [13]:
import pickle
import base64
import io

# Convert the trained model to base64 string
bytes_pickle = io.BytesIO()
pickle.dump(reg2, bytes_pickle)
bytes_pickle.seek(0)
base64_str = base64.b64encode(bytes_pickle.read()).decode('utf-8')

# Save to text file
with open('xgb_model_string.txt', 'w') as f:
    f.write(base64_str)

print(f"Model saved to text file 'xgb_model_string.txt'")
print(f"String length: {len(base64_str)} characters")

# Optional: Display first 100 characters of the string
print(f"First 100 characters of string: {base64_str[:100]}...")

Model saved to text file 'xgb_model_string.txt'
String length: 676484 characters
First 100 characters of string: gASVQgMAAAAAAACMD3hnYm9vc3Quc2tsZWFybpSMDFhHQlJlZ3Jlc3NvcpSTlCmBlH2UKIwMbl9lc3RpbWF0b3JzlE0sAYwJb2Jq...


In [14]:
# Instead of pickle, save the model using XGBoost's native format
reg2.save_model('xgb_model.json')

# Then read the file and upload it to GitHub
with open('xgb_model.json', 'r') as f:
    model_json = f.read()