In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from wrangle import train_val_test, xy_split, scale_data, clean_and_convert
from model import eval_model, train_model


from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Regression Modeling 

#### 1) Select a dataset with a continuous target variable.

In [2]:
# Load the dataset through the project's wrangle helper; the returned frame is
# the one that the later cells split, scale, and model.
df = clean_and_convert()

# Peek at the first rows to confirm the columns and value ranges look sane.
df.head()

Unnamed: 0,bedrooms,bathrooms,squarefeet,tax_value,year_built,tax_amount,fips,bedrooms_bin,bathrooms_bin,squarefeet_bin,decades
4,4,2,3633,296425,2005,6941,6037,6,3,4000,2010
6,3,4,1620,847770,2011,10244,6037,4,7,2000,2020
7,3,2,2077,646760,1926,7924,6037,4,3,2500,1930
11,0,0,1200,5328,1972,91,6037,2,1,1500,1980
14,0,0,171,6920,1973,255,6037,2,1,500,1980


#### 2) Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [3]:
# Count missing values (NaN) per column; every count should be 0 before the
# data is split and modeled.
df.isna().sum()

bedrooms          0
bathrooms         0
squarefeet        0
tax_value         0
year_built        0
tax_amount        0
fips              0
bedrooms_bin      0
bathrooms_bin     0
squarefeet_bin    0
decades           0
dtype: int64

In [4]:
# Partition the full frame into train / validation / test subsets using the
# project's wrangle helper. No split proportions are passed here, so they are
# decided inside that helper.
train, val, test = train_val_test(df)

In [5]:
# Report (rows, columns) for each split, in train / validation / test order,
# to confirm the partition sizes and that all splits share the same columns.
tuple(subset.shape for subset in (train, val, test))

((1477408, 11), (316588, 11), (316588, 11))

#### 3) Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [6]:
# Columns to scale: every column except the target ('tax_value') and
# 'tax_amount'. Reading the names straight off the column Index avoids building
# a throwaway copy of the whole frame just to list its columns.
# NOTE(review): 'tax_amount' is excluded from scaling but not removed from the
# data, so it appears to stay in as a model feature. It is presumably derived
# from the assessed value — confirm this is not target leakage.
to_scale = df.columns.drop(['tax_value', 'tax_amount'])

# Scale the selected columns in each split with the project's wrangle helper.
# The names train/val/test are rebound to the scaled frames.
train, val, test = scale_data(train, val, test, to_scale)

# Inspect the scaled training rows.
train.head()

Unnamed: 0,bedrooms,bathrooms,squarefeet,tax_value,year_built,tax_amount,fips,bedrooms_bin,bathrooms_bin,squarefeet_bin,decades
34469,0.3125,0.1,0.297519,131067,0.448276,2354,0.0,0.173913,0.064516,0.222222,0.454545
857921,0.25,0.1,0.261905,209578,0.517241,2743,0.0,0.173913,0.064516,0.222222,0.545455
1259741,0.1875,0.1,0.270508,397364,0.508621,5004,1.0,0.086957,0.064516,0.222222,0.454545
1094500,0.1875,0.1,0.252301,99366,0.655172,1480,0.0,0.086957,0.064516,0.222222,0.636364
1685666,0.25,0.1,0.271309,76733,0.448276,1171,0.0,0.173913,0.064516,0.222222,0.454545


In [7]:
# Separate the scaled training split into features (X_train) and target
# (y_train) via the project's xy_split helper.
X_train, y_train = xy_split(train)

# Same separation for the validation split, used to compare candidate models.
X_val, y_val = xy_split(val)

#### Baseline

In [8]:
# Candidate constant predictions for the baseline: the mean and the median of
# the training target.
y_train.mean(), y_train.median()

(425506.29396415886, 324536.5)

In [9]:
# Baseline table: start from the actual training targets, then attach two
# constant "prediction" columns that repeat the training mean and median on
# every row.
baselines = pd.DataFrame({'y_actual': y_train}).assign(
    y_mean=y_train.mean(),
    y_median=y_train.median(),
)


baselines.head()

Unnamed: 0,y_actual,y_mean,y_median
34469,131067,425506.293964,324536.5
857921,209578,425506.293964,324536.5
1259741,397364,425506.293964,324536.5
1094500,99366,425506.293964,324536.5
1685666,76733,425506.293964,324536.5


In [10]:
# Score the mean baseline: compare the actual training targets against the
# constant mean prediction using the project's eval_model helper.
# NOTE(review): the metric is presumably RMSE, matching what train_model
# prints — confirm in model.py.
eval_model(baselines.y_actual, baselines.y_mean)

452297.02041525557

In [11]:
# Score the median baseline: compare the actual training targets against the
# constant median prediction using the project's eval_model helper.
# NOTE(review): the metric is presumably RMSE, matching what train_model
# prints — confirm in model.py.
eval_model(baselines.y_actual, baselines.y_median)

463430.13925475616

#### Linear Regression

In [12]:
# Ordinary least squares with scikit-learn's default settings.
lm = LinearRegression()

# Hand the model and both splits to the project's train_model helper, which
# prints the RMSE on the training and validation data.
train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 74826.71752063422.
The validate RMSE is 74788.92674552703.


#### LassoLars

In [13]:
# LassoLars with a regularization strength of alpha=0.5.
ll = LassoLars(alpha=0.5)

# Hand the model and both splits to the project's train_model helper, which
# prints the RMSE on the training and validation data.
train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 74826.78861847745.
The validate RMSE is 74789.29820404304.


#### Polynomial Features

In [14]:
# Expand the features with polynomial and interaction terms. degree=2 is
# scikit-learn's default, spelled out here so the expansion is explicit.
poly = PolynomialFeatures(degree=2)

# Learn the expansion from the training features only, then apply it.
X_train_s = poly.fit_transform(X_train)

# Apply the already-fitted expansion to the validation features. Validation
# data must only be transformed, never used to (re)fit a preprocessing step.
X_val_s = poly.transform(X_val)

In [15]:
# Compare shapes after and before the expansion to see how many columns the
# polynomial step added (the row count should be unchanged).
X_train_s.shape, X_train.shape

((1477408, 66), (1477408, 10))

#### Linear Regression with improved features

In [16]:
# Fresh linear regression for the polynomial-expanded features.
# NOTE: this rebinds `lm`, so the model fitted on the raw features earlier is
# no longer reachable by that name.
lm = LinearRegression()

# Hand the model and the polynomial train/validation features to the project's
# train_model helper, which prints the RMSE on both.
train_model(lm, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 72418.2787343921.
The validate RMSE is 72403.13661619343.


#### Tweedie Regressor

In [17]:
# Tweedie generalized linear model, left at its default hyperparameters.
# NOTE(review): with these untuned defaults the RMSE printed below is far
# worse than plain linear regression on the same features — consider tuning
# `power` / `alpha` before drawing conclusions about this model family.
tweedie = TweedieRegressor()

# Hand the model and the polynomial train/validation features to the project's
# train_model helper, which prints the RMSE on both.
train_model(tweedie, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 258099.33064516.
The validate RMSE is 261450.89801900787.


#### Random Forest Regressor

In [18]:
# Random forest with default hyperparameters. The seed is fixed so the
# bootstrap samples and feature draws — and therefore the reported RMSE — are
# the same on every Restart & Run All.
rf = RandomForestRegressor(random_state=42)

# Hand the model and both splits (raw, non-polynomial features) to the
# project's train_model helper, which prints the RMSE on the training and
# validation data. A large gap between the two numbers indicates overfitting.
train_model(rf, X_train, y_train, X_val, y_val)

The train RMSE is 25361.629830848084.
The validate RMSE is 66418.18770904491.


#### XGBRegressor

In [19]:
# Gradient-boosted trees (XGBoost) with default hyperparameters.
xgbr = XGBRegressor()

# Hand the model and both splits (raw, non-polynomial features) to the
# project's train_model helper, which prints the RMSE on the training and
# validation data.
# NOTE(review): the code fragments printed before the RMSE lines look like
# library deprecation-warning residue rather than model output — confirm and
# consider upgrading xgboost/pandas or filtering the warnings.
train_model(xgbr, X_train, y_train, X_val, y_val)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


The train RMSE is 56836.67888771232.
The validate RMSE is 65614.68301662171.
