In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Load the Ames housing data; the "SalePrice" column is the regression target
Ames = pd.read_csv("Ames.csv")
y = Ames["SalePrice"]
linear_model = LinearRegression()

# Perform 5-fold cross-validation without Pipeline.
# cv=5 is passed explicitly so the fold count always matches the Pipeline run
# below instead of depending on the library default (scikit-learn releases
# before 0.22 defaulted to 3 folds).
cv_score = cross_val_score(linear_model, Ames[["OverallQual"]], y, cv=5).mean()
print("Example Without Pipeline, Mean CV R^2 score for 'OverallQual': {:.3f}".format(cv_score))

# Perform 5-fold cross-validation WITH Pipeline: the same estimator is wrapped
# in a single-step Pipeline and scored on the same feature and folds
pipeline = Pipeline([("regressor", linear_model)])
pipeline_score = cross_val_score(pipeline, Ames[["OverallQual"]], y, cv=5).mean()
print("Example With Pipeline, Mean CV R^2 for 'OverallQual': {:.3f}".format(pipeline_score))

Example Without Pipeline, Mean CV R^2 score for 'OverallQual': 0.618
Example With Pipeline, Mean CV R^2 for 'OverallQual': 0.618


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import pandas as pd

# Estimator under evaluation: linear regression with default settings
linear_model = LinearRegression()

# Load the Ames housing data; the "SalePrice" column is the regression target
Ames = pd.read_csv("Ames.csv")
y = Ames["SalePrice"]

# Baseline WITHOUT a Pipeline: the quality-weighted area ("OWA") is written
# into the DataFrame as a new column and then scored with 5-fold CV
Ames["OWA"] = Ames["OverallQual"] * Ames["GrLivArea"]
cv_score_2 = cross_val_score(estimator=linear_model, X=Ames[["OWA"]], y=y, cv=5).mean()
print(
    "Example Without Pipeline, Mean CV R^2 score for 'Quality Weighted Area': {:.3f}".format(
        cv_score_2
    )
)

# WITH Pipeline
# Define the transformation function for "QualityArea"
def create_quality_area(X):
    """Build the single-column "QualityArea" feature matrix.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns "OverallQual" and "GrLivArea".
        The input frame is not modified.

    Returns
    -------
    numpy.ndarray
        2-D array with one column holding the row-wise product
        OverallQual * GrLivArea.
    """
    quality_area = X["OverallQual"] * X["GrLivArea"]
    return quality_area.to_frame(name="QualityArea").values

# Wrap the feature-building function so it can be used as a Pipeline step
quality_area_transformer = FunctionTransformer(func=create_quality_area)

# Two-step Pipeline: derive "QualityArea" first, then fit the linear model on it
pipeline_2 = Pipeline(
    steps=[
        ("quality_area_transform", quality_area_transformer),
        ("regressor", linear_model),
    ]
)

# 5-fold cross-validation on the two raw columns; the engineered feature is
# produced inside the Pipeline rather than stored in the DataFrame
pipeline_score_2 = cross_val_score(
    estimator=pipeline_2, X=Ames[["OverallQual", "GrLivArea"]], y=y, cv=5
).mean()

# Output the mean CV score formatted to three decimal places
print(
    "Example With Pipeline, Mean CV R^2 score for 'Quality Weighted Area': {:.3f}".format(
        pipeline_score_2
    )
)

Example Without Pipeline, Mean CV R^2 score for 'Quality Weighted Area': 0.748
Example With Pipeline, Mean CV R^2 score for 'Quality Weighted Area': 0.748


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

# Estimator under evaluation: linear regression with default settings
linear_model = LinearRegression()

# Load the Ames housing data; the "SalePrice" column is the regression target
Ames = pd.read_csv("Ames.csv")
y = Ames["SalePrice"]

# Function to apply cubic transformation
def cubic_transformation(x):
    """Return ``x`` raised to the third power.

    Works for anything supporting the power operator; for array-like or
    DataFrame input the cube is applied element-wise by that type.
    """
    return pow(x, 3)

# Function to create "QualityArea"
def create_quality_area(X):
    """Build the single-column "QualityArea" feature matrix.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns "OverallQual" and "GrLivArea".
        The input frame is not modified.

    Returns
    -------
    numpy.ndarray
        2-D array with one column holding the row-wise product
        OverallQual * GrLivArea.
    """
    quality_area = X["OverallQual"] * X["GrLivArea"]
    return quality_area.to_frame(name="QualityArea").values

# Wrap both feature functions so they can be used as ColumnTransformer steps
cubic_transformer = FunctionTransformer(func=cubic_transformation)
quality_area_transformer = FunctionTransformer(func=create_quality_area)

# Column-wise preprocessing: each entry is (name, transformer, input columns)
preprocessor = ColumnTransformer(
    transformers=[
        # OverallQual cubed
        ("cubic", cubic_transformer, ["OverallQual"]),
        # OverallQual * GrLivArea interaction feature
        (
            "quality_area_transform",
            quality_area_transformer,
            ["OverallQual", "GrLivArea"],
        ),
        # One-hot encode the categorical columns, dropping the first level of each
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore", drop="first"),
            ["Neighborhood", "ExterQual", "KitchenQual"],
        ),
        # YearBuilt is forwarded unchanged
        ("passthrough", "passthrough", ["YearBuilt"]),
    ]
)

# Full model: preprocessing followed by the linear regression
pipeline_3 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", linear_model),
    ]
)

# Evaluate with 5-fold cross-validation. The whole DataFrame is passed as X;
# the ColumnTransformer only reads the columns named above.
pipeline_score_3 = cross_val_score(estimator=pipeline_3, X=Ames, y=y, cv=5).mean()

# Output the mean CV score formatted to three decimal places
print("Mean CV R^2 score with enhanced transformations: {:.3f}".format(pipeline_score_3))

In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer

# Estimator under evaluation: linear regression with default settings
linear_model = LinearRegression()

# Load the Ames housing data; the "SalePrice" column is the regression target
Ames = pd.read_csv("Ames.csv")
y = Ames["SalePrice"]

# Function to apply cubic transformation
def cubic_transformation(x):
    """Return ``x`` raised to the third power.

    Works for anything supporting the power operator; for array-like or
    DataFrame input the cube is applied element-wise by that type.
    """
    return pow(x, 3)

# Function to create "QualityArea"
def create_quality_area(X):
    """Build the single-column "QualityArea" feature matrix.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns "OverallQual" and "GrLivArea".
        The input frame is not modified.

    Returns
    -------
    numpy.ndarray
        2-D array with one column holding the row-wise product
        OverallQual * GrLivArea.
    """
    quality_area = X["OverallQual"] * X["GrLivArea"]
    return quality_area.to_frame(name="QualityArea").values

# Wrap both feature functions so they can be used as ColumnTransformer steps
cubic_transformer = FunctionTransformer(func=cubic_transformation)
quality_area_transformer = FunctionTransformer(func=create_quality_area)

# Nested pipeline for BsmtQual: replace missing values with the literal
# category "Missing", then one-hot encode the result
bsmt_qual_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(fill_value="Missing", strategy="constant")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Column-wise preprocessing: each entry is (name, transformer, input columns)
preprocessor = ColumnTransformer(
    transformers=[
        # Impute + encode basement quality
        ("bsmt_qual", bsmt_qual_transformer, ["BsmtQual"]),
        # OverallQual cubed
        ("cubic", cubic_transformer, ["OverallQual"]),
        # OverallQual * GrLivArea interaction feature
        (
            "quality_area_transform",
            quality_area_transformer,
            ["OverallQual", "GrLivArea"],
        ),
        # One-hot encode the categorical columns, dropping the first level of each
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore", drop="first"),
            ["Neighborhood", "ExterQual", "KitchenQual"],
        ),
        # YearBuilt is forwarded unchanged
        ("passthrough", "passthrough", ["YearBuilt"]),
    ]
)

# Full model: preprocessing followed by the linear regression
pipeline_4 = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", linear_model),
    ]
)

# Evaluate with 5-fold cross-validation. The whole DataFrame is passed as X;
# the ColumnTransformer only reads the columns named above.
pipeline_score = cross_val_score(estimator=pipeline_4, X=Ames, y=y, cv=5).mean()

# Output the mean CV score formatted to three decimal places
print("Mean CV R^2 score with imputing & transformations: {:.3f}".format(pipeline_score))

Mean CV R^2 score with imputing & transformations: 0.856


