In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet

# Load the dataset; dropna(axis=1) discards every column that contains at
# least one missing value, so no imputation step is needed downstream.
Ames = pd.read_csv("Ames.csv").dropna(axis=1)

# Numeric predictors: int64/float64 columns, excluding "PID" and the
# "SalePrice" target so neither is fed to the model as a feature.
numeric_features = (
    Ames.select_dtypes(include=["int64", "float64"])
    .drop(columns=["PID", "SalePrice"])
    .columns
)
# Categorical predictors: every object-dtype column.
categorical_features = Ames.select_dtypes(include=["object"]).columns

# Feature matrix (numeric columns first, then categorical)
X = Ames[numeric_features.tolist() + categorical_features.tolist()]

# Target variable
y = Ames["SalePrice"]

# Pipeline for numeric features: standardise each column
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Pipeline for categorical features: one-hot encode; categories that were not
# seen while fitting are ignored instead of raising during validation.
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combined preprocessor routing each column group to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# One pipeline per regularised linear model; the preprocessing step is part
# of the pipeline, so it is refit inside every cross-validation split.
pipelines = {
    "Lasso": Pipeline(steps=[("preprocessor", preprocessor),
                             ("regressor", Lasso(max_iter=20000))]),
    "Ridge": Pipeline(steps=[("preprocessor", preprocessor),
                             ("regressor", Ridge())]),
    "ElasticNet": Pipeline(steps=[("preprocessor", preprocessor),
                                  ("regressor", ElasticNet())])
}

# Cross-validate each pipeline (cross_val_score called with its default
# splitter and scorer) and keep the mean score per model.
cv_results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X, y)
    # Cast to a built-in float: a NumPy scalar stored in the dict is printed
    # as "np.float64(0.8863)" under NumPy >= 2 instead of plain "0.8863".
    cv_results[name] = float(round(scores.mean(), 4))

# Output the mean cross-validation scores
print(cv_results)

{'Lasso': np.float64(0.8863), 'Ridge': np.float64(0.8889), 'ElasticNet': np.float64(0.8299)}


In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet

# Load the dataset, keeping only the columns that have no missing values
Ames = pd.read_csv("Ames.csv").dropna(axis=1)

# Identify numeric and categorical features; "PID" and the "SalePrice" target
# are removed from the numeric set so they are never used as predictors.
numeric_features = (
    Ames.select_dtypes(include=["int64", "float64"])
    .drop(columns=["PID", "SalePrice"])
    .columns
)
categorical_features = Ames.select_dtypes(include=["object"]).columns

# Prepare feature matrix X (numeric columns, then categorical) and target y
X = Ames[numeric_features.tolist() + categorical_features.tolist()]
y = Ames["SalePrice"]

# Pipeline for numeric features: standardisation only
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Pipeline for categorical features: one-hot encoding that ignores categories
# absent from the data the encoder was fitted on.
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combined preprocessor for both numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Define the model pipelines: shared preprocessor followed by a regressor
pipelines = {
    "Lasso": Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", Lasso(max_iter=20000))
    ]),
    "Ridge": Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", Ridge())
    ]),
    "ElasticNet": Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", ElasticNet())
    ])
}

# Perform cross-validation and store the mean score of each model
cv_results = {}
for name, pipeline in pipelines.items():
    scores = cross_val_score(pipeline, X, y)
    # float(...) turns the NumPy scalar into a built-in float so the printed
    # dict reads {'Lasso': 0.8863, ...} rather than {'Lasso': np.float64(...)}.
    cv_results[name] = float(round(scores.mean(), 4))

# Output the mean cross-validation scores
print(cv_results)

{'Lasso': np.float64(0.8863), 'Ridge': np.float64(0.8889), 'ElasticNet': np.float64(0.8299)}


In [3]:
# Implement GridSearchCV on Lasso to obtain optimal alpha
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV

# Read the Ames data and keep only the columns without missing values
Ames = pd.read_csv("Ames.csv").dropna(axis=1)

# Column groups: numeric predictors (minus "PID" and the "SalePrice" target)
# and object-dtype categorical predictors
numeric_features = (
    Ames.select_dtypes(include=["int64", "float64"])
    .drop(columns=["PID", "SalePrice"])
    .columns
)
categorical_features = Ames.select_dtypes(include=["object"]).columns

# Predictors (numeric first, then categorical) and target
X = Ames[[*numeric_features, *categorical_features]]
y = Ames["SalePrice"]

# Per-group transformers: scale numerics, one-hot encode categoricals
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# One preprocessing + regressor pipeline per model, keyed by model name
pipelines = {
    model_name: Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor),
    ])
    for model_name, regressor in (
        ("Lasso", Lasso(max_iter=20000)),
        ("Ridge", Ridge()),
        ("ElasticNet", ElasticNet()),
    )
}

# Grid search over the Lasso penalty strength: alpha = 1, 2, ..., 20
alpha = list(range(1, 21))
lasso_grid = GridSearchCV(
    pipelines["Lasso"],
    {"regressor__alpha": alpha},
    verbose=1,  # report fitting progress
)
lasso_grid.fit(X, y)

# Report the winning alpha together with its mean cross-validation score
lasso_best_score = lasso_grid.best_score_
lasso_best_alpha = lasso_grid.best_params_["regressor__alpha"]
print(f"Best alpha for Lasso: {lasso_best_alpha}")
print(f"Best cross-validation score: {round(lasso_best_score, 4)}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best alpha for Lasso: 17
Best cross-validation score: 0.8881


In [5]:
# Implement GridSearchCV on Ridge to obtain optimal alpha
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV

# Read the Ames data, discarding any column that has a missing value
Ames = pd.read_csv("Ames.csv").dropna(axis=1)

# Numeric predictors exclude "PID" and the "SalePrice" target;
# categorical predictors are the object-dtype columns.
numeric_features = (
    Ames.select_dtypes(include=["int64", "float64"])
    .drop(columns=["PID", "SalePrice"])
    .columns
)
categorical_features = Ames.select_dtypes(include=["object"]).columns

# Feature matrix (numeric columns followed by categorical) and target vector
X = Ames[[*numeric_features, *categorical_features]]
y = Ames["SalePrice"]

# Preprocessing: standardise numeric columns, one-hot encode categorical ones
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Model pipelines, each pairing the shared preprocessor with one regressor
pipelines = {
    model_name: Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor),
    ])
    for model_name, regressor in (
        ("Lasso", Lasso(max_iter=20000)),
        ("Ridge", Ridge()),
        ("ElasticNet", ElasticNet()),
    )
}

# Candidate Ridge penalty strengths: the integers 1 through 20
alpha = list(range(1, 21))

# Exhaustive search over alpha for the Ridge pipeline
ridge_grid = GridSearchCV(
    pipelines["Ridge"],
    {"regressor__alpha": alpha},
    verbose=1,  # report fitting progress
)
ridge_grid.fit(X, y)

# Report the best alpha and the mean cross-validation score it achieved
ridge_best_score = ridge_grid.best_score_
ridge_best_alpha = ridge_grid.best_params_["regressor__alpha"]
print(f"Best alpha for Ridge: {ridge_best_alpha}")
print(f"Best cross-validation score: {round(ridge_best_score, 4)}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best alpha for Ridge: 2
Best cross-validation score: 0.8893


In [7]:
# Implement GridSearchCV on ElasticNet to obtain optimal parameters
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV

# Read the Ames data; columns containing any missing value are dropped
Ames = pd.read_csv("Ames.csv").dropna(axis=1)

# Group the predictors by dtype, leaving out "PID" and the "SalePrice" target
numeric_features = (
    Ames.select_dtypes(include=["int64", "float64"])
    .drop(columns=["PID", "SalePrice"])
    .columns
)
categorical_features = Ames.select_dtypes(include=["object"]).columns

# Predictors (numeric, then categorical) and target
X = Ames[[*numeric_features, *categorical_features]]
y = Ames["SalePrice"]

# Transformers: scaling for numeric columns, one-hot encoding for categoricals
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Build the three model pipelines around the shared preprocessor
pipelines = {
    model_name: Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor),
    ])
    for model_name, regressor in (
        ("Lasso", Lasso(max_iter=20000)),
        ("Ridge", Ridge()),
        ("ElasticNet", ElasticNet()),
    )
}

# Search space: alpha in 1..20 crossed with three L1/L2 mixing ratios
alpha = list(range(1, 21))
l1_ratio = [0.05, 0.5, 0.95]

# Exhaustive search over every (alpha, l1_ratio) pair for ElasticNet
elasticnet_grid = GridSearchCV(
    pipelines["ElasticNet"],
    {
        "regressor__alpha": alpha,
        "regressor__l1_ratio": l1_ratio,
    },
    verbose=1,  # report fitting progress
)
elasticnet_grid.fit(X, y)

# Report the best parameter combination and its mean cross-validation score
elasticnet_best_score = elasticnet_grid.best_score_
elasticnet_best_params = elasticnet_grid.best_params_

print(f"Best parameters for ElasticNet: {elasticnet_best_params}")
print(f"Best cross-validation score: {round(elasticnet_best_score, 4)}")

Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best parameters for ElasticNet: {'regressor__alpha': 1, 'regressor__l1_ratio': 0.95}
Best cross-validation score: 0.8762


  _data = np.array(data, dtype=dtype, copy=copy,
