In [4]:

# Custom StandardScalerClone
class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True):
        self.with_mean = with_mean
    
    def fit(self, X, y=None):
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        return self
    
    def transform(self, X):
        X_scaled = X.copy()
        if self.with_mean:
            X_scaled -= self.mean_
        return X_scaled / self.scale_

# Log transformer
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)

# Create preprocessing pipeline
numeric_features = ["displacement", "horsepower", "weight", "acceleration"]
categorical_features = ["cylinders", "model_year", "origin"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScalerClone())
])

log_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log", log_transformer)
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, ["horsepower", "acceleration"]),
        ("log", log_transformer, ["displacement", "weight"]),
        ("cat", categorical_transformer, categorical_features),
    ])

# Clustering-based transformer


# Full pipeline
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("cluster_similarity", ClusterSimilarity(n_clusters=5, gamma=1.0, random_state=42))
])

# Linear Regression
lin_reg = make_pipeline(full_pipeline, LinearRegression())
lin_reg.fit(X_train, y_train)
auto_predictions = lin_reg.predict(X_train)
print("Linear Regression predictions (first 5):", auto_predictions[:5].round(2))

lin_rmse = np.sqrt(mean_squared_error(y_train, auto_predictions))
print("Linear Regression RMSE:", lin_rmse)

# Decision Tree Regressor
tree_reg = make_pipeline(full_pipeline, DecisionTreeRegressor(random_state=42))
tree_rmses = -cross_val_score(tree_reg, X_train, y_train,
                              scoring="neg_root_mean_squared_error",
                              cv=10)
print("Decision Tree Cross-Validation RMSE mean:", tree_rmses.mean())

# Random Forest Regressor
forest_reg = make_pipeline(full_pipeline, RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, X_train, y_train,
                                scoring="neg_root_mean_squared_error",
                                cv=10)
print("Random Forest Cross-Validation RMSE mean:", forest_rmses.mean())

# Grid Search for Random Forest
param_grid = [
    {'preprocessor__num__scaler__with_mean': [True, False],
     'cluster_similarity__n_clusters': [5, 8, 10],
     'random_forest__max_features': [4, 6, 8]},
    {'preprocessor__num__scaler__with_mean': [True, False],
     'cluster_similarity__n_clusters': [10, 15],
     'random_forest__max_features': [6, 8, 10]},
]

grid_search = GridSearchCV(forest_reg, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

# Evaluate the best model on the test set
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
print("Final model RMSE on test set:", final_rmse)

Linear Regression predictions (first 5): [15.48 23.23 25.55 23.3  23.23]
Linear Regression RMSE: 6.734483059188804
Decision Tree Cross-Validation RMSE mean: 5.06284403065758
Random Forest Cross-Validation RMSE mean: 4.482743155586975


ValueError: Invalid parameter 'cluster_similarity' for estimator Pipeline(steps=[('pipeline',
                 Pipeline(steps=[('preprocessor',
                                  ColumnTransformer(transformers=[('num',
                                                                   Pipeline(steps=[('imputer',
                                                                                    SimpleImputer(strategy='median')),
                                                                                   ('scaler',
                                                                                    StandardScalerClone())]),
                                                                   ['horsepower',
                                                                    'acceleration']),
                                                                  ('log',
                                                                   Pipeline(steps=[('imputer',
                                                                                    SimpleImputer(strategy='median')),
                                                                                   ('log',
                                                                                    FunctionTransformer(func=<ufunc 'log'>,
                                                                                                        inverse_func=<ufunc 'exp'>))]),
                                                                   ['displacement',
                                                                    'weight']),
                                                                  ('cat',
                                                                   Pipeline(steps=[('imputer',
                                                                                    SimpleImputer(strategy='most_frequent'))]),
                                                                   ['cylinders',
                                                                    'model_year',
                                                                    'origin'])])),
                                 ('cluster_similarity',
                                  ClusterSimilarity(random_state=42))])),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import mean_squared_error

# Load the auto mpg dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year", "origin", "name"]
df = pd.read_csv(url, names=column_names, na_values="?", comment="\t", sep=" ", skipinitialspace=True)

# Drop the 'name' column as it's not useful for our analysis
df = df.drop("name", axis=1)

# Split features and target
X = df.drop("mpg", axis=1)
y = df["mpg"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=5, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
    
    def fit(self, X, y=None):
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X)
        return self
    
    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

def ratio_pipeline(name=None):
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=lambda input_features: [name]),
        StandardScaler()
    )

log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log),
    StandardScaler()
)

default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent")
)

preprocessing = ColumnTransformer([
    ("horsepower_weight_ratio", ratio_pipeline("horsepower_weight_ratio"), ["horsepower", "weight"]),
    ("displacement_weight_ratio", ratio_pipeline("displacement_weight_ratio"), ["displacement", "weight"]),
    ("log", log_pipeline, ["horsepower", "weight", "displacement"]),
    ("cluster_similarity", ClusterSimilarity(n_clusters=5, gamma=1.0, random_state=42), 
     ["horsepower", "weight", "displacement", "acceleration"]),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
],
remainder=default_num_pipeline)


# Linear Regression
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)
auto_predictions = lin_reg.predict(X_train)
print("Linear Regression predictions (first 5):", auto_predictions[:5].round(2))

lin_rmse = np.sqrt(mean_squared_error(y_train, auto_predictions))
print("Linear Regression RMSE:", lin_rmse)



ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = make_pipeline(preprocessing,
DecisionTreeRegressor(random_state=42))
tree_reg.fit(X_train, y_train)
auto_predictions = tree_reg.predict(X_train)
print("Decision Tree predictions (first 5):", auto_predictions[:5].round(2))

tree_rmse = np.sqrt(mean_squared_error(y_train, auto_predictions))
print("Decision Tree RMSE:", tree_rmse)


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the auto mpg dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model_year", "origin", "name"]
df = pd.read_csv(url, names=column_names, na_values="?", comment="\t", sep=" ", skipinitialspace=True)
df = df.drop("name", axis=1)

# Split features and target
X = df.drop("mpg", axis=1)
y = df["mpg"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom StandardScalerClone
class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True):
        self.with_mean = with_mean

    def fit(self, X, y=None):
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        return self

    def transform(self, X):
        X_scaled = X.copy()
        if self.with_mean:
            X_scaled -= self.mean_
        return X_scaled / self.scale_

# ClusterSimilarity transformer
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1., random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None):
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X)
        return self

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

# Helper functions
def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

def ratio_pipeline(name=None):
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=lambda input_features: [name]),
        StandardScalerClone()
    )
# Preprocessing pipelines
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),  # Ensure missing values are imputed
    FunctionTransformer(np.log),
    StandardScalerClone()
)

default_num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),  # Ensure missing values are imputed
    StandardScalerClone()
)

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent")  # Ensure missing values are imputed
)

geo_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),  # Ensure missing values are imputed
    ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
)

# Main preprocessing pipeline
preprocessing = ColumnTransformer([
    ("horsepower_weight_ratio", ratio_pipeline("horsepower_weight_ratio"), ["horsepower", "weight"]),
    ("displacement_weight_ratio", ratio_pipeline("displacement_weight_ratio"), ["displacement", "weight"]),
    ("log", log_pipeline, ["horsepower", "weight", "displacement", "acceleration"]),
    ("geo", geo_pipeline, ["horsepower", "weight"]),  # Use the updated geo pipeline
    ("cat", cat_pipeline, ["cylinders", "model_year", "origin"]),
],
remainder=default_num_pipeline)

# Linear Regression
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X_train, y_train)
auto_predictions = lin_reg.predict(X_train)
print("Linear Regression predictions (first 5):", auto_predictions[:5].round(2))
lin_rmse = np.sqrt(mean_squared_error(y_train, auto_predictions))
print("Linear Regression RMSE:", lin_rmse)





Linear Regression predictions (first 5): [14.7  25.14 35.61 35.34 23.8 ]
Linear Regression RMSE: 3.062376431024096
Linear Regression predictions (first 5): [14.7  25.14 35.61 35.34 23.8 ]
Linear Regression RMSE: 3.062376431024096


In [11]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = make_pipeline(preprocessing,
DecisionTreeRegressor(random_state=42))
tree_reg.fit(X_train, y_train)
auto_predictions = tree_reg.predict(X_train)
print("Decision Tree predictions (first 5):", auto_predictions[:5].round(2))

tree_rmse = np.sqrt(mean_squared_error(y_train, auto_predictions))
print("Decision Tree RMSE:", tree_rmse)



Decision Tree predictions (first 5): [16.  27.  37.  36.1 23. ]
Decision Tree RMSE: 0.0


In [14]:
from sklearn.model_selection import cross_val_score
tree_rmses = -cross_val_score(tree_reg, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(tree_rmses).describe()

count    10.000000
mean      4.119293
std       0.501744
min       3.440703
25%       3.786239
50%       4.127104
75%       4.320897
max       5.038818
dtype: float64

In [15]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = make_pipeline(preprocessing,
RandomForestRegressor(random_state=42))
forest_rmses = -cross_val_score(forest_reg, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(forest_rmses).describe()

count    10.000000
mean      2.967987
std       0.559933
min       2.108097
25%       2.549907
50%       2.940715
75%       3.350417
max       3.940770
dtype: float64

In [18]:
from sklearn.model_selection import GridSearchCV
full_pipeline = Pipeline([

("preprocessing", preprocessing),
("random_forest", RandomForestRegressor(random_state=42)),
])

param_grid = [
    {'preprocessing__geo__clustersimilarity__n_clusters': [5, 8, 10],
     'random_forest__max_features': [4, 6, 8]},
    {'preprocessing__geo__clustersimilarity__n_clusters': [10, 15],
     'random_forest__max_features': [6, 8, 10]},
]

grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", -grid_search.best_score_)

# Evaluate the best model on the test set
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
print("Final model RMSE on test set:", final_rmse)

Best parameters: {'preprocessing__geo__clustersimilarity__n_clusters': 10, 'random_forest__max_features': 8}
Best RMSE: 3.074865394018532
Final model RMSE on test set: 2.118761979317167


  _data = np.array(data, dtype=dtype, copy=copy,
