In [4]:
class VIFSelector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=5.0):
        self.threshold = threshold
        self.features_to_keep = None
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        # Store feature names from the training data
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns
        else:
            self.feature_names_in_ = [f'feature_{i}' for i in range(X.shape[1])]
        
        # Convert X to DataFrame if it is a NumPy array
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_names_in_)
        
        # Calculate VIF for each feature
        vif_data = pd.DataFrame()
        vif_data["feature"] = X.columns
        vif_data["VIF"] = [sm.OLS(X[col], sm.add_constant(X.drop(columns=[col]))).fit().rsquared for col in X.columns]
        vif_data["VIF"] = [1 / (1 - rsq) if rsq < 1 else np.inf for rsq in vif_data["VIF"]]
        
        # Keep features with VIF below the threshold
        self.features_to_keep = vif_data[vif_data["VIF"] < self.threshold]["feature"].tolist()
        return self

    def transform(self, X):
        # Convert X to DataFrame if it is a NumPy array
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_names_in_)
        
        # Return DataFrame with only the selected features
        return X[self.features_to_keep]

    def attrib(self, X):
        # Convert X to DataFrame if it is a NumPy array
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.feature_names_in_)
        
        # Return selected features and their VIF values
        return {"features_to_keep": self.features_to_keep, "VIF Values": X[self.features_to_keep]}


NameError: name 'BaseEstimator' is not defined

In [1]:
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.impute import SimpleImputer

# Assuming VIFSelector is already defined and imported

# Load the dataset
df_scaled = pd.read_excel('tri_q4_scaled.xlsx')

# Filter for Tri_Q4 subset
tri_q4_df = df_scaled[df_scaled['_Alkene_Type'] == 'Tri_Q4']

# Prepare features and target
X = tri_q4_df.drop(columns=['Rxn ID', 'Reactant ID', 'Catalyst ID', 'ddG', '_Alkene_Type', 'ee', 'Scaled ee', 'Scaled ΔΔG (J/mol)'])
y = tri_q4_df['Scaled ΔΔG (J/mol)']

# Split the data into training and testing sets (80% train, 20% test) randomly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('vif', VIFSelector()),  # Apply VIF selector to remove multicollinear features
    ('lr', LinearRegression())  # Linear regression model
])

# Set up cross-validation (randomly splitting training data for 5-fold CV)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation on the training set only (no test set leakage)
from sklearn.model_selection import cross_val_score

# Evaluate using cross-validation and calculate mean cross-validated R² and MAE
cv_r2_scores = cross_val_score(pipe, X_train, y_train, cv=kf, scoring='r2')
cv_mae_scores = cross_val_score(pipe, X_train, y_train, cv=kf, scoring='neg_mean_absolute_error')

# Now fit the model on the entire training set and predict on the test set
pipe.fit(X_train, y_train)
y_test_pred = pipe.predict(X_test)

# Calculate R² and MAE for the test set
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Output results
print(f"Mean Cross-validated R² on Training Set: {cv_r2_scores.mean():.4f}")
print(f"Mean Cross-validated MAE on Training Set: {-cv_mae_scores.mean():.4f}")  # Negative MAE due to scoring convention
print(f"R² on Test Set: {test_r2:.4f}")
print(f"MAE on Test Set: {test_mae:.4f}")


NameError: name 'VIFSelector' is not defined