In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LinearRegression

df = pd.read_excel('Data TA.xlsx')
X = df[['API', 'Res Temp']].values
y = df['Viscosity'].values


In [26]:
# Normalization

y_scaled = y.reshape(-1, 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaled = scaler.fit_transform(y_scaled)

In [37]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((83, 2), (21, 2), (83, 1), (21, 1))

In [38]:
# Define your base learners
base_learner1 = RandomForestRegressor()
base_learner2 = XGBRegressor()

# Define the hyperparameter search space for Random Forest
rf_param_grid = {
    'n_estimators': [40, 80, 100, 120, 140, 160, 180, 200],
    'max_depth': [40, 60, 80, 100, 120],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Define the hyperparameter search space for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Perform randomized search for Random Forest
rf_random_search = RandomizedSearchCV(
    base_learner1,
    param_distributions=rf_param_grid,
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs = -1
)

# Perform randomized search for XGBoost
xgb_random_search = RandomizedSearchCV(
    base_learner2,
    param_distributions=xgb_param_grid,
    n_iter=50,
    cv=5,
    random_state=42
)

In [40]:
rf_random_search.fit(X_train, y_train.ravel())
print ('Best Parameters: ', rf_random_search.best_params_, ' \n')

Best Parameters:  {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 80}  



In [41]:
xgb_random_search.fit(X_train, y_train.ravel())
print ('Best Parameters: ', xgb_random_search.best_params_, ' \n')

Best Parameters:  {'subsample': 0.9, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.9}  



In [42]:
# New base learner
random_forest = RandomForestRegressor(n_estimators = 100, min_samples_split = 2, min_samples_leaf = 1, max_features = 'log2', max_depth = 80)
xgboost = XGBRegressor(subsample = 0.9, n_estimators = 100, max_depth = 3, learning_rate = 0.1, colsample_bytree = 0.9)

# Create an instance of the SuperLearner class
super_learner = SuperLearner(scorer=mean_squared_error, random_state=42, verbose=2)

# Add the base learners to the Super Learner
super_learner.add(random_forest)
super_learner.add(xgboost)

# Add meta learner
#super_learner.add_meta(LinearRegression())

# Fit the Super Learner
super_learner.fit(X_train, y_train.ravel())

# Make predictions on the test set
y_pred = super_learner.predict(X_test)
y_pred = y_pred.reshape(-1, 1)

# Denormalization
y_pred = scaler.inverse_transform(y_pred)
y_test = scaler.inverse_transform(y_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R2:", r2)


Fitting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Fit complete                        | 00:00:01

Predicting 2 layers
Processing layer-1             done | 00:00:00
Processing layer-2             done | 00:00:00
Predict complete                    | 00:00:00
Mean Squared Error: 0.8804213324850058
R2: 0.050703262895064105
