<a href="https://www.kaggle.com/code/darvack/transformer-paper-regression?scriptVersionId=131150865" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/transformer/DatasetB.csv
/kaggle/input/transformer/DatasetA.csv


Here, we have loaded the data and set Furan as the label.
At first, we have used 25 percent of the dataset A as the test set to come up with a good model, and then use this model to test in the dataset B.

In [2]:
ds_A = pd.read_csv("/kaggle/input/transformer/DatasetA.csv")
ds_B = pd.read_csv("/kaggle/input/transformer/DatasetB.csv")

# Splitting train and test
from sklearn.model_selection import train_test_split
train_set_A, test_set_A = train_test_split(ds_A, test_size = 0.25, random_state = 11)

# Setting the labels
y_train_A = train_set_A['Furan']
y_test_A = test_set_A['Furan']

# Dropping the Furan and Health Index columns
X_train_A = train_set_A.drop(["Furan", "HI"], axis = 1)
X_test_A = test_set_A.drop(["Furan", "HI"], axis = 1)

# For DatasetB
y_B = ds_B['Furan']
X_B = ds_B.drop(["Furan", "HI"], axis = 1)

# The code below is for the second case, where we train the data for the whole
# Dataset A and test it on Dataset B
y_A = ds_A['Furan']
X_A = ds_A.drop(["Furan", "HI"], axis = 1)



In [3]:
#ds_A.hist(bins=50, figsize=(20,15))

The code below, drops the columns that we don't need, and only keeps the common features between dataset A and B.

In [4]:
X_train_A = X_train_A.drop(set(ds_A.columns) - set(ds_B.columns), axis=1)
X_test_A = X_test_A.drop(set(ds_A.columns) - set(ds_B.columns), axis=1)
X_A = X_A.drop(set(ds_A.columns) - set(ds_B.columns), axis=1)
X_B = X_B[X_train_A.columns]
X_train_A

Unnamed: 0,H2,Methane,Acetylene,Ethylene,Ethane,Water,Acid,BDV,IFT
109,12.2,53.50,6.9,127.4,48.0,3,0.043,83.0,20
566,30.2,0.00,0.0,2.6,1.1,3,0.005,84.0,39
410,45.6,18.20,0.0,1.6,1.7,5,0.005,87.0,30
316,19.7,38.50,0.0,2.7,41.6,7,0.005,50.0,32
678,11.0,7.60,0.0,0.3,1.6,3,0.005,61.0,42
...,...,...,...,...,...,...,...,...,...
269,13.7,5.10,0.0,0.4,1.1,1,0.005,94.0,36
337,32.9,3.77,0.0,0.6,2.4,6,0.005,79.0,32
91,22.8,3.30,0.0,4.9,3.0,11,0.140,88.0,16
80,61.2,27.30,0.0,25.6,20.8,9,0.099,70.0,17


In [5]:
useful_features = ['Ethylene', 'Ethane', 'Water', 'Acid', 'BDV', 'IFT', 'Methane']

# First case: Training using 75% of the data and testing on the remaining 25%

We have experimented a combination of different models in the ensemble.
Although the results were quite similar, we found that a combination of KNN, svm, mlp and logistic regression works best.
In the code below we have created a voting classifier consist of these models.

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

rf_reg = RandomForestRegressor(n_jobs = -1, max_depth = 50)
# svm_reg = SVR(kernel='linear')
# knn_reg = KNeighborsRegressor(n_neighbors=3)
xgb_reg = XGBRegressor(learning_rate=0.01, n_estimators=300, max_depth=3, subsample=0.7)
mlp_reg = MLPRegressor(hidden_layer_sizes=(100,), max_iter=10000)
ada_reg = AdaBoostRegressor(n_estimators=50, learning_rate=0.003)
lr_reg = LinearRegression()
bay_reg = BayesianRidge()
cat_reg = CatBoostRegressor(iterations=500, learning_rate=0.1, verbose = 0,
                           depth=6, loss_function='RMSE', random_seed=11)
pls_reg = PLSRegression(n_components=2)
rig_reg = Ridge(alpha=1.0)
lgb_reg = lgb.LGBMRegressor()
el_reg = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=0)
bag_reg = BaggingRegressor(
    DecisionTreeRegressor(), n_estimators=50, max_samples=100,
    max_features=1.0,
    bootstrap=True,
    n_jobs=-1)

voting_reg = VotingRegressor(weights = [1,1,3],
  estimators=[#('nn', mlp_reg),
              #('svc', svm_reg),
              #('knn', knn_reg), 
              ('ada', ada_reg),#('by', bay_reg),
              ('xgb', xgb_reg),('cat', cat_reg)
              #('rf', rf_reg),
              #('el', el_reg), ('lgb', lgb_reg)
             ])
voting_reg.fit(X_train_A, y_train_A)

Here is a comparison of different models and the voting classifier.

In [7]:
from sklearn.linear_model import ElasticNet
el_reg = ElasticNet(alpha=0.5, l1_ratio=1, random_state=0)

from sklearn.metrics import mean_squared_error
for reg in (#mlp_reg, #svm_reg,
            ada_reg,
            #knn_reg,
            xgb_reg, rf_reg, bay_reg, el_reg, pls_reg,cat_reg,lgb_reg,bag_reg,
            lr_reg, voting_reg):
    reg.fit(X_train_A, y_train_A)
    y_pred_A = reg.predict(X_test_A)
    y_pred_B = reg.predict(X_B)
    print(reg.__class__.__name__ + " for dataset A:", mean_squared_error(y_test_A, y_pred_A))
    print(reg.__class__.__name__ + " for dataset B:", mean_squared_error(y_B, y_pred_B))

AdaBoostRegressor for dataset A: 0.5414458671464615
AdaBoostRegressor for dataset B: 2.3154927180508746
XGBRegressor for dataset A: 0.495943901498028
XGBRegressor for dataset B: 1.7703121756309461
RandomForestRegressor for dataset A: 0.4854559922604044
RandomForestRegressor for dataset B: 2.3222361601335777
BayesianRidge for dataset A: 0.558618143195521
BayesianRidge for dataset B: 2.3797434845933285
ElasticNet for dataset A: 0.7413962903811444
ElasticNet for dataset B: 2.1852533234662816
PLSRegression for dataset A: 0.5973335039952
PLSRegression for dataset B: 2.072595530532023
CatBoostRegressor for dataset A: 0.4635471179701005
CatBoostRegressor for dataset B: 1.7404034155050647
LGBMRegressor for dataset A: 0.5597694965222691
LGBMRegressor for dataset B: 2.4020136676134727




BaggingRegressor for dataset A: 0.5187235871389771
BaggingRegressor for dataset B: 2.0390453792131176
LinearRegression for dataset A: 0.5584080811790763
LinearRegression for dataset B: 2.4580137584740305
VotingRegressor for dataset A: 0.47077835647044647
VotingRegressor for dataset B: 1.7260980660381686


In [8]:
xgb_reg.fit(X_train_A, np.array(y_train_A).ravel())
y_pred_A = xgb_reg.predict(X_test_A)
y_pred_B = xgb_reg.predict(X_B)
print(xgb_reg.__class__.__name__ + " for dataset A:", mean_squared_error(y_test_A, y_pred_A))
print(xgb_reg.__class__.__name__ + " for dataset B:", mean_squared_error(y_B, y_pred_B))

XGBRegressor for dataset A: 0.495943901498028
XGBRegressor for dataset B: 1.7703121756309461


In [9]:
cat_reg = CatBoostRegressor(iterations=1000, learning_rate=0.1, verbose = 0,
                           depth=6, loss_function='RMSE', random_seed=11)
cat_reg.fit(X_train_A, y_train_A)
y_pred_A = cat_reg.predict(X_test_A)
y_pred_B = cat_reg.predict(X_B)
print(cat_reg.__class__.__name__ + " for dataset A:", mean_squared_error(y_test_A, y_pred_A))
print(cat_reg.__class__.__name__ + " for dataset B:", mean_squared_error(y_B, y_pred_B))

CatBoostRegressor for dataset A: 0.46435366552410456
CatBoostRegressor for dataset B: 1.7373496702380593


# Second case: Training using all of the data from Dataset A

So far we have used 75% of Dataset A to train the data and 25% to test it.
Here, we used all of the data from Dataset A to train, and then test it on Dataset B.

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

rf_reg = RandomForestRegressor(n_jobs = -1, max_depth = 50)
# svm_reg = SVR(kernel='linear')
# knn_reg = KNeighborsRegressor(n_neighbors=3)
xgb_reg = XGBRegressor(learning_rate=0.01, n_estimators=300, max_depth=3, subsample=0.7)
mlp_reg = MLPRegressor(hidden_layer_sizes=(100,), max_iter=10000)
ada_reg = AdaBoostRegressor(n_estimators=50, learning_rate=0.003)
lr_reg = LinearRegression()
bay_reg = BayesianRidge()
cat_reg = CatBoostRegressor(iterations=500, learning_rate=0.1, verbose = 0,
                           depth=6, loss_function='RMSE', random_seed=11)
pls_reg = PLSRegression(n_components=2)
rig_reg = Ridge(alpha=1.0)
lgb_reg = lgb.LGBMRegressor()
el_reg = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=0)
bag_reg = BaggingRegressor(
    DecisionTreeRegressor(), n_estimators=50, max_samples=100,
    max_features=1.0,
    bootstrap=True,
    n_jobs=-1)

voting_reg = VotingRegressor(weights = [1,1,3],
  estimators=[#('nn', mlp_reg),
              #('svc', svm_reg),
              #('knn', knn_reg), 
              ('ada', ada_reg),#('by', bay_reg),
              ('xgb', xgb_reg),('cat', cat_reg)
              #('rf', rf_reg),
              #('el', el_reg), ('lgb', lgb_reg)
             ])
voting_reg.fit(X_A, y_A)

In [11]:
from sklearn.metrics import mean_squared_error
for clf in (mlp_reg, #svm_reg,
            ada_reg,
            #knn_reg,
            xgb_reg, rf_reg,
            lr_reg, voting_reg):
    clf.fit(X_A, y_A)
    y_pred_B = clf.predict(X_B)
    print(clf.__class__.__name__ + " for dataset A:", mean_squared_error(y_test_A, y_pred_A))
    print(clf.__class__.__name__ + " for dataset B:", mean_squared_error(y_B, y_pred_B))

MLPRegressor for dataset A: 0.46435366552410456
MLPRegressor for dataset B: 4.366969684933045
AdaBoostRegressor for dataset A: 0.46435366552410456
AdaBoostRegressor for dataset B: 1.9347730671284422
XGBRegressor for dataset A: 0.46435366552410456
XGBRegressor for dataset B: 1.7474177678437246
RandomForestRegressor for dataset A: 0.46435366552410456
RandomForestRegressor for dataset B: 2.0169744091528443
LinearRegression for dataset A: 0.46435366552410456
LinearRegression for dataset B: 2.42436481314223
VotingRegressor for dataset A: 0.46435366552410456
VotingRegressor for dataset B: 1.7515023319108716


In [12]:
xgb_reg.fit(X_train_A, np.array(y_train_A).ravel())
y_pred_B = xgb_reg.predict(X_B)
print(clf.__class__.__name__ + " for dataset B:", mean_squared_error(y_B, y_pred_B))

VotingRegressor for dataset B: 1.7703121756309461


In [13]:
cat_reg.fit(X_A, y_A)
y_pred_B = cat_reg.predict(X_B)
print(cat_reg.__class__.__name__ + " for dataset B:", mean_squared_error(y_B, y_pred_B))

CatBoostRegressor for dataset B: 1.8183769898413695
