# Using PCA and Pseudo Labels to reduce RMSE

Using only ***meta features*** and PCA to predict the OOF values of Pawpularity.

I have used different ***regressors*** to ensemble as a ***level 1*** model and generate the Pawpularity Score.

Used those Pawpularity scores as ***Pseudo Labels*** to train a classifier model and generate labels. 

Compared the change in RMSE between the Level 1 and Level 2 models.


## Please <span style="color:red">upvote</span> if you like :)
And feel free to comment your opinion and anything you want :) <br>
This will motivate me to do more experiments :)

In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVR
import optuna
from sklearn.linear_model import LinearRegression, LogisticRegression
import plotly.graph_objects as go
import statistics

In [None]:
# Load the training table; it carries a `kfold` column that the cells below use for cross-validation.
df = pd.read_csv('../input/same-old-creating-folds/train_5folds.csv')
# Show (rows, columns) as the cell output.
df.shape

In [None]:
# Preview the first rows of the training table.
df.head()

In [None]:
# Feature matrix: everything except the identifier, the target and the fold id.
X = df.drop(columns=['Id', 'Pawpularity', 'kfold'])
# Target: the Pawpularity score.
y = df.Pawpularity

In [None]:
# Heat-map of the pairwise correlations between the features in X.
correlations = X.corr()
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1, cmap='RdPu')
fig.colorbar(cax)
# One tick per column of the correlation matrix, so ticks and labels stay in
# sync whatever the number of features (previously hard-coded to 12).
ticks = np.arange(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(correlations.columns)
ax.set_yticklabels(correlations.columns)
plt.show()

In [None]:
# Fit three tree-based regressors on the full training set and keep each
# model's feature importances for the averaged ranking plotted below.
model = RandomForestRegressor().fit(X, y)
importance1 = model.feature_importances_

model = ExtraTreesRegressor().fit(X, y)
importance2 = model.feature_importances_

model = XGBRegressor().fit(X, y)
importance3 = model.feature_importances_

In [None]:
# Average the three importance vectors and plot them as a horizontal bar chart,
# most important feature at the top.
importance = (importance1 + importance2 + importance3) / 3
d = pd.DataFrame({'imp': importance, 'f': X.columns})
d = d.sort_values('imp', ascending=False)

fig, ax = plt.subplots(figsize=(20, 7))

ax.barh(d.f, d.imp, color='crimson')
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)
for s in ['top', 'bottom', 'left', 'right']:
    ax.spines[s].set_visible(False)

# Pass the on/off flag positionally: its keyword was renamed from `b` to
# `visible` in newer Matplotlib, so `b=True` fails there.
ax.grid(True, color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)
ax.invert_yaxis()
plt.show()


# Linear Regression

In [None]:
# 5-fold cross-validated RMSE of a plain linear regression on the raw features.
rmse = 0

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = df[df['kfold'] != fold]
    valid = df[df['kfold'] == fold]

    # Plain arrays for both splits so fit and predict receive the same input type.
    xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    ytrain = train['Pawpularity'].values
    ytest = valid['Pawpularity'].values

    lr = LinearRegression()
    lr.fit(xtrain, ytrain)
    ypred = lr.predict(xtest)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))

    # Accumulate the mean of the per-fold RMSEs.
    rmse += rmse_fold / 5
print('LR rmse = ', rmse)

# Linear Regression with PCA

In [None]:
# 5-fold cross-validated RMSE of a linear regression fitted on 2 PCA components.
rmse = 0

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = df[df['kfold'] != fold]
    valid = df[df['kfold'] == fold]

    # Plain arrays for both splits so fit and transform/predict receive the same input type.
    xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    ytrain = train['Pawpularity'].values
    ytest = valid['Pawpularity'].values

    # The projection is fitted on the training split only and then applied to
    # the validation split, so no validation information leaks into PCA.
    pca = PCA(n_components=2)
    xtrain = pca.fit_transform(xtrain)
    xtest = pca.transform(xtest)

    lr = LinearRegression()
    lr.fit(xtrain, ytrain)
    ypred = lr.predict(xtest)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))

    # Accumulate the mean of the per-fold RMSEs.
    rmse += rmse_fold / 5
# Label the output as the PCA variant so it is not confused with the plain LR cell.
print('LR rmse with PCA = ', rmse)

# Using SVR

In [None]:
# 5-fold cross-validated RMSE of a default SVR on the raw features.
rmse = 0

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = df[df['kfold'] != fold]
    valid = df[df['kfold'] == fold]

    # Plain arrays for both splits so fit and predict receive the same input type.
    xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    ytrain = train['Pawpularity'].values
    ytest = valid['Pawpularity'].values

    svr = SVR()
    svr.fit(xtrain, ytrain)
    ypred = svr.predict(xtest)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))

    # Accumulate the mean of the per-fold RMSEs.
    rmse += rmse_fold / 5

print('='*100)
print('SVR rmse = ', rmse)
print('='*100)

# Using SVR with PCA

In [None]:
# 5-fold cross-validated RMSE of a default SVR fitted on 2 PCA components.
rmse = 0

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = df[df['kfold'] != fold]
    valid = df[df['kfold'] == fold]

    # Plain arrays for both splits so fit and transform/predict receive the same input type.
    xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    ytrain = train['Pawpularity'].values
    ytest = valid['Pawpularity'].values

    # PCA is fitted on the training split only and then applied to the validation split.
    pca = PCA(n_components=2)
    xtrain = pca.fit_transform(xtrain)
    xtest = pca.transform(xtest)

    svr = SVR()
    svr.fit(xtrain, ytrain)
    ypred = svr.predict(xtest)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))

    # Accumulate the mean of the per-fold RMSEs.
    rmse += rmse_fold / 5

print('='*100)
# Label the output as the PCA variant so it is not confused with the plain SVR cell.
print('SVR rmse with PCA = ', rmse)
print('='*100)

# Ensembling

In [None]:
# 5-fold cross-validated RMSE of an equal-weight average of four regressors
# (random forest, extra trees, XGBoost, linear regression) on the raw features.
rmse = 0

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = df[df['kfold'] != fold]
    valid = df[df['kfold'] == fold]

    # Plain arrays for both splits so fit and predict receive the same input type.
    xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    ytrain = train['Pawpularity'].values
    ytest = valid['Pawpularity'].values

    rf = RandomForestRegressor()
    et = ExtraTreesRegressor()
    xgb = XGBRegressor()
    lr = LinearRegression()

    rf.fit(xtrain, ytrain)
    et.fit(xtrain, ytrain)
    xgb.fit(xtrain, ytrain)
    lr.fit(xtrain, ytrain)

    # Equal-weight blend of the four models' validation predictions.
    ypred1 = rf.predict(xtest)
    ypred2 = et.predict(xtest)
    ypred3 = xgb.predict(xtest)
    ypred4 = lr.predict(xtest)
    ypred = (ypred1 + ypred2 + ypred3 + ypred4) / 4

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))

    # Accumulate the mean of the per-fold RMSEs.
    rmse += rmse_fold / 5

print('='*100)
print('Ensemble rmse = ', rmse)
print('='*100)

# Ensembling with PCA

In [None]:
# 5-fold cross-validated RMSE of the four-model ensemble fitted on 5 PCA
# components. The result is also kept in `rmse_i` as the level-1 baseline.
rmse = 0

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = df[df['kfold'] != fold]
    valid = df[df['kfold'] == fold]

    # Plain arrays for both splits so fit and transform/predict receive the same input type.
    xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    ytrain = train['Pawpularity'].values
    ytest = valid['Pawpularity'].values

    # PCA is fitted on the training split only and then applied to the validation split.
    pca = PCA(n_components=5)
    xtrain = pca.fit_transform(xtrain)
    xtest = pca.transform(xtest)

    rf = RandomForestRegressor()
    et = ExtraTreesRegressor()
    xgb = XGBRegressor()
    lr = LinearRegression()

    rf.fit(xtrain, ytrain)
    et.fit(xtrain, ytrain)
    xgb.fit(xtrain, ytrain)
    lr.fit(xtrain, ytrain)

    # Equal-weight blend of the four models' validation predictions.
    ypred1 = rf.predict(xtest)
    ypred2 = et.predict(xtest)
    ypred3 = xgb.predict(xtest)
    ypred4 = lr.predict(xtest)
    ypred = (ypred1 + ypred2 + ypred3 + ypred4) / 4

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))

    # Accumulate the mean of the per-fold RMSEs.
    rmse += rmse_fold / 5

rmse_i = rmse

print('='*100)
print('Ensemble rmse with PCA = ', rmse)
print('='*100)

## Choosing the optimal ***n_components*** of PCA for the final ensemble 

Using optuna to optimise the number of components of PCA

In [None]:
def objective(trial):
    """Optuna objective: cross-validated RMSE of the PCA + four-model ensemble.

    Args:
        trial: Optuna trial used to sample ``n_components`` (integer in 1..10).

    Returns:
        Mean of the per-fold RMSEs over the 5 folds stored in ``df['kfold']``.
    """
    components = trial.suggest_int('n_components', 1, 10)

    rmse = 0

    for fold in range(5):
        # Hold out the current fold for validation, train on the remaining ones.
        train = df[df['kfold'] != fold]
        valid = df[df['kfold'] == fold]

        # Plain arrays for both splits so fit and transform/predict receive the same input type.
        xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
        xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
        ytrain = train['Pawpularity'].values
        ytest = valid['Pawpularity'].values

        # Use the sampled value: with a fixed n_components every trial would
        # evaluate the same pipeline and the search would be meaningless.
        pca = PCA(n_components=components)
        xtrain = pca.fit_transform(xtrain)
        xtest = pca.transform(xtest)

        rf = RandomForestRegressor()
        et = ExtraTreesRegressor()
        xgb = XGBRegressor()
        lr = LinearRegression()

        rf.fit(xtrain, ytrain)
        et.fit(xtrain, ytrain)
        xgb.fit(xtrain, ytrain)
        lr.fit(xtrain, ytrain)

        # Equal-weight blend of the four models' validation predictions.
        ypred1 = rf.predict(xtest)
        ypred2 = et.predict(xtest)
        ypred3 = xgb.predict(xtest)
        ypred4 = lr.predict(xtest)
        ypred = (ypred1 + ypred2 + ypred3 + ypred4) / 4

        # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in
        # recent scikit-learn releases, this form works on every version.
        rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))

        rmse += rmse_fold / 5

    return rmse

#### This step is gonna take some time. Don't worry :)       

In [None]:
# Raise Optuna's log threshold to WARNING so per-trial progress messages are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Search for the parameters that minimise the value returned by `objective`, over 50 trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

In [None]:
# Pull the first (and, for this objective, only) tuned value out of the study.
best_params = study.best_params
n = next(iter(best_params.values()))
print(f'Optimun number of n_components for PCA in the ensemble is : {n}')

# Using optimum ***n_components*** to create Pseudo labels

In [None]:
# Load the competition test table and keep only the feature columns (no Id).
test_df = pd.read_csv('../input/petfinder-pawpularity-score/test.csv').drop(columns=['Id'])

In [None]:
# Level-1 model: PCA (n components) + four-model ensemble, evaluated with
# 5-fold CV. Produces
#   * `pseudo_labels`: the out-of-fold predictions for every training row
#     ('Pawpularity_pred', plus 'pseudo_labels' = prediction / 100),
#   * `preds`: one array of test-set predictions per fold.
rmse = 0
preds = []
fold_frames = []

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = df[df['kfold'] != fold]
    valid = df[df['kfold'] == fold]
    xpred = test_df.values

    # Plain arrays for both splits so fit and transform/predict receive the same input type.
    xtrain = train.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    xtest = valid.drop(['Id', 'Pawpularity', 'kfold'], axis=1).values
    ytrain = train['Pawpularity'].values
    ytest = valid['Pawpularity'].values

    # PCA is fitted on the training split only, then applied to validation and test rows.
    pca = PCA(n_components=n)
    xtrain = pca.fit_transform(xtrain)
    xtest = pca.transform(xtest)
    xpred = pca.transform(xpred)

    rf = RandomForestRegressor()
    et = ExtraTreesRegressor()
    xgb = XGBRegressor()
    lr = LinearRegression()

    rf.fit(xtrain, ytrain)
    et.fit(xtrain, ytrain)
    xgb.fit(xtrain, ytrain)
    lr.fit(xtrain, ytrain)

    # Equal-weight blend on the held-out fold (out-of-fold predictions).
    ypred1 = rf.predict(xtest)
    ypred2 = et.predict(xtest)
    ypred3 = xgb.predict(xtest)
    ypred4 = lr.predict(xtest)
    ypred = (ypred1 + ypred2 + ypred3 + ypred4) / 4

    # Same blend on the test rows, using this fold's models.
    ypred1_ = rf.predict(xpred)
    ypred2_ = et.predict(xpred)
    ypred3_ = xgb.predict(xpred)
    ypred4_ = lr.predict(xpred)
    ypred_ = (ypred1_ + ypred2_ + ypred3_ + ypred4_) / 4

    preds.append(ypred_)
    folddf = valid.copy()
    folddf['Pawpularity_pred'] = ypred
    # Pseudo labels are the predictions rescaled by 1/100.
    folddf['pseudo_labels'] = ypred / 100
    fold_frames.append(folddf)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))
    rmse += rmse_fold / 5

# Concatenate once at the end instead of growing from an empty DataFrame.
pseudo_labels = pd.concat(fold_frames)

print('='*100)
print('Ensemble rmse with PCA = ', rmse)
print('='*100)

In [None]:
# Per-row pseudo label for the test set: average the test predictions of the
# five fold models (collected in `preds`) and rescale by 1/100.
# The previous expression averaged the last fold's predictions over all rows,
# which assigned one identical scalar to every test row.
test_df['pseudo_labels'] = np.mean(preds, axis=0) / 100

In [None]:
# Preview the test table with its new `pseudo_labels` column.
test_df.head()

### Visualisation of the pseudolabels

In [None]:
# Preview the out-of-fold predictions and pseudo labels of the training rows.
pseudo_labels.head()

In [None]:
# Summary statistics of the true scores next to those of the level-1 predictions.
multi_table = pd.DataFrame({
    'Orginal Pawpularity': pseudo_labels['Pawpularity'].describe(),
    'Predicted Pawpularity': pseudo_labels['Pawpularity_pred'].describe(),
})

In [None]:
from IPython.core.display import HTML

def multi_table(table_list):
    """Render several tables side by side in one HTML table row.

    Args:
        table_list: iterable of objects exposing ``_repr_html_()``
            (for example pandas DataFrames).

    Returns:
        An IPython ``HTML`` object with one ``<td>`` cell per input table.
    """
    # Build one cell per table from its own HTML representation.
    cells = ''.join('<td>' + table._repr_html_() + '</td>' for table in table_list)
    # The style attribute of <table> is now properly closed; the original
    # string was missing the closing quote and produced malformed markup.
    return HTML(
        '<table style="margin: 0px auto;"><tr style="background-color:ash;">' +
        cells +
        '</tr></table>')

# Show the summary statistics of the true and predicted scores side by side.
multi_table([pseudo_labels[column].describe().to_frame() for column in ('Pawpularity', 'Pawpularity_pred')])

In [None]:
# Box plots of the true scores and the level-1 predictions.
fig = go.Figure()
for label, column in (('True', 'Pawpularity'), ('Predicted', 'Pawpularity_pred')):
    fig.add_trace(go.Box(y=pseudo_labels[column], name=label))

fig.show()

In [None]:
# Line chart of true vs. predicted scores per Id, with horizontal reference
# lines for the mean and the median of each series.
layout = go.Layout(autosize=False, width=1200, height=700)
fig = go.Figure(layout=layout)

ids = pseudo_labels['Id']
actual = pseudo_labels['Pawpularity']
predicted = pseudo_labels['Pawpularity_pred']
n_rows = len(pseudo_labels)

# The two raw series.
fig.add_trace(go.Scatter(
    x=ids, y=actual, name='Pawpularity', mode='lines',
    opacity=0.5, marker=dict(size=3)))
fig.add_trace(go.Scatter(
    x=ids, y=predicted, name='Prediction', mode='lines',
    opacity=0.6, marker=dict(size=3, color='crimson')))

# Constant lines at the mean of each series.
fig.add_trace(go.Scatter(
    x=ids, y=[np.mean(actual)] * n_rows, name='Pawpularity Mean', mode='lines'))
fig.add_trace(go.Scatter(
    x=ids, y=[np.mean(predicted)] * n_rows, name='Pred Mean', mode='lines',
    marker=dict(color='black', line=dict(width=200))))

# Constant lines at the median of each series.
fig.add_trace(go.Scatter(
    x=ids, y=[statistics.median(actual)] * n_rows, name='Pawpularity Median', mode='lines'))
fig.add_trace(go.Scatter(
    x=ids, y=[statistics.median(predicted)] * n_rows, name='Pred Median', mode='lines'))

fig.update_xaxes(visible=False, showticklabels=False)

fig.show()

In [None]:
# Scatter chart of true vs. predicted scores per Id.
layout = go.Layout(autosize=False, width=1200, height=700)
fig = go.Figure(layout=layout)

ids = pseudo_labels['Id']
fig.add_trace(go.Scatter(
    x=ids, y=pseudo_labels['Pawpularity'], name='Pawpularity', mode='markers',
    opacity=0.7, marker=dict(size=3)))
fig.add_trace(go.Scatter(
    x=ids, y=pseudo_labels['Pawpularity_pred'], name='Prediction', mode='markers',
    opacity=0.9, marker=dict(size=3, color='crimson')))

fig.update_xaxes(visible=False, showticklabels=False)

fig.show()

## Important Observations

* The predicted output has less variance and is mostly scattered near the mean
* This draws to a conclusion that **meta-features** aren't enough to get the best results

## Let's dive into using ***pseudo-labels*** for training and see if it actually decreases the RMSE

In [None]:
# Preview the pseudo-labelled training rows again before building the level-2 model.
pseudo_labels.head()

# Predicting with pseudo labels with no leak

Using the classifier instead of the Regressor models. 
We are scaling the **Pawpularity** in the range of **0** and **1**.

In [None]:
from sklearn import model_selection

In [None]:
def create_folds(data, num_splits):
    """Assign a stratified fold id to every row, based on binned pseudo labels.

    The continuous ``pseudo_labels`` column is cut into
    ``floor(1 + log2(len(data)))`` equal-width bins, and those bins are used
    as the stratification target of a shuffled ``StratifiedKFold``
    (``random_state=42``).

    Note: ``data`` is modified in place - it gains a ``kfold`` and a ``bins``
    column. The returned frame is a copy of it without ``bins``.

    Args:
        data: DataFrame with a ``pseudo_labels`` column.
        num_splits: number of folds.

    Returns:
        DataFrame with a ``kfold`` column holding values ``0..num_splits-1``.
    """
    data["kfold"] = -1
    num_bins = int(np.floor(1 + np.log2(len(data))))

    data["bins"] = pd.cut(data["pseudo_labels"], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    # `split` yields positional row indices, so write through `.iloc`; using
    # `.loc` is only correct when the index happens to be 0..n-1.
    kfold_position = data.columns.get_loc("kfold")
    for f, (_, v_) in enumerate(kf.split(X=data, y=data["bins"].values)):
        data.iloc[v_, kfold_position] = f

    data = data.drop("bins", axis=1)

    return data

In [None]:
# Split the pseudo-labelled test rows into 5 folds for the level-2 model.
test_df_folds = create_folds(test_df, 5)

In [None]:
# Preview the test rows with their fold assignment.
test_df_folds.head()

In [None]:
# Level-2 model: an XGBoost classifier trained on the test-set pseudo labels,
# evaluated with 5-fold CV against those same pseudo labels (scaled by 100).
# `final` collects the out-of-fold predictions, in fold order.
# NOTE(review): the pseudo labels are continuous values, not class ids; newer
# XGBoost releases may reject such targets for a classifier - confirm against
# the installed version.
rmse = 0
rmse_r = 0
fold_frames = []

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = test_df_folds[test_df_folds['kfold'] != fold]
    valid = test_df_folds[test_df_folds['kfold'] == fold]

    # Plain arrays for both splits so fit and predict receive the same input type.
    xtrain = train.drop(['pseudo_labels', 'kfold'], axis=1).values
    xtest = valid.drop(['pseudo_labels', 'kfold'], axis=1).values
    ytrain = train['pseudo_labels'].values
    ytest = valid['pseudo_labels'].values * 100

    # `verbosity=0` already silences XGBoost; the obsolete `silent` flag is dropped.
    xgbc = XGBClassifier(verbosity=0)

    xgbc.fit(xtrain, ytrain)
    # Scale the predictions back by 100 to compare with `ytest`.
    ypred = xgbc.predict(xtest) * 100
    y_pred_rounded = np.round(ypred)

    folddf = valid.copy()
    folddf['Pawpularity_pred'] = ypred
    fold_frames.append(folddf)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))
    rmse_fold_r = np.sqrt(mean_squared_error(ytest, y_pred_rounded))

    rmse += rmse_fold / 5
    rmse_r += rmse_fold_r / 5

# Concatenate once at the end instead of growing from an empty DataFrame.
final = pd.concat(fold_frames)

print('='*100)
print('XGBoost RMSE trained on pseudo-lables = ', rmse)
print('='*100)
print('XGBoost RMSE trained on pseudo-lables (rounded) = ', rmse_r)
print('='*100)

In [None]:
# Preview the level-2 out-of-fold predictions.
final.head()

# Comparison

Let us now visually compare the **true values** with the **level 1** predictions and the **level 2** predictions

In [None]:
# fig = go.Figure()
# fig.add_trace(go.Box(y=pseudo_labels['Pawpularity'], name = 'True',))
# fig.add_trace(go.Box(y=pseudo_labels['Pawpularity_pred'], name = 'Level 1'))
# fig.add_trace(go.Box(y=final['Pawpularity_pred'], name = 'Level 2'))

# fig.show()

In [None]:
# layout = go.Layout(
#     autosize=False,
#     width=1200,
#     height=700)

# fig = go.Figure(layout=layout)

# fig.add_trace(go.Scatter(x=pseudo_labels['Id'], y=pseudo_labels['Pawpularity'], name='True', mode='markers', opacity=0.6, marker={'size':3}))
# fig.add_trace(go.Scatter(x=pseudo_labels['Id'], y=pseudo_labels['Pawpularity_pred'], name='Level 1', mode='markers', opacity=0.8, marker={'size':3}))
# fig.add_trace(go.Scatter(x=final['Id'], y=final['Pawpularity_pred'], name='Level 2', mode='markers', opacity=1, marker={'size':3}))

# fig.update_xaxes(visible=False, showticklabels=False)

# fig.show()

In [None]:
# layout = go.Layout(
#     autosize=False,
#     width=1200,
#     height=700)

# fig = go.Figure(layout=layout)

# fig.add_trace(go.Scatter(x=pseudo_labels['Id'], y=pseudo_labels['Pawpularity'], name='True', mode='lines', opacity=0.6, marker={'size':3}))
# fig.add_trace(go.Scatter(x=pseudo_labels['Id'], y=pseudo_labels['Pawpularity_pred'], name='Level 1', mode='lines', opacity=0.8, marker={'size':3}))
# fig.add_trace(go.Scatter(x=final['Id'], y=final['Pawpularity_pred'], name='Level 2', mode='lines', opacity=1, marker={'size':3}))

# fig.update_xaxes(visible=False, showticklabels=False)

# fig.show()

In [None]:
# from IPython.core.display import HTML

# def multi_table(table_list):
#     ''' Acceps a list of IpyTable objects and returns a table which contains each IpyTable in a cell
#     '''
#     return HTML(
#         '<table style="margin: 0px auto;><tr style="background-color:ash;">' + 
#         ''.join(['<td>' + table._repr_html_() + '</td>' for table in table_list]) +
#         '</tr></table>')

# multi_table([pd.DataFrame(pseudo_labels['Pawpularity'].describe()), pd.DataFrame(pseudo_labels['Pawpularity_pred'].describe()), pd.DataFrame(final['Pawpularity_pred'].describe())])

# Conclusion

* The level 1 predictions have lower variance and fewer outliers than the true values
* The level 2 predictions have even lower variance than the level 1 predictions
* Rounding off the values increases the RMSE

# Now let's use PCA and see what happens

In [None]:
# Level-2 model with PCA: same setup as the previous level-2 cell, but the
# features are first projected onto n principal components.
# `final2` collects the rounded out-of-fold predictions, in fold order.
# NOTE(review): the pseudo labels are continuous values, not class ids; newer
# XGBoost releases may reject such targets for a classifier - confirm against
# the installed version.
rmse = 0
rmse_r = 0
fold_frames = []

for fold in range(5):
    # Hold out the current fold for validation, train on the remaining ones.
    train = test_df_folds[test_df_folds['kfold'] != fold]
    valid = test_df_folds[test_df_folds['kfold'] == fold]

    # Plain arrays for both splits so fit and transform/predict receive the same input type.
    xtrain = train.drop(['pseudo_labels', 'kfold'], axis=1).values
    xtest = valid.drop(['pseudo_labels', 'kfold'], axis=1).values
    ytrain = train['pseudo_labels'].values
    ytest = valid['pseudo_labels'].values * 100

    # PCA is fitted on the training split only and then applied to the validation split.
    pca = PCA(n_components=n)
    xtrain = pca.fit_transform(xtrain)
    xtest = pca.transform(xtest)

    # `verbosity=0` already silences XGBoost; the obsolete `silent` flag is dropped.
    xgbc = XGBClassifier(verbosity=0)

    xgbc.fit(xtrain, ytrain)
    # Scale the predictions back by 100 to compare with `ytest`.
    ypred = xgbc.predict(xtest) * 100
    y_pred_rounded = np.round(ypred)

    folddf = valid.copy()
    # This cell stores the rounded predictions.
    folddf['Pawpularity_pred'] = y_pred_rounded
    fold_frames.append(folddf)

    # RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
    # scikit-learn releases, this form works on every version.
    rmse_fold = np.sqrt(mean_squared_error(ytest, ypred))
    rmse_fold_r = np.sqrt(mean_squared_error(ytest, y_pred_rounded))

    rmse += rmse_fold / 5
    rmse_r += rmse_fold_r / 5

# Concatenate once at the end instead of growing from an empty DataFrame.
final2 = pd.concat(fold_frames)

print('='*100)
print('XGBoost (PCA) RMSE trained on pseudo-lables = ', rmse)
print('='*100)
print('XGBoost (PCA) RMSE trained on pseudo-lables (rounded) = ', rmse_r)
print('='*100)

# With Leakage

In [None]:
# "Leaky" variant: an XGBoost regressor is fitted on all pseudo-labelled test
# rows and evaluated on exactly the same rows, so the RMSE is an in-sample
# error. `final3` keeps the rows in their original test order.
xtrain = test_df_folds.drop(['kfold', 'pseudo_labels'], axis=1).values
xtest = xtrain.copy()

# Targets on the 0-100 scale; train and evaluation targets are identical.
ytrain = test_df_folds['pseudo_labels'].values * 100
ytest = test_df_folds['pseudo_labels'].values * 100

xgbc = XGBRegressor()

xgbc.fit(xtrain, ytrain)
ypred = xgbc.predict(xtest)

final3 = test_df_folds.copy()
final3['Pawpularity_pred'] = ypred
# RMSE as sqrt(MSE): the `squared=False` shortcut is deprecated in recent
# scikit-learn releases, this form works on every version.
rmse = np.sqrt(mean_squared_error(ytest, ypred))

print('='*100)
print('XGBoost RMSE trained on pseudo-lables (with leakage) = ', rmse)
print('='*100)

In [None]:
# `final` and `final2` were assembled fold by fold, so their rows are in fold
# order, while `final3` is a copy of `test_df_folds` and keeps the original
# row order. Re-align the first two on the test index before taking raw
# arrays; otherwise the blend would combine predictions of different rows.
final_pred_1 = final['Pawpularity_pred'].reindex(test_df_folds.index).values
final_pred_2 = final2['Pawpularity_pred'].reindex(test_df_folds.index).values
final_pred_3 = final3['Pawpularity_pred'].values

In [None]:
# print(mean_squared_error(ytest, final_pred_1, squared=False))
# print(mean_squared_error(ytest, final_pred_2, squared=False))
# print(mean_squared_error(ytest, final_pred_3, squared=False))

In [None]:
# Equal-weight blend of the three level-2 prediction vectors.
super_final = sum([final_pred_1, final_pred_2, final_pred_3]) / 3
# Weighted blend: the PCA model counts twice.
super_final_weighted = (final_pred_1 + final_pred_2 * 2 + final_pred_3) / 4

In [None]:
# Store the equal-weight blend as the final prediction column.
test_df['Pawpularity'] = super_final

In [None]:
# Remove the helper columns that were added while building the level-2 model.
test_df = test_df.drop(columns=['kfold', 'pseudo_labels', 'bins'])

In [None]:
# `test_df` had its Id column dropped right after loading, so writing it
# directly produced a file without identifiers and with every feature column.
# Re-read the ids (same file, same row order) and write only Id + prediction.
# NOTE(review): assumes the competition expects exactly the columns
# Id and Pawpularity - confirm against the sample submission.
submission = pd.read_csv('../input/petfinder-pawpularity-score/test.csv', usecols=['Id'])
submission['Pawpularity'] = test_df['Pawpularity'].values
submission.to_csv('submission.csv', index=False)

In [None]:
# final_rmse = mean_squared_error(ytest, super_final, squared=False)
# final_rmse_w = mean_squared_error(ytest, super_final_weighted, squared=False)

# print('='*100)
# print('Final RMSE = ', final_rmse)
# print('='*100)
# print('Final RMSE (weighted) = ', final_rmse_w)
# print('='*100)

In [None]:
# print('='*100)
# print(f'The raw ensemble for level 1 models scored an RMSE of {rmse_i}')
# print(f'The final ensemble after level 2 models scored an RMSE of {final_rmse_w}')

# print('='*100)
# if rmse_i<final_rmse_w:
#     print(f'The RMSE increased by {final_rmse_w-rmse_i}')
# else:
#     print(f'Level 2 ensemble: The RMSE decreased by {np.round(rmse_i-final_rmse_w, 2)}')
#     print('='*100)
#     print(f'Level 2 PCA XGBoost: The RMSE decreased by {np.round(rmse_i-mean_squared_error(ytest, final_pred_2, squared=False), 2)}')
# print('='*100)

### Conclusions

I ran these experiments a few times and the results are pretty consistent. I can see that there is a certain decrease in the RMSE by using the following methods:

* Using K-folds
* Using PCA over simple regressor/classifier
* Using Pseudo Labels for level 2 model
* Using PCA and pseudo labelling simultaneously
* Meta features alone aren't enough

### Please do <span style="color:red">upvote</span>. if you like. 💫
### That gives a hell lot of motivation to create kernels and do experiments. 💫
### Happy kaggling!! 💫