In [None]:
import pandas as pd

# Load the raw data from the provided comma-separated CSV file
data = pd.read_csv("SQP_database_for_meta_analysis_20240111_comma.csv")

# Define numerical features as provided
numerical_features = [
    'Number of categories', 'Maximum possible value_75', 'Maximum possible value_76',
    'Number of fixed reference points', 'Number of sentences in introduction',
    'Number of words in introduction', 'Number of subordinated clauses in introduction',
    'Number of sentences in the request', 'Number of words in request',
    'Total number of nouns in request for an answer',
    'Total number of abstract nouns in request for an answer',
    'Total number of syllables in request', 'Number of subordinate clauses in request',
    'Number of syllables in answer scale', 'Total number of nouns in answer scale',
    'Total number of abstract nouns in answer scale', 'Position'
]

# Remove rows where 'quality(q^2)' has missing values.
# .copy() makes data_cleaned an independent frame, so the column assignment below
# does not write into a slice of `data` (avoids pandas' SettingWithCopyWarning).
data_cleaned = data.dropna(subset=['quality(q^2)']).copy()

# For numerical features, fill missing values with 0
data_cleaned[numerical_features] = data_cleaned[numerical_features].fillna(0)

# Identify categorical features (all columns from 'Domain' to 'Position' not listed as numerical)
feature_columns = data_cleaned.loc[:, 'Domain':'Position'].columns.tolist()
categorical_features = [col for col in feature_columns if col not in numerical_features]

# Convert categorical features to dummy variables
# Note: Missing values in these features will naturally result in zeros for all dummy variables
df_dummies = pd.get_dummies(data_cleaned, columns=categorical_features, dummy_na=False,dtype=float)

# Keep the pre-dummy value of every feature column next to the dummies as '<name>_ori'
# (numerical columns are copied after the fillna(0) above)
for column in feature_columns:
    original_column = data_cleaned[column].copy()
    df_dummies[column+'_ori'] = original_column

# Save the processed data to a new CSV file
processed_file_path = 'SQP_dummyvars_data.csv'
df_dummies.to_csv(processed_file_path, index=False)


In [None]:
# Truncate the target to an integer, then re-read its digits as the fractional part
# (e.g. 756 -> 0.756); entries equal to 1 are kept as 1.
# NOTE: not idempotent - re-running this cell on already rescaled values truncates them to 0 first.
col = df_dummies['quality(q^2)'].astype(float).astype(int)
digits_as_fraction = ('0.' + col.astype(str)).astype(float)
df_dummies['quality(q^2)'] = col.where(col == 1, digits_as_fraction)

# Persist the rescaled table as an Excel workbook (row index not written)
processed_file_path = 'SQP_dummyvars_data.xlsx'
df_dummies.to_excel(processed_file_path, index=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Read back the workbook written by the preprocessing cells
data = pd.read_excel('SQP_dummyvars_data.xlsx')

# Numeric predictors used as-is ('Position' is not part of this list)
numerical_features = [
    'Number of categories',
    'Maximum possible value_75',
    'Maximum possible value_76',
    'Number of fixed reference points',
    'Number of sentences in introduction',
    'Number of words in introduction',
    'Number of subordinated clauses in introduction',
    'Number of sentences in the request',
    'Number of words in request',
    'Total number of nouns in request for an answer',
    'Total number of abstract nouns in request for an answer',
    'Total number of syllables in request',
    'Number of subordinate clauses in request',
    'Number of syllables in answer scale',
    'Total number of nouns in answer scale',
    'Total number of abstract nouns in answer scale',
]

# Dummy-encoded predictors: the contiguous run of columns from 'Domain_1'
# through 'Visual or oral presentation_1' (the +1 makes the end inclusive)
categorical_feature_start = data.columns.get_loc('Domain_1')
categorical_feature_end = data.columns.get_loc('Visual or oral presentation_1') + 1
# categorical_feature_end = data.columns.get_loc('Request present in the introduction_1.0') + 1
categorical_features = data.columns[categorical_feature_start:categorical_feature_end]

# Full predictor list: numeric columns first, then the dummy columns
all_features = [*numerical_features, *categorical_features]

# Predictors and regression target
X = data[all_features]
y = data['quality(q^2)']

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prefer the first CUDA device, otherwise stay on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _as_float_tensor(values):
    """Copy an array into a float32 tensor placed on `device`."""
    return torch.tensor(values, dtype=torch.float).to(device)


# Features stay 2-D; targets are reshaped to a single column
X_train_tensor = _as_float_tensor(X_train.values)
y_train_tensor = _as_float_tensor(y_train.values).view(-1, 1)
X_test_tensor = _as_float_tensor(X_test.values)
y_test_tensor = _as_float_tensor(y_test.values).view(-1, 1)

# Define the neural network architecture
class Net(nn.Module):
    """Fully connected regressor: four hidden ReLU layers with dropout and a linear output.

    Args:
        input_dim: Number of input features. When omitted, ``len(all_features)`` is
            read from the notebook namespace at construction time.
        hidden_dim: Width of every hidden layer (default 256).
        dropout: Dropout probability applied after each hidden activation (default 0.15).
    """

    def __init__(self, input_dim=None, hidden_dim=256, dropout=0.15):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the input layer from the global feature list
            input_dim = len(all_features)
        self.fc0 = nn.Linear(input_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one prediction per input row (output has a trailing dimension of 1)."""
        # Every hidden layer is followed by ReLU and then dropout, in that order
        for layer in (self.fc0, self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(layer(x)))
        return self.fc4(x)

# Model (moved to the selected device), MSE objective and Adam optimiser
model = Net().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Cosine learning-rate schedule: T_max=500 scheduler steps, minimum rate 0
num_epochs = 10000
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500, eta_min=0)

# Training tensors wrapped for shuffled mini-batches of 64 rows
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)




In [None]:
# Display the held-out feature table for a quick visual check
X_test

In [None]:
import copy
# model_best = copy.deepcopy(model)

## without external features

In [None]:
# Start from +inf so the first evaluated epoch always produces a checkpoint; with a
# finite starting value, model_best could stay undefined (or stale from an earlier
# cell) whenever the test MSE never dropped below it.
best_test_loss = float("inf")
# Train the model and evaluate on the test set after each epoch
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
    # The learning-rate schedule advances once per epoch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
        test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
        # NOTE: the checkpoint is chosen on the test split, so best_test_loss is an
        # optimistic estimate of generalisation error.
        if test_loss < best_test_loss:
            model_best = copy.deepcopy(model)
            best_test_loss = test_loss
        # `loss` is the loss of the last mini-batch of the epoch, not an epoch average
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

## without position

In [None]:
# Start from +inf so the first evaluated epoch always produces a checkpoint; with a
# finite starting value, model_best could stay undefined (or stale from an earlier
# cell) whenever the test MSE never dropped below it.
best_test_loss = float("inf")
# Train the model and evaluate on the test set after each epoch
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
    # The learning-rate schedule advances once per epoch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
        test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
        # NOTE: the checkpoint is chosen on the test split, so best_test_loss is an
        # optimistic estimate of generalisation error.
        if test_loss < best_test_loss:
            model_best = copy.deepcopy(model)
            best_test_loss = test_loss
        # `loss` is the loss of the last mini-batch of the epoch, not an epoch average
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

In [None]:
# Show the most recent test-set predictions as a NumPy array
test_outputs.detach().numpy()

In [None]:
# Score the best checkpoint on the test split. eval() guarantees dropout is off and
# no_grad() avoids building an autograd graph for what is pure inference.
model_best.eval()
with torch.no_grad():
    test_outputs = model_best(X_test_tensor).cpu()
test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
test_loss

In [None]:
# Display the training targets
y_train

In [None]:
import numpy as np
# Residual = observed target minus prediction of the best checkpoint, per split
train_predictions = model_best(X_train_tensor).cpu().detach().numpy()
residuals_train = y_train_tensor.cpu().detach().numpy() - train_predictions
residuals_test = y_test_tensor.cpu().detach().numpy() - test_outputs.detach().numpy()

# Each split is thresholded at three standard deviations of its own residuals
std_dev_train, std_dev_test = np.std(residuals_train), np.std(residuals_test)
threshold_train, threshold_test = 3 * std_dev_train, 3 * std_dev_test

# First element of np.nonzero -> positional row numbers of the flagged samples
outliers_train = np.nonzero(np.abs(residuals_train) > threshold_train)[0]
outliers_test = np.nonzero(np.abs(residuals_test) > threshold_test)[0]

print("Training set outliers:", outliers_train)
print("Test set outliers:", outliers_test)

In [None]:
import os

# Row labels of each split in the original table
train_indices = X_train.index
test_indices = X_test.index

# Translate positional outlier numbers back into row labels of `data`
outlier_indices_train = train_indices[outliers_train]
outlier_indices_test = test_indices[outliers_test]

outlier_details_train = data.loc[outlier_indices_train, :]
outlier_details_test = data.loc[outlier_indices_test, :]

# print("Training set outlier details:")
# print(outlier_details_train)
# print("\nTest set outlier details:")
# print(outlier_details_test)

# to_csv does not create missing folders, so make sure the target directory exists
os.makedirs("outliers", exist_ok=True)
outlier_details_train.to_csv("outliers/nn_quality_train.csv")
outlier_details_test.to_csv("outliers/nn_quality_test.csv")

In [None]:
# Count training-set outliers per value of the 'Language' column
outlier_details_train['Language'].value_counts()

In [None]:
# Count test-set outliers per value of the 'Language' column
outlier_details_test['Language'].value_counts()

In [None]:
# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

In [None]:
# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

In [None]:
# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

In [None]:
# Score the current model on the TRAINING split. Results go into train_* names so the
# test-set predictions and loss held in test_outputs / test_loss are not overwritten,
# and the printed metric is labelled as what it actually is.
model.eval()
with torch.no_grad():
    train_outputs = model(X_train_tensor).cpu()
    train_loss = mean_squared_error(y_train_tensor.cpu(), train_outputs)
    # `epoch` and `loss` are left over from the last training loop that was run
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Train MSE: {train_loss}')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Read back the workbook written by the preprocessing cells
data = pd.read_excel('SQP_dummyvars_data.xlsx')

# Numeric predictors used as-is (this variant includes 'Position')
numerical_features = [
    'Number of categories',
    'Maximum possible value_75',
    'Maximum possible value_76',
    'Number of fixed reference points',
    'Number of sentences in introduction',
    'Number of words in introduction',
    'Number of subordinated clauses in introduction',
    'Number of sentences in the request',
    'Number of words in request',
    'Total number of nouns in request for an answer',
    'Total number of abstract nouns in request for an answer',
    'Total number of syllables in request',
    'Number of subordinate clauses in request',
    'Number of syllables in answer scale',
    'Total number of nouns in answer scale',
    'Total number of abstract nouns in answer scale',
    'Position',
]

# Dummy-encoded predictors: the contiguous run of columns from 'Domain_1'
# through 'Visual or oral presentation_1' (the +1 makes the end inclusive)
categorical_feature_start = data.columns.get_loc('Domain_1')
categorical_feature_end = data.columns.get_loc('Visual or oral presentation_1') + 1
categorical_features = data.columns[categorical_feature_start:categorical_feature_end]

# Full predictor list: numeric columns first, then the dummy columns
all_features = [*numerical_features, *categorical_features]

# Predictors and target as plain NumPy arrays (row labels are dropped here)
X = data[all_features].values
y = data['quality(q^2)'].values

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prefer the first CUDA device, otherwise stay on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _as_float_tensor(values):
    """Copy an array into a float32 tensor placed on `device`."""
    return torch.tensor(values, dtype=torch.float).to(device)


# Features stay 2-D; targets are reshaped to a single column
X_train_tensor = _as_float_tensor(X_train)
y_train_tensor = _as_float_tensor(y_train).view(-1, 1)
X_test_tensor = _as_float_tensor(X_test)
y_test_tensor = _as_float_tensor(y_test).view(-1, 1)

# Define the neural network architecture
class Net(nn.Module):
    """Fully connected regressor: four hidden ReLU layers with dropout and a linear output.

    Args:
        input_dim: Number of input features. When omitted, ``len(all_features)`` is
            read from the notebook namespace at construction time.
        hidden_dim: Width of every hidden layer (default 256).
        dropout: Dropout probability applied after each hidden activation (default 0.5).
    """

    def __init__(self, input_dim=None, hidden_dim=256, dropout=0.5):
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the input layer from the global feature list
            input_dim = len(all_features)
        self.fc0 = nn.Linear(input_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one prediction per input row (output has a trailing dimension of 1)."""
        # Every hidden layer is followed by ReLU and then dropout, in that order
        for layer in (self.fc0, self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(layer(x)))
        return self.fc4(x)

# Initialize the network, transfer it to GPU, define loss function and optimizer
model = Net().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 1000
# T_max is documented as an integer number of scheduler steps; floor division keeps
# it an int (num_epochs / 10 would pass the float 100.0).
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs // 10, eta_min=0)


# Create a DataLoader instance for batch processing
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')


In [None]:
# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

Epoch 996/1000, Loss: 0.009784822352230549, Test MSE: 0.014666990377008915
Epoch 997/1000, Loss: 0.019383227452635765, Test MSE: 0.01455196738243103
Epoch 998/1000, Loss: 0.007361909840255976, Test MSE: 0.014125244691967964
Epoch 999/1000, Loss: 0.005996024236083031, Test MSE: 0.015218758955597878
Epoch 1000/1000, Loss: 0.012332906015217304, Test MSE: 0.014809948392212391


In [None]:
# Number of model input features (numerical plus dummy columns)
len(all_features)

In [None]:
# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

In [None]:
# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

In [None]:
# Per epoch: fit on mini-batches, advance the LR schedule, then score the test split
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_inputs), batch_targets)
        loss.backward()
        optimizer.step()

    # The scheduler is stepped once per epoch, not once per batch
    scheduler.step()

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor).cpu()
    test_loss = mean_squared_error(y_test_tensor.cpu(), test_outputs)
    # `loss` is the loss of the final mini-batch of this epoch, not an epoch average
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Test MSE: {test_loss}')

In [None]:
import pandas as pd

# Load the three per-model training-outlier tables, using each file's first column as index
df1, df2, df3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'outliers/mbert_dummy_quality_train.csv',
        'outliers/nn_quality_train.csv',
        'outliers/rf_quality_train.csv',
    )
)

# Two index-on-index merges (default inner join): only index values present in all
# three tables survive
merged_df = df1.merge(df2, left_index=True, right_index=True)
final_merged_df = merged_df.merge(df3, left_index=True, right_index=True)

print(final_merged_df)

final_merged_df.to_csv('outliers/common_train_outlier.csv')


In [None]:
# Reload the processed table and count rows per value of the 'Language' column
data = pd.read_excel('SQP_dummyvars_data.xlsx')
data['Language'].value_counts()

# Random Forest + BERT top 50

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Read the processed table and replace every missing value with -1
data = pd.read_excel("SQP_dummyvars_data.xlsx")
data.fillna(-1, inplace=True)

# Predictors are the '*_ori' columns starting at 'Domain_ori'. The iloc end bound is
# exclusive, so 'Position_ori' itself is NOT part of `features`.
first_ori_col = data.columns.get_loc("Domain_ori")
last_ori_col = data.columns.get_loc("Position_ori")
features = data.iloc[:, first_ori_col:last_ori_col]
labels = data["quality(q^2)"]

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Define dataset for handling text data
class QualityDataset(Dataset):
    """Tokenised text pairs with optional regression labels.

    Args:
        tokenizer: Callable invoked as ``tokenizer(first_texts, second_texts,
            truncation=True, padding=True, max_length=128)``; its result must
            expose ``.items()`` / ``.values()`` with one sequence per example.
        texts: Pair ``(first_texts, second_texts)`` of equally long lists.
        labels: Optional sequence of float targets, one per example.
    """

    def __init__(self, tokenizer, texts, labels=None):
        self.encodings = tokenizer(texts[0], texts[1], truncation=True, padding=True, max_length=128)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields of example ``idx`` as tensors (plus 'labels' if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of examples; also works for unlabeled datasets."""
        if self.labels is not None:
            return len(self.labels)
        # labels is optional, so fall back to the number of encoded examples
        # instead of calling len(None)
        return len(next(iter(self.encodings.values())))

# Multilingual BERT tokenizer shared by both splits
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


def _text_pair(row_labels):
    """Return (request texts, answer-option texts) for the given rows of `data`."""
    requests = data.loc[row_labels, "Request for answer text"].tolist()
    answers = data.loc[row_labels, "Answer options text"].tolist()
    return (requests, answers)


# Texts are looked up by the row labels of each split so they stay aligned with X/y
train_texts = _text_pair(X_train.index)
test_texts = _text_pair(X_test.index)
train_dataset = QualityDataset(tokenizer, train_texts, y_train.tolist())
test_dataset = QualityDataset(tokenizer, test_texts, y_test.tolist())

# Pretrained encoder used to produce the embeddings, on GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased').to(device)

def generate_bert_embeddings(dataset):
    """Encode every example of `dataset` with `bert_model`, one example at a time.

    Each item is a dict of tensors; a 'labels' entry, if present, is ignored. For
    every example, the vector at sequence position 0 of the model's first output
    is collected.

    Returns:
        np.ndarray with one row per example.
    """
    bert_model.eval()
    cls_vectors = []
    with torch.no_grad():
        for sample in tqdm(dataset, desc="Generating BERT embeddings"):
            # Move to the model's device and add a batch dimension of size 1
            batch = {
                name: tensor.to(device).unsqueeze(0)
                for name, tensor in sample.items()
                if name != 'labels'
            }
            first_output = bert_model(**batch)[0]
            cls_vectors.append(first_output[:, 0, :].cpu().numpy()[0])
    return np.array(cls_vectors)

# Embed both splits with the BERT encoder (one forward pass per example)
train_embeddings = generate_bert_embeddings(train_dataset)
test_embeddings = generate_bert_embeddings(test_dataset)




In [None]:
# Fit a random forest on the BERT embeddings alone; its feature importances are what
# the subsequent feature-selection step ranks the embedding dimensions by
rf_model_bert_only = RandomForestRegressor(
    n_estimators=520,
    max_depth=50,
    max_features=50,
    n_jobs=-1,
    random_state=42,
)
rf_model_bert_only.fit(train_embeddings, y_train)

In [None]:
# threshold=-np.inf switches off the importance cutoff (the default would also drop
# features below the mean importance), so selection is purely "the max_features
# dimensions with the highest importance".
selector = SelectFromModel(rf_model_bert_only, threshold=-np.inf, max_features=50, prefit=True)

In [None]:
# Keep only the embedding columns chosen by `selector`, identically for train and test
train_embeddings_reduced = selector.transform(train_embeddings)
test_embeddings_reduced = selector.transform(test_embeddings)



In [None]:
# Place the selected BERT dimensions to the right of the original feature columns
X_train_combined = np.hstack([X_train.values, train_embeddings_reduced])
X_test_combined = np.hstack([X_test.values, test_embeddings_reduced])

# Fit a forest on the widened feature matrix and score it on the test split
rf_model_combined = RandomForestRegressor(
    n_estimators=520,
    max_depth=50,
    max_features=16,
    n_jobs=-1,
    random_state=42,
)
rf_model_combined.fit(X_train_combined, y_train)
predictions_combined = rf_model_combined.predict(X_test_combined)
mse_combined = mean_squared_error(y_test, predictions_combined)

print("MSE with combined features:", mse_combined)

In [None]:
# Baseline: the same forest configuration trained on the original features only
rf_model = RandomForestRegressor(
    n_estimators=520,
    max_depth=50,
    max_features=16,
    n_jobs=-1,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Test-set mean squared error of the baseline
predictions = rf_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)

mse

## Iteratively select top features

In [None]:
def reduce_features_by_half(model, train_embeddings, test_embeddings):
    """Drop the embedding columns that an already fitted model ranks below the median.

    Args:
        model: Fitted estimator exposing feature importances; it is used as-is (prefit).
        train_embeddings: Training matrix with the columns the model was fitted on.
        test_embeddings: Test matrix with the same columns.

    Returns:
        Tuple (reduced train matrix, reduced test matrix) with identical column selection.
    """
    # threshold="median" keeps features whose importance is at or above the median
    median_selector = SelectFromModel(model, threshold="median", prefit=True)
    return (
        median_selector.transform(train_embeddings),
        median_selector.transform(test_embeddings),
    )

# Start the reduction from the full embedding matrices
train_embeddings_iter, test_embeddings_iter = train_embeddings, test_embeddings

# Four rounds of: fit a forest on the current columns, then keep the median-and-above ones
for i in range(4):
    print(f"Iteration {i+1}")

    rf_model_iter = RandomForestRegressor(
        n_estimators=520,
        max_depth=50,
        n_jobs=-1,
        max_features=50,
        random_state=42,
    )
    rf_model_iter.fit(train_embeddings_iter, y_train)

    train_embeddings_iter, test_embeddings_iter = reduce_features_by_half(
        rf_model_iter,
        train_embeddings_iter,
        test_embeddings_iter,
    )

    print(f"Number of features after iteration {i+1}: {train_embeddings_iter.shape[1]}")

# Append the surviving embedding columns to the right of the original features
X_train_combined = np.hstack([X_train.values, train_embeddings_iter])
X_test_combined = np.hstack([X_test.values, test_embeddings_iter])




In [None]:
# Fit a forest on original features plus the iteratively reduced embeddings, then score it
rf_model_combined = RandomForestRegressor(
    n_estimators=520,
    max_depth=50,
    max_features=16,
    n_jobs=-1,
    random_state=42,
)
rf_model_combined.fit(X_train_combined, y_train)

predictions_combined = rf_model_combined.predict(X_test_combined)
mse_combined = mean_squared_error(y_test, predictions_combined)

print("Final MSE with iteratively reduced and combined features:", mse_combined)