##s-CO2 Corrosion Severity Prediction##

Tong Liu

**Data Set Characteristics**

In [1]:
import pandas as pd
Data = pd.read_csv('data.csv')

print(Data.describe())

           chrome        temp       press    H2O added         H2O  \
count  278.000000  280.000000  280.000000   280.000000  280.000000   
mean     0.089953   45.571429    9.086500    55.871294    2.199258   
std      0.121527    7.876414    1.007919   328.753192    1.307628   
min      0.000000   25.000000    8.000000     0.020000    0.020000   
25%      0.013000   43.750000    8.000000     0.963600    0.963600   
50%      0.048000   50.000000    9.500000     2.600000    2.600000   
75%      0.110000   50.000000   10.000000    33.497500    3.330000   
max      0.540000   60.000000   12.000000  3850.000000    5.580000   

                 O2           SO2          NO2          H2S         acids  \
count    280.000000    280.000000   280.000000   280.000000    280.000000   
mean    5279.928578   1820.589286    39.096429   118.392857   1978.078571   
std    12146.646989   5544.390425   136.641152   322.194407   5553.768293   
min        0.000000      0.000000     0.000000     0.000000  

**Upload Normalized Data Set**

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Download Data
file_path = './data.csv'
data = pd.read_csv(file_path)

# Define categorical and numerical feature columns
categorical_cols = ['material']
numerical_cols = ['chrome', 'temp', 'press', 'H2O added', 'H2O', 'O2', 'SO2', 'NO2', 'H2S', 'acids', 'time']

# Preprocessor Configuration
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Defining Random Forest Model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Create a complete data processing and training pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Prepare target variables and feature variables. Use corrosion rate and exclude severity
X = data.drop(['CR', 'severity','Refs.'], axis=1)
y = data['CR']

# Divide the training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Training the model
pipeline.fit(X_train, y_train)

# Extracting feature importance
feature_importances = model.feature_importances_
encoded_features = (pipeline.named_steps['preprocessor']
                    .transformers_[1][1]
                    .named_steps['onehot']
                    .get_feature_names_out(input_features=categorical_cols))
all_features = numerical_cols + list(encoded_features)

# DataFrame of feature importance
feature_importance_df = pd.DataFrame({
    'Feature': all_features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Prediction test set
y_pred = pipeline.predict(X_test)

# Calculate the mean squared error (MSE) for the test set
mse = mean_squared_error(y_test, y_pred)

print("Feature Importances:\n", feature_importance_df)
print("\nMSE on Test Data: ", mse)
# Utilizing the previously transformed and corrected dataframes (X_train_final and X_test_final) without 'severity'
# Apply the preprocessing pipeline to the training and test data
X_train_transformed = pipeline.named_steps['preprocessor'].transform(X_train)
X_test_transformed = pipeline.named_steps['preprocessor'].transform(X_test)

# Convert the transformed data back to a DataFrame so that we can access the column names
X_train_final = pd.DataFrame(X_train_transformed, columns=all_features, index=X_train.index)
X_test_final = pd.DataFrame(X_test_transformed, columns=all_features, index=X_test.index)

# Defining the most important features
most_important_feature = feature_importance_df.iloc[0]['Feature']
feature_importance_df['Cumulative Importance'] = feature_importance_df['Importance'].cumsum()
# Define the feature set with cumulative importance reaching 80%
cumulative_threshold = 0.80
features_80 = feature_importance_df[feature_importance_df['Cumulative Importance'] <= cumulative_threshold]['Feature'].tolist()

# These features can then be used to set up the dataset and train the model
X_train_1_corrected = X_train_final[[most_important_feature]]
X_test_1_corrected = X_test_final[[most_important_feature]]

features_80_corrected = [f for f in features_80 if f in X_train_final.columns]  # Make sure the features are in the DataFrame
X_train_80_corrected = X_train_final[features_80_corrected]
X_test_80_corrected = X_test_final[features_80_corrected]

X_train_all_corrected = X_train_final  # Use all features
X_test_all_corrected = X_test_final

# Defining and training the model
model_1_corrected = RandomForestRegressor(n_estimators=100, random_state=0)
model_80_corrected = RandomForestRegressor(n_estimators=100, random_state=0)
model_all_corrected = RandomForestRegressor(n_estimators=100, random_state=0)

model_1_corrected.fit(X_train_1_corrected, y_train)
model_80_corrected.fit(X_train_80_corrected, y_train)
model_all_corrected.fit(X_train_all_corrected, y_train)

# Predict and calculate MSE
y_pred_1_corrected = model_1_corrected.predict(X_test_1_corrected)
mse_1_corrected = mean_squared_error(y_test, y_pred_1_corrected)

y_pred_80_corrected = model_80_corrected.predict(X_test_80_corrected)
mse_80_corrected = mean_squared_error(y_test, y_pred_80_corrected)

y_pred_all_corrected = model_all_corrected.predict(X_test_all_corrected)
mse_all_corrected = mean_squared_error(y_test, y_pred_all_corrected)

# Output
print("MSE using most important feature:", mse_1_corrected)
print("MSE using top 80% important features:", mse_80_corrected)
print("MSE using all features:", mse_all_corrected)


Feature Importances:
              Feature  Importance
10              time    0.580318
6                SO2    0.128139
3          H2O added    0.065574
5                 O2    0.059749
9              acids    0.058371
7                NO2    0.027485
4                H2O    0.018962
0             chrome    0.017187
2              press    0.010612
17      material_X70    0.008018
18      material_X80    0.007598
8                H2S    0.006801
16      material_X65    0.006023
1               temp    0.004656
15      material_X60    0.000405
14      material_X52    0.000052
11  material_Unknown    0.000048
12     material_X100    0.000002
13      material_X42    0.000001

MSE on Test Data:  0.7141200611068126
MSE using most important feature: 1.1665684152104971
MSE using top 80% important features: 0.8846009675702445
MSE using all features: 0.7141200611068126


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import xgboost as xgb
import lightgbm as lgb
# Load data
file_path = './data.csv'
data = pd.read_csv(file_path)

# Define categorical and numerical feature columns
categorical_cols = ['material']
numerical_cols = ['chrome', 'temp', 'press', 'H2O added', 'H2O', 'O2', 'SO2', 'NO2', 'H2S', 'acids', 'time']

# Preprocessor Configuration
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Prepare target variables and feature variables, excluding severity and references.
X = data.drop(['CR', 'severity', 'Refs.'], axis=1)
y = data['CR']

# Split into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Setting up the model and parameters
model_params = {
    'RandomForestRegressor': {
        'model': Pipeline([('preprocessor', preprocessor), ('regressor', RandomForestRegressor(random_state=0))]),
        'params': {
            'regressor__n_estimators': [100, 200],
            'regressor__max_depth': [None, 10, 20],
            'regressor__min_samples_split': [2, 5]
        }
    },
    'GradientBoostingRegressor': {
        'model': Pipeline([('preprocessor', preprocessor), ('regressor', GradientBoostingRegressor(random_state=0))]),
        'params': {
            'regressor__n_estimators': [100, 200],
            'regressor__learning_rate': [0.05, 0.1],
            'regressor__max_depth': [3, 5]
        }
    },
        'LinearRegression': {
        'model': Pipeline([('preprocessor', preprocessor), ('regressor', LinearRegression())]),
        'params': {
            'regressor__fit_intercept': [True, False]
        }
    },
    'RidgeRegression': {
        'model': Pipeline([('preprocessor', preprocessor), ('regressor', Ridge(random_state=0))]),
        'params': {
            'regressor__alpha': [0.1, 1.0, 10.0]  # Regularization Strength
        }
    },
    'LassoRegression': {
        'model': Pipeline([('preprocessor', preprocessor), ('regressor', Lasso(random_state=0))]),
        'params': {
            'regressor__alpha': [0.01, 0.1, 1.0]  # Regularization Strength
        }
    },
    'XGBRegressor': {
        'model': Pipeline([('preprocessor', preprocessor), ('regressor', xgb.XGBRegressor(objective='reg:squarederror', random_state=0))]),
        'params': {
            'regressor__n_estimators': [100, 200],
            'regressor__max_depth': [3, 5],
            'regressor__learning_rate': [0.01, 0.1]
        }
    },
    'LGBMRegressor': {
        'model': Pipeline([('preprocessor', preprocessor), ('regressor', lgb.LGBMRegressor(random_state=0))]),
        'params': {
            'regressor__n_estimators': [100, 200],
            'regressor__num_leaves': [31, 50],  # Larger num_leaves can improve accuracy but may lead to overfitting.
            'regressor__learning_rate': [0.01, 0.1]
        }
    }
}

# Perform parameter searches and model evaluation
for model_name, mp in model_params.items():
    grid_search = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_score = -grid_search.best_score_  # Convert to positive MSE
    print(f"{model_name} best MSE: {best_score}")
    print(f"Best parameters: {best_params}")

    # Use the best model to evaluate on the test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} Test MSE: {mse}")


RandomForestRegressor best MSE: 2.232020342323894
Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
RandomForestRegressor Test MSE: 0.7171749767850931
GradientBoostingRegressor best MSE: 1.9662388196781897
Best parameters: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 5, 'regressor__n_estimators': 200}
GradientBoostingRegressor Test MSE: 0.8452653811393512
LinearRegression best MSE: 3.4695113373952635
Best parameters: {'regressor__fit_intercept': False}
LinearRegression Test MSE: 1.4752599587723336
RidgeRegression best MSE: 3.4467628340263183
Best parameters: {'regressor__alpha': 10.0}
RidgeRegression Test MSE: 1.4092740678841553
LassoRegression best MSE: 3.445851844497697
Best parameters: {'regressor__alpha': 0.01}
LassoRegression Test MSE: 1.4134623544859675
XGBRegressor best MSE: 1.98533233925866
Best parameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 200}
XGB

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Load data
data_path = './data.csv'
data = pd.read_csv(data_path)

# Defining columns
categorical_cols = ['material']
numerical_cols = ['chrome', 'temp', 'press', 'H2O added', 'H2O', 'O2', 'SO2', 'NO2', 'H2S', 'acids', 'time']

# Preprocessor Configuration
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  # Normalizing data to better fit neural networks
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocessing Data
X = data.drop('CR', axis=1)
y = data['CR']
X_processed = preprocessor.fit_transform(X)
y = y.values.reshape(-1, 1)
y = StandardScaler().fit_transform(y)  # Standardize the target variable

# Transfer PyTorch Tensors
X_tensor = torch.tensor(X_processed, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Creat DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Defining Neural Networks
class RegressionNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RegressionNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

# Instantiate Model
model = RegressionNN(X_tensor.shape[1], 128)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training the model
epochs = 50
for epoch in range(epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')
model.eval()
with torch.no_grad():
    predictions = model(X_tensor)
    test_mse = mean_squared_error(y_tensor, predictions.numpy())
    print(f"Test MSE: {test_mse}")

Epoch 1/50, Loss: 0.3182912766933441
Epoch 2/50, Loss: 0.3273223340511322
Epoch 3/50, Loss: 0.30778348445892334
Epoch 4/50, Loss: 0.06211249530315399
Epoch 5/50, Loss: 0.3899412453174591
Epoch 6/50, Loss: 0.18540729582309723
Epoch 7/50, Loss: 0.09553707391023636
Epoch 8/50, Loss: 0.04527485743165016
Epoch 9/50, Loss: 0.027828631922602654
Epoch 10/50, Loss: 6.810607433319092
Epoch 11/50, Loss: 0.16863785684108734
Epoch 12/50, Loss: 0.17193327844142914
Epoch 13/50, Loss: 0.06148768588900566
Epoch 14/50, Loss: 0.17174164950847626
Epoch 15/50, Loss: 0.057153552770614624
Epoch 16/50, Loss: 0.026752205565571785
Epoch 17/50, Loss: 0.034255094826221466
Epoch 18/50, Loss: 0.016057105734944344
Epoch 19/50, Loss: 0.012616381049156189
Epoch 20/50, Loss: 0.04864725098013878
Epoch 21/50, Loss: 4.541852951049805
Epoch 22/50, Loss: 0.3232661485671997
Epoch 23/50, Loss: 0.02127322368323803
Epoch 24/50, Loss: 0.2953713834285736
Epoch 25/50, Loss: 0.0896243155002594
Epoch 26/50, Loss: 0.00878257676959037

In [28]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np
import math
# Load data
data = pd.read_csv('./data.csv')

# Defining categorical and numerical features
categorical_cols = ['material']
numerical_cols = ['chrome', 'temp', 'press', 'H2O added', 'H2O', 'O2', 'SO2', 'NO2', 'H2S', 'acids', 'time']

# Preprocessing Configuration
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Data processing
X = data.drop(['CR'], axis=1)
y = data['CR'].values.reshape(-1, 1)
X_processed = preprocessor.fit_transform(X)
y = StandardScaler().fit_transform(y)  # Standardize the target variable

# Divide the dataset
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=0)

# Convert to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Creating a DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
class LSTMRegressor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMRegressor, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(50, output_size)  # Map the hidden state at the last time step to the output

    def forward(self, x):
        
        output, (hn, cn) = self.lstm(x)

        # The final output is obtained through the fully connected layer
        out = self.fc(output)
        return out

# Setting LSTM parameters
input_size = X_train.shape[1]  # Number of features
hidden_size = 50  # Hidden layer size
num_layers = 1  # Number of LSTM layers
output_size = 1  # Output size

# Instantiate the LSTM model
model = LSTMRegressor(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training
epochs = 100
for epoch in range(epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')
# Evaluate on the test set
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    test_mse = mean_squared_error(y_test_tensor.numpy(), predictions.numpy())
    print(f"Test MSE: {test_mse}")

Epoch 1/100, Loss: 0.48673000931739807
Epoch 2/100, Loss: 6.407315731048584
Epoch 3/100, Loss: 0.5053640604019165
Epoch 4/100, Loss: 6.434662818908691
Epoch 5/100, Loss: 0.492488831281662
Epoch 6/100, Loss: 0.15949860215187073
Epoch 7/100, Loss: 0.40599316358566284
Epoch 8/100, Loss: 0.12337502092123032
Epoch 9/100, Loss: 0.06855364888906479
Epoch 10/100, Loss: 0.06211888790130615
Epoch 11/100, Loss: 0.20215964317321777
Epoch 12/100, Loss: 0.10647951066493988
Epoch 13/100, Loss: 0.13553553819656372
Epoch 14/100, Loss: 0.5477786660194397
Epoch 15/100, Loss: 0.2989196479320526
Epoch 16/100, Loss: 0.2927747368812561
Epoch 17/100, Loss: 0.33737170696258545
Epoch 18/100, Loss: 0.1285097599029541
Epoch 19/100, Loss: 5.650084018707275
Epoch 20/100, Loss: 0.15467923879623413
Epoch 21/100, Loss: 0.12553785741329193
Epoch 22/100, Loss: 0.1070401519536972
Epoch 23/100, Loss: 5.471078395843506
Epoch 24/100, Loss: 0.09327585250139236
Epoch 25/100, Loss: 0.12377150356769562
Epoch 26/100, Loss: 0.124

In [29]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerRegressor(nn.Module):
    def __init__(self, input_size, d_model, nhead, num_encoder_layers, dim_feedforward, output_size, dropout=0.1):
        super(TransformerRegressor, self).__init__()
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.input_fc = nn.Linear(input_size, d_model)
        transformer_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=num_encoder_layers)
        self.output_fc = nn.Linear(d_model, output_size)

    def forward(self, src):
        src = self.input_fc(src)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.output_fc(output[:, -1, :])
        return output
# Transformer parameter
d_model = 128  # Dimensions of input features after transformation
nhead = 8  # Number of heads in the attention mechanism
num_encoder_layers = 3  # Number of encoder layers
dim_feedforward = 512  # Dimensions of a feedforward network
dropout = 0.1  # Dropout rate
output_size = 1  # Output size

# Instantiate the Transformer model
model = TransformerRegressor(input_size=X_train.shape[1], d_model=d_model, nhead=nhead, num_encoder_layers=num_encoder_layers, dim_feedforward=dim_feedforward, output_size=output_size, dropout=dropout)

# Loss Function and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Training
epochs = 100
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

# Testing
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    test_mse = mean_squared_error(y_test_tensor.numpy(), predictions.numpy())
    print(f"Test MSE: {test_mse}")





Epoch 1/100, Loss: 1.5311529636383057
Epoch 2/100, Loss: 0.18352815508842468
Epoch 3/100, Loss: 0.8253981471061707
Epoch 4/100, Loss: 0.4678521752357483
Epoch 5/100, Loss: 0.13834364712238312
Epoch 6/100, Loss: 0.6138573884963989
Epoch 7/100, Loss: 6.434743881225586
Epoch 8/100, Loss: 0.20361846685409546
Epoch 9/100, Loss: 0.15003220736980438
Epoch 10/100, Loss: 0.14915168285369873
Epoch 11/100, Loss: 0.10881169885396957
Epoch 12/100, Loss: 0.2408105432987213
Epoch 13/100, Loss: 0.15023601055145264
Epoch 14/100, Loss: 6.617129802703857
Epoch 15/100, Loss: 0.2947746217250824
Epoch 16/100, Loss: 0.205468088388443
Epoch 17/100, Loss: 0.4163501262664795
Epoch 18/100, Loss: 0.07641012966632843
Epoch 19/100, Loss: 0.24723005294799805
Epoch 20/100, Loss: 0.10964818298816681
Epoch 21/100, Loss: 0.3335975706577301
Epoch 22/100, Loss: 7.079642295837402
Epoch 23/100, Loss: 7.1023640632629395
Epoch 24/100, Loss: 0.1038774773478508
Epoch 25/100, Loss: 0.8970283269882202
Epoch 26/100, Loss: 0.271213

In [31]:
import torch
import torch.nn as nn

class Autoencoder(nn.Module):
    def __init__(self, input_size, latent_size):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, latent_size),
            nn.ReLU(True)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, input_size),
            nn.Sigmoid()  # If the data is normalized between 0 and 1, use Sigmoid
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

    def encode(self, x):
        return self.encoder(x)
# Training parameters
num_epochs = 50
batch_size = 32
learning_rate = 1e-3

# Data loading
from torch.utils.data import DataLoader, TensorDataset
train_dataset = TensorDataset(X_train_tensor, X_train_tensor)  # Note that autoencoders are unsupervised learning
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Models and Optimizers
model = Autoencoder(input_size=X_train.shape[1], latent_size=32)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training the Autoencoder
model.train()
for epoch in range(num_epochs):
    for data in train_loader:
        inputs, _ = data
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
class RegressionNN(nn.Module):
    def __init__(self, latent_size, output_size):
        super(RegressionNN, self).__init__()
        self.fc = nn.Linear(latent_size, output_size)

    def forward(self, x):
        x = model.encode(x)  # Using the encoding part of the autoencoder
        x = self.fc(x)
        return x

# Initializing the regression model
reg_model = RegressionNN(latent_size=32, output_size=1)
reg_optimizer = torch.optim.Adam(reg_model.parameters(), lr=1e-3)
reg_criterion = nn.MSELoss()

# Training a regression model
reg_model.train()
for epoch in range(50):
    for inputs, targets in train_loader:
        reg_optimizer.zero_grad()
        outputs = reg_model(inputs)
        loss = reg_criterion(outputs, targets)
        loss.backward()
        reg_optimizer.step()
    print(f'Regression training, Epoch [{epoch+1}/50], Loss: {loss.item():.4f}')

# Testing the regression model
reg_model.eval()
with torch.no_grad():
    predictions = reg_model(X_test_tensor)
    test_mse = mean_squared_error(y_test_tensor.numpy(), predictions.numpy())
print(f'Regression test, Epoch',test_mse)
   



Epoch [1/50], Loss: 0.6602
Epoch [2/50], Loss: 0.8381
Epoch [3/50], Loss: 0.6851
Epoch [4/50], Loss: 0.8178
Epoch [5/50], Loss: 0.5978
Epoch [6/50], Loss: 0.5079
Epoch [7/50], Loss: 0.5452
Epoch [8/50], Loss: 0.4260
Epoch [9/50], Loss: 0.4988
Epoch [10/50], Loss: 0.7746
Epoch [11/50], Loss: 0.4792
Epoch [12/50], Loss: 0.9453
Epoch [13/50], Loss: 0.6672
Epoch [14/50], Loss: 0.6306
Epoch [15/50], Loss: 0.3773
Epoch [16/50], Loss: 0.5800
Epoch [17/50], Loss: 0.5053
Epoch [18/50], Loss: 0.3500
Epoch [19/50], Loss: 0.4740
Epoch [20/50], Loss: 0.3627
Epoch [21/50], Loss: 0.4099
Epoch [22/50], Loss: 0.3398
Epoch [23/50], Loss: 0.4707
Epoch [24/50], Loss: 0.3022
Epoch [25/50], Loss: 0.3740
Epoch [26/50], Loss: 0.3892
Epoch [27/50], Loss: 0.7826
Epoch [28/50], Loss: 0.3596
Epoch [29/50], Loss: 0.4740
Epoch [30/50], Loss: 0.3987
Epoch [31/50], Loss: 0.2699
Epoch [32/50], Loss: 0.3109
Epoch [33/50], Loss: 0.3854
Epoch [34/50], Loss: 0.6091
Epoch [35/50], Loss: 0.4199
Epoch [36/50], Loss: 0.3580
E

  return F.mse_loss(input, target, reduction=self.reduction)


Regression training, Epoch [20/50], Loss: 0.5425
Regression training, Epoch [21/50], Loss: 0.8510
Regression training, Epoch [22/50], Loss: 0.7369
Regression training, Epoch [23/50], Loss: 0.4423
Regression training, Epoch [24/50], Loss: 0.4419
Regression training, Epoch [25/50], Loss: 0.6166
Regression training, Epoch [26/50], Loss: 0.8737
Regression training, Epoch [27/50], Loss: 0.5804
Regression training, Epoch [28/50], Loss: 0.5185
Regression training, Epoch [29/50], Loss: 0.5262
Regression training, Epoch [30/50], Loss: 0.6537
Regression training, Epoch [31/50], Loss: 0.6634
Regression training, Epoch [32/50], Loss: 1.0410
Regression training, Epoch [33/50], Loss: 0.5106
Regression training, Epoch [34/50], Loss: 0.4483
Regression training, Epoch [35/50], Loss: 0.7309
Regression training, Epoch [36/50], Loss: 0.4626
Regression training, Epoch [37/50], Loss: 0.5746
Regression training, Epoch [38/50], Loss: 0.6227
Regression training, Epoch [39/50], Loss: 0.4429
Regression training,

##TSNE Confirmation on datset