# Exploring the dataset *Lysekil*

In [1]:
import sys
sys.path.append("../scripts")

import os
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
import load_data, preprocessing, visualize
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

# Show every column when a DataFrame is displayed instead of eliding the
# middle of wide frames.
pd.set_option('display.max_columns', None)

# Load the dataset used by the first part of this notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.read_pickle("../data/processed/renamed_Lysekil.pkl")

# Basic visualization

In [None]:
# Plot the dataset with the project helper `visualize.dropdown_plot`
# (presumably an interactive figure with a column selector -- confirm in
# ../scripts/visualize.py).
visualize.dropdown_plot(df)

In [None]:
# Pass every column except the first (positional slice) to the project's
# Q-Q plot helper. Presumably column 0 is "Time" -- TODO confirm.
visualize.qq_plot_all(df.iloc[:, 1:])

In [None]:
# One scatter trace per temperature column, plus a derived "SRD" series,
# all against the "Time" column.
fig = go.Figure()

for column, label in (
    ("T2", 'T2 - Desorber inlet'),
    ("T3", 'T3 - Over water wash'),
    ("T4", 'T4 - Under water wash'),
    ("T5", 'T5 - Desorber packing'),
    ("T7", 'T7 - Reboiler'),
):
    fig.add_trace(
        go.Scatter(
            x=df["Time"],
            y=df[column],
            mode='markers',
            marker=dict(size=4),
            name=label,
        )
    )

# Derived series computed from the U7 and F10 columns.
fig.add_trace(
    go.Scatter(
        x=df["Time"],
        y=30*3.6*df["U7"]/df["F10"],
        mode='markers',
        marker=dict(size=4),
        name='SRD',
    )
)

# Horizontal legend placed just above the plotting area.
legend_style = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.5,
)
fig.update_layout(
    title="Temperatures",
    legend=legend_style,
    xaxis_title="Time",
    yaxis_title="",
)

fig.show()

# Linear regression (replaced by LinerModel.ipynb)

\begin{equation}
    R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2}
\end{equation}

\begin{equation}
    RMSE = \sqrt{\frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{n}}
\end{equation}

The $R^2$ value represents the proportion of the variance in the dependent variable that is predictable from the independent variables.  

RMSE is a measure of the differences between values predicted by a model and the values actually observed. It is the square root of the average of the squared differences between prediction and actual observation. 

$y_i$: the observed values.

$\hat{y}_i$: the predicted values.

$\bar{y}$: the mean of the observed values.

$n$: the number of observations.

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score

def makeLinReg(X_train, y_train, cols, return_model=False, **kwargs):
    """
    Fit an ordinary least-squares model and report its R2 on the same data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        The training data features.

    y_train : pandas.Series or pandas.DataFrame
        The training data target variable.

    cols : list of str or str
        The column(s) of ``X_train`` to use as regressors. A single column
        name is treated as a one-element list.

    return_model : bool, default=False
        Whether to also return the fitted regression model.

    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    score : float
        R2 of the fitted model evaluated on ``X_train`` itself, i.e. a
        training score, not a validation score.

    reg : LinearRegression
        The fitted model. Only returned when ``return_model`` is True.

    Examples
    --------
    >>> makeLinReg(X_train, y_train, ["F10"])
    0.65543620879171
    """
    # A bare string would select a 1-D Series, and sizing the feature matrix
    # by len(cols) would then use the number of characters in the name.
    if isinstance(cols, str):
        cols = [cols]
    else:
        cols = list(cols)

    # Selecting with a list always yields a 2-D (n_samples, n_cols) array, so
    # no reshape is needed.
    features = X_train[cols].to_numpy()

    reg = LinearRegression(**kwargs)
    reg.fit(features, y_train)
    score = reg.score(features, y_train)
    if return_model:
        return score, reg

    return score

def performKFold(X, y, model, cols, splits=5, print_results=False, **kwargs):
    """
    Run shuffled K-fold cross-validation for a regression model factory.

    Parameters
    ----------
    X : pandas.DataFrame
        The feature matrix.

    y : pandas.Series or pandas.DataFrame
        The target variable, row-aligned with ``X``.

    model : callable
        Called as ``model(X_fold, y_fold, cols, return_model=True, **kwargs)``
        and expected to return ``(R2, fitted_model)``, where ``fitted_model``
        has a ``predict`` method (e.g. ``makeLinReg`` or ``makeRidgeReg``).

    cols : list
        The list of column names to use in the regression model.

    splits : int, default=5
        The number of splits for K-fold cross-validation.

    print_results : bool, default=False
        Whether to print the per-fold and average scores.

    **kwargs
        Forwarded unchanged to ``model``.

    Returns
    -------
    avg_R2 : float
        Mean over folds of the R2 returned by ``model``. With ``makeLinReg``
        and ``makeRidgeReg`` that is the R2 on the fold's *training* part.

    avg_RMSE : float
        Mean over folds of the RMSE on the held-out part of each fold.

    Notes
    -----
    Averages are computed from the unrounded per-fold values; rounding to
    two decimals is applied only to what is printed.

    Examples
    --------
    >>> performKFold(X_train, y_train, makeLinReg, ["F10"], splits=5, print_results=True)
    R2-scores: [0.26, 0.27, 0.26, 0.26, 0.27]
    RMSEs: [8.2, 8.17, 8.2, 8.37, 8.2]
    Average R2-score: 0.26
    Average RMSE: 8.23
    """
    # Fixed seed so repeated calls evaluate on identical folds.
    cv = KFold(n_splits=splits, random_state=0, shuffle=True)
    R2s, RMSEs = [], []
    for train_index, test_index in cv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        R2, reg = model(X_train, y_train, cols, return_model=True, **kwargs)
        y_pred = reg.predict(X_test[cols].values)
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated and absent in newer scikit-learn.
        RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
        R2s.append(R2)
        RMSEs.append(RMSE)

    avg_R2 = sum(R2s) / len(R2s)
    avg_RMSE = sum(RMSEs) / len(RMSEs)
    if print_results:
        print(f"R2-scores: {[round(float(v), 2) for v in R2s]}")
        print(f"RMSEs: {[round(float(v), 2) for v in RMSEs]}")
        print(f"Average R2-score: {avg_R2:.2f}")
        print(f"Average RMSE: {avg_RMSE:.2f}")

    return avg_R2, avg_RMSE

def forward_selection(X_train, y_train, model, kwargs=None, splits=5):
    """
    Rank the features by greedy forward selection.

    Starting from an empty set, each round evaluates every remaining column
    together with the already selected ones via ``performKFold`` and keeps
    the column giving the highest average R2. This continues until every
    column except "Time" has been selected, so the result is a full ordering
    of the features rather than a truncated subset.

    Parameters
    ----------
    X_train : pandas.DataFrame
        The training data features. A column named "Time" is ignored.

    y_train : pandas.Series or pandas.DataFrame
        The training data target variable.

    model : callable
        The regression model function, passed through to ``performKFold``.

    kwargs : dict, optional
        Extra keyword arguments forwarded through ``performKFold`` to
        ``model``. ``None`` (the default) means no extra arguments.

    splits : int, default=5
        The number of splits for K-fold cross-validation.

    Returns
    -------
    optimal_cols : list
        The column names in the order they were selected.

    best_R2s : list
        The average R2 of the winning candidate in each round.

    best_RMSEs : list
        The average RMSE of the winning candidate in each round.

    Examples
    --------
    >>> cols, R2s, RMSEs = forward_selection(X_train, y_train, makeLinReg, splits=5)
    >>> cols[:5]
    ['F10', 'D1', 'T10', 'T9', 'T5']
    """
    # Avoid a shared mutable default argument.
    model_kwargs = {} if kwargs is None else kwargs

    remaining = [col for col in X_train.columns if col != "Time"]
    optimal_cols, best_R2s, best_RMSEs = [], [], []
    while remaining:
        performance_metrics = []
        for col in remaining:
            candidate = optimal_cols + [col]
            R2, RMSE = performKFold(X_train, y_train,
                                    model, candidate, splits=splits,
                                    **model_kwargs)
            performance_metrics.append((R2, RMSE, col))

        # Highest R2 wins; on ties max() keeps the first candidate evaluated.
        best_R2, best_RMSE, best_col = max(performance_metrics,
                                           key=lambda x: x[0])
        best_R2s.append(best_R2)
        best_RMSEs.append(best_RMSE)
        optimal_cols.append(best_col)
        remaining.remove(best_col)

    return optimal_cols, best_R2s, best_RMSEs

In [None]:
# Split the data into X and y
# NOTE(review): `new_df` is not defined anywhere in the visible notebook;
# presumably it came from a cell that has since been removed -- confirm which
# frame is meant before re-running from a clean kernel.
y_list = ["U7"]
X = new_df.drop(y_list, axis=1)
y = new_df[y_list]  # list indexing keeps y as a one-column DataFrame

# Split the data into training and test sets. Leave test till the end
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.30, 
                                                    random_state=0,
                                                    shuffle=True)



# Scale the data
# Every column except 'Time' is standardised; the scaler is fitted on the
# training part only. X_test is left unscaled in this cell.
cols = [col for col in X_train.columns if col != 'Time']
scaler = StandardScaler()
X_train[cols] = scaler.fit_transform(X_train[cols])

# Arguments for LinearRegression() inside makeLinReg()
kwargs = {}

# Find the optimal columns for the linear regression model
# forward_selection returns the full selection order plus the per-round
# scores, which later cells read from these notebook-level names.
optimal_cols, best_R2s, best_RMSEs = forward_selection(X_train, 
                                                       y_train, 
                                                       makeLinReg,
                                                       kwargs=kwargs,
                                                       splits=10)    
print(optimal_cols)

In [None]:
# Forward-selection scores against the number of selected features:
# R2 on the primary y-axis, RMSE on the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])

for scores, label, on_secondary in (
    (best_R2s, "R<sup>2", False),
    (best_RMSEs, "RMSE", True),
):
    feature_counts = list(range(1, len(scores)+1))
    fig.add_trace(
        go.Scatter(x=feature_counts, y=scores, name=label, marker=dict(size=8)),
        secondary_y=on_secondary,
    )

# Axis titles
fig.update_yaxes(title_text="R<sup>2", secondary_y=False)
fig.update_yaxes(title_text="RMSE", secondary_y=True)

# Figure-level layout, then render
legend_style = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.15,
)
fig.update_layout(
    xaxis_title="Number of Features",
    title="Ordinary Linear Regression",
    height=500,
    width=1000,
    legend=legend_style,
    template="simple_white",
)

fig.show()

In [None]:
# Fit on the first 12 forward-selected features and rebuild the training
# predictions by hand from the fitted coefficients and intercept.
score, linreg = makeLinReg(X_train, y_train, optimal_cols[:12], return_model=True)
coef = linreg.coef_
intercept = linreg.intercept_

y_pred = X_train[optimal_cols[:12]].dot(coef.T) + intercept

fig = go.Figure()

for label, values in (('Actual', y_train), ('Prediction', y_pred)):
    fig.add_trace(
        go.Scatter(
            x=X_train["Time"],
            y=values.squeeze(),
            mode='markers',
            marker=dict(size=4),
            name=label,
        )
    )

legend_style = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.15,
)
fig.update_layout(
    title="Modelling the reboiler duty using linear regression",
    legend=legend_style,
    xaxis_title="Time",
    yaxis_title="Reboiler duty [MW]",
)

fig.show()

In [None]:
# Refit on the first 12 forward-selected features, then evaluate on the
# held-out test set.
score, linreg = makeLinReg(X_train, y_train, optimal_cols[:12], return_model=True)

# The scaler has to be given the same columns it was fitted with, so the
# full feature set is transformed first and the model columns are picked
# afterwards.
cols = [col for col in X if col != 'Time']
X_scaled = scaler.transform(X_test[cols])
X_df = pd.DataFrame(X_scaled, columns=cols)
X_df_optimal = X_df[optimal_cols[:12]]
y_pred = linreg.predict(X_df_optimal.values)

# Test RMSE. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and absent
# in newer scikit-learn.
print(np.sqrt(mean_squared_error(y_test, y_pred)))
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=X_test["Time"], 
        y=y_test.squeeze(), 
        mode='markers', 
        marker=dict(size=4),
        name='Testing data',
    )
)

fig.add_trace(
    go.Scatter(
        x=X_test["Time"], 
        y=y_pred.squeeze(), 
        mode='markers', 
        marker=dict(size=4),
        name='Prediction',
    )
)

fig.update_layout(title="Modelling the reboiler duty using linear regression", 
                  legend=dict(
                        orientation="h",
                        yanchor="bottom",
                        y=1.02,
                        xanchor="right",
                        x=0.15
                    ),
                  xaxis_title="Time", 
                  yaxis_title="Reboiler duty [MW]")

fig.show()

In [None]:
import matplotlib.pyplot as plt

# Parity plot: each point is one test sample, observed value on the x-axis
# and model prediction on the y-axis. Uses `y_test` and `y_pred` as left
# behind by the previous cell.
plt.scatter(y_test, y_pred)

# Add labels and title
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Parity Plot')

plt.show()


In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

def makeRidgeReg(X_train, y_train, cols, return_model=False, **kwargs):
    """
    Fit a ridge regression model and report its R2 on the same data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        The training data features.

    y_train : pandas.Series or pandas.DataFrame
        The training data target variable.

    cols : list of str or str
        The column(s) of ``X_train`` to use as regressors. A single column
        name is treated as a one-element list.

    return_model : bool, default=False
        Whether to also return the fitted regression model.

    **kwargs
        Forwarded unchanged to ``Ridge`` (e.g. ``alpha``).

    Returns
    -------
    score : float
        R2 of the fitted model evaluated on ``X_train`` itself, i.e. a
        training score, not a validation score.

    reg : Ridge
        The fitted model. Only returned when ``return_model`` is True.

    Examples
    --------
    >>> makeRidgeReg(X_train, y_train, ["F10"])
    0.655436203278172
    """
    # A bare string would select a 1-D Series, and sizing the feature matrix
    # by len(cols) would then use the number of characters in the name.
    if isinstance(cols, str):
        cols = [cols]
    else:
        cols = list(cols)

    # Selecting with a list always yields a 2-D (n_samples, n_cols) array, so
    # no reshape is needed.
    features = X_train[cols].to_numpy()

    reg = Ridge(**kwargs)
    reg.fit(features, y_train)
    score = reg.score(features, y_train)
    if return_model:
        return score, reg

    return score

# Repeat forward selection with ridge regression for several regularisation
# strengths, storing the per-round scores under each alpha.
# Note: `kwargs`, `optimal_cols`, `best_R2s` and `best_RMSEs` are
# notebook-level names and end up holding the values of the last alpha.
R2_ridge, RMSE_ridge = {}, {}
params = {"alpha": [0.1, 0.2177, 0.3677, 1, 10, 100]}

for alpha in params["alpha"]:
    kwargs = {"alpha": alpha}
    optimal_cols, best_R2s, best_RMSEs = forward_selection(
        X_train, y_train, makeRidgeReg, kwargs=kwargs, splits=10
    )
    R2_ridge[alpha], RMSE_ridge[alpha] = best_R2s, best_RMSEs

In [None]:
# R2 table for the ridge sweep: after the transpose there is one row per
# alpha and one column per forward-selection round.
R2_ridge_df = pd.DataFrame(R2_ridge).T
R2_ridge_df

In [None]:
# RMSE table for the ridge sweep: after the transpose there is one row per
# alpha and one column per forward-selection round.
RMSE_ridge_df = pd.DataFrame(RMSE_ridge).T
RMSE_ridge_df

# Trying to remove data points using DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Select the column to analyze (as an (n_samples, 1) array for scikit-learn)
col = "P6"
column_to_analyze = X_train[col].values.reshape(-1, 1)

# Scale the column (DBSCAN is sensitive to distances).
# A dedicated name is used so the notebook-level `scaler` fitted on the
# training features is not overwritten by this experiment.
p6_scaler = StandardScaler()
column_scaled = p6_scaler.fit_transform(column_to_analyze)

# Apply DBSCAN
dbscan = DBSCAN(eps=0.01, min_samples=100)  # Adjust the parameters according to your specific needs
clusters = dbscan.fit_predict(column_scaled)

# Identify outliers (DBSCAN labels points it could not cluster as -1)
outlier_mask = (clusters == -1)

# Keep the rows that were not flagged. The explicit copy makes the offset
# below act on an independent frame instead of a boolean-filtered slice of
# X_train, which avoids pandas' chained-assignment warning.
df_no_outliers = X_train[~outlier_mask].copy()

# Shift the filtered series down by 1 so it does not sit on top of the
# original points in the figure; this is a display offset only.
df_no_outliers[col] -= 1

fig = go.Figure()

fig.add_trace(go.Scatter(x=X_train["Time"], 
                        y=X_train[col],
                    name="Original", mode='markers', marker=dict(size=3)))

fig.add_trace(go.Scatter(x=df_no_outliers["Time"], 
                        y=df_no_outliers[col],
                    name="With outliers removed", mode='markers', marker=dict(size=3)))

fig.show()

# Length of the mask (equals the number of rows in X_train), followed by
# the row counts before and after filtering.
print(len(outlier_mask))
print("Original data points:", len(X_train.index))
print("With outliers removed:", len(df_no_outliers.index))

In [None]:
# Compare another column before and after the outlier filter of the
# previous cell.
col1 = "P2"

# Display offset only: the filtered series is drawn 1 unit lower so both
# point clouds stay visible. It is computed into a new Series rather than
# written back into `df_no_outliers`, so re-running this cell does not shift
# the data again and no filtered slice is modified in place.
filtered_shifted = df_no_outliers[col1] - 1

fig = go.Figure()

fig.add_trace(go.Scatter(x=X_train["Time"], 
                        y=X_train[col1],
                    name="Original", mode='markers', marker=dict(size=3)))

fig.add_trace(go.Scatter(x=df_no_outliers["Time"], 
                        y=filtered_shifted,
                    name="With outliers removed", mode='markers', marker=dict(size=3)))

fig.show()

### Trial 2

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Select the columns to analyze (all except the first one)
columns_to_analyze = X_train.iloc[:, 1:].values

# Scale the data (DBSCAN is sensitive to distances).
# A dedicated name is used so the notebook-level `scaler` fitted on the
# training features is not overwritten by this experiment.
trial2_scaler = StandardScaler()
columns_scaled = trial2_scaler.fit_transform(columns_to_analyze)

# Apply DBSCAN
dbscan = DBSCAN(eps=0.4, min_samples=5)  # Adjust the parameters according to your specific needs
clusters = dbscan.fit_predict(columns_scaled)

# Identify outliers (DBSCAN labels points it could not cluster as -1)
outlier_mask2 = (clusters == -1)

# Keep the rows that were not flagged. The explicit copy makes later edits
# act on an independent frame instead of a boolean-filtered slice of X_train.
df_no_outliers2 = X_train[~outlier_mask2].copy()

# Print some information about the results
print("Number of outliers:", int(outlier_mask2.sum()))
print("Original data points:", len(X_train.index))


col1 = "P6"
# Shift the filtered series down by 1 so it does not sit on top of the
# original points in the figure; this is a display offset only.
df_no_outliers2[col1] -= 1
fig = go.Figure()

fig.add_trace(go.Scatter(x=X_train["Time"], 
                        y=X_train[col1],
                    name="Original", mode='markers', marker=dict(size=3)))

fig.add_trace(go.Scatter(x=df_no_outliers2["Time"], 
                        y=df_no_outliers2[col1],
                    name="With outliers removed", mode='markers', marker=dict(size=3)))

fig.show()

# Length of the mask (equals the number of rows in X_train), followed by
# the row counts before and after filtering.
print(len(outlier_mask2))
print("Original data points:", len(X_train.index))
print("With outliers removed:", len(df_no_outliers2.index))

In [None]:
#col1 = "Downstream-Rich-TT1043"
col1 = "T8"

# Display offset only: the filtered series is drawn 40 units higher so both
# point clouds stay visible. It is computed into a new Series rather than
# written back into `df_no_outliers2`, so re-running this cell does not
# shift the data again and no filtered slice is modified in place.
filtered_shifted = df_no_outliers2[col1] + 40

fig = go.Figure()

fig.add_trace(go.Scatter(x=X_train["Time"], 
                        y=X_train[col1],
                    name="Original", mode='markers', marker=dict(size=3)))

fig.add_trace(go.Scatter(x=df_no_outliers2["Time"], 
                        y=filtered_shifted,
                    name="With outliers removed", mode='markers', marker=dict(size=3)))

fig.show()

# Cross-correlation

In [None]:
import numpy as np
import pandas as pd

def calculate_time_delays(df, time_column):
    """
    Estimate the sample lag between every pair of columns by cross-correlation.

    Each series is mean-centred, and for every ordered pair the lag at which
    the full cross-correlation is largest is recorded. The lag is counted in
    samples (rows), not in units of the time column; rows are assumed to be
    equally spaced.

    :param df: A pandas DataFrame with each column being a numeric time
        series variable, including a time column. It is not modified.
    :param time_column: The name of the column containing time stamps that
        should be excluded from the analysis.
    :return: An integer DataFrame indexed by variable on both axes. Entry
        ``[col1, col2]`` is the lag; a positive value means ``col1`` trails
        ``col2`` by that many samples. The diagonal is 0.
    """
    # Exclude the time column from the analysis
    df_without_time = df.drop(columns=[time_column])
    names = list(df_without_time.columns)
    n_samples = len(df_without_time)

    # Centre every series once instead of once per pair.
    centred = {
        name: (df_without_time[name] - df_without_time[name].mean()).to_numpy(dtype=float)
        for name in names
    }

    # Start from zeros so the frame really has an integer dtype and the
    # diagonal (a variable against itself) needs no further work.
    time_delays = pd.DataFrame(0, index=names, columns=names, dtype=int)

    for col1 in names:
        for col2 in names:
            if col1 == col2:
                continue

            cross_corr = np.correlate(centred[col1], centred[col2], mode='full')

            # In 'full' mode index 0 corresponds to lag -(n_samples - 1), so
            # subtracting that offset converts the peak position to a lag.
            time_delays.at[col1, col2] = np.argmax(cross_corr) - (n_samples - 1)

    return time_delays

# Example usage:
# Assume 'df' is your pandas DataFrame with the time series data, including a 'Time' column.
# df = pd.read_csv('your_dataset.csv')  # Example to load your dataset

# Calculate the time delays, excluding the 'Time' column
# NOTE(review): np.correlate does not use an FFT, and it is called once per
# ordered pair of columns, so this can be slow on long series.
time_delays = calculate_time_delays(df, 'Time')
print(time_delays)


# Finding time delay

In [None]:
# Fresh scaler for this section (rebinds the notebook-level name `scaler`).
scaler = StandardScaler()

# Switch to the SRD dataset and show its summary statistics.
# NOTE(review): only unpickle files that come from a trusted source.
df = pd.read_pickle("../data/processed/SRD_Lysekil.pkl")
df.describe()

In [None]:
# Drop the time stamps, then standardise the measurements.
df = df.drop(columns=["Time"])

# Only numeric columns can be standardised. Other cells in this notebook
# read a text "Status" column from this same pickle, and StandardScaler
# cannot process text, so non-numeric columns are left untouched.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df.describe()

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Reload the SRD dataset and overlay three temperatures with the SRD series.
df = pd.read_pickle("../data/processed/SRD_Lysekil.pkl")
df_scaled = df#[df['Time'] < '2020-05-26']

#scaler = RobustScaler()
#df_scaled.iloc[:, 1:] = scaler.fit_transform(df_scaled.iloc[:, 1:])

var1, var2, var3, var4 = "T4", "T5", "T7", "SRD"

# Factor applied for display only; series not listed here are drawn as-is.
display_factor = {var4: 20}

fig = go.Figure()
for series_name in (var1, var2, var3, var4):
    values = df_scaled[series_name]
    if series_name in display_factor:
        values = values*display_factor[series_name]
    fig.add_trace(
        go.Scatter(
            x=df_scaled["Time"],
            y=values,
            mode='markers',
            name=series_name,
            marker=dict(size=2),
        )
    )
fig.show()

# More EDA

In [None]:
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load your DataFrame
# NOTE(review): only unpickle files that come from a trusted source.
df = pd.read_pickle("../data/processed/SRD_Lysekil.pkl")
# Everything except the time stamps and the 'Status' label is used as input.
features = df.drop(columns=['Time', 'Status'])

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Initialize and fit UMAP (fixed seed for a reproducible embedding)
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(scaled_features)

# Convert 'Status' to categorical codes
# `categories` keeps the label text; `coded_labels` holds one integer per
# row that indexes into `categories` and is used to colour the points.
status_labels = df['Status'].astype('category')
categories = status_labels.cat.categories
coded_labels = status_labels.cat.codes

# Plotting
# The first two embedding columns are used as plot coordinates.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=coded_labels, cmap='coolwarm', alpha=0.7)

# Correcting colorbar ticks and labels
# One tick per category code, relabelled with the category name.
cbar = plt.colorbar(scatter, ticks=np.arange(len(categories)))
cbar.set_ticklabels(categories)

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the Dataset', fontsize=18)
plt.show()


# K-means clustering

In [None]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Reload the dataset; it is expected to contain a 'Time' column and
# otherwise numerical columns.
df = pd.read_pickle("../data/processed/renamed_Lysekil.pkl")

# Step 1: Preprocess the data
# Remove the 'Time' column for clustering
X = df.drop('Time', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply K-Means Clustering
# Initialize KMeans with 2 clusters (assuming we want to classify as 'Transient' or 'Stable')
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_scaled)

# Step 3: Assign the labels back to the DataFrame
# The labels_ attribute of KMeans will give you the cluster index for each sample
df['Category'] = kmeans.labels_

# Replace the cluster indices with names.
# NOTE(review): nothing in this cell establishes which cluster index is the
# stable regime; the 0 -> 'Stable', 1 -> 'Transient' assignment is an
# assumption that has to be checked against the data.
df['Category'] = df['Category'].map({0: 'Stable', 1: 'Transient'})

# Summarise the resulting labels (count, number of unique values, most
# frequent label and its frequency)
df['Category'].describe()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import umap
from sklearn.cluster import KMeans
# HDBSCAN could be a better choice after UMAP, but here we stick with KMeans for simplicity
# import hdbscan

# Reload the dataset so this cell starts from a frame without label columns
# added by earlier cells.
df = pd.read_pickle("../data/processed/renamed_Lysekil.pkl")
# Step 1: Preprocess the data
# Remove the 'Time' column for clustering
X = df.drop('Time', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply UMAP for dimensionality reduction
# (fixed seed for a reproducible embedding)
umap_reducer = umap.UMAP(random_state=42)
X_reduced = umap_reducer.fit_transform(X_scaled)

# Step 3: Apply K-Means Clustering on reduced data
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_reduced)

# Step 4: Assign the labels back to the DataFrame
df['Category'] = kmeans.labels_

# Replace the cluster indices with names.
# NOTE(review): which index corresponds to 'Stable' is not established by
# this cell; the mapping is an assumption to verify.
df['Category'] = df['Category'].map({0: 'Stable', 1: 'Transient'})

# Check the first few rows to verify
print(df.head())


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import umap
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Reload, standardise, embed with UMAP, cluster the embedding into 13 groups
# with K-Means, and plot the embedding coloured by cluster.
df = pd.read_pickle("../data/processed/renamed_Lysekil.pkl")
X = df.drop(columns='Time')

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

umap_reducer = umap.UMAP(random_state=42)
X_reduced = umap_reducer.fit_transform(X_scaled)

# fit() returns the estimator itself, so it can be chained.
kmeans = KMeans(n_clusters=13, random_state=42).fit(X_reduced)
df['Cluster'] = kmeans.labels_

axes = plt.figure(figsize=(10, 8)).gca()
sns.scatterplot(
    x=X_reduced[:, 0],
    y=X_reduced[:, 1],
    hue=df['Cluster'],
    palette="viridis",
    s=50,
    alpha=0.6,
    ax=axes,
)
axes.set_title('UMAP Projection with K-Means Clustering')
axes.set_xlabel('UMAP 1')
axes.set_ylabel('UMAP 2')
axes.legend(title='Cluster')
plt.show()

In [None]:
# Number of rows assigned to each cluster label.
cluster_counts = df['Cluster'].value_counts()
cluster_counts

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import umap
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming `df` is your DataFrame and it contains a 'Time' column and other numerical columns

# Step 1: Preprocess the data
# Remove the 'Time' column for clustering. `df` is reused from the previous
# cells and may already carry a 'Cluster' column (from the K-Means cell or
# from an earlier run of this one); it is dropped too so that old cluster
# labels are not fed back in as a feature. errors='ignore' keeps the cell
# working on a freshly loaded frame.
X = df.drop(columns=['Time', 'Cluster'], errors='ignore')

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply UMAP for dimensionality reduction
umap_reducer = umap.UMAP(random_state=42)
X_reduced = umap_reducer.fit_transform(X_scaled)

# Step 3: Apply HDBSCAN Clustering on reduced data
hdbscan_cluster = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=2, cluster_selection_epsilon=0.5)
hdbscan_cluster.fit(X_reduced)
df['Cluster'] = hdbscan_cluster.labels_

# Find the number of clusters identified (excluding noise, if present)
# Noise points carry the label -1 and are not counted as a cluster.
num_clusters = len(set(hdbscan_cluster.labels_)) - (1 if -1 in hdbscan_cluster.labels_ else 0)
print(f"Number of clusters found: {num_clusters}")

# Step 4: Plot the 2D projection with clusters
plt.figure(figsize=(10, 8))
# Use a palette that can handle the -1 label for noise, if it exists
palette = sns.color_palette('viridis', as_cmap=True)
sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=df['Cluster'], palette=palette, s=50, alpha=0.6)
plt.title('UMAP Projection with HDBSCAN Clustering')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.show()

In [None]:
# Number of rows per cluster label, largest cluster first.
cluster_counts = df['Cluster'].value_counts()
cluster_counts.sort_values(ascending=False)

In [None]:
# Overlay two groups of rows on top of the full series for one variable.
# The label 73 is a hard-coded cluster id from a particular clustering run;
# -1 is the label the HDBSCAN cell treats as noise.
cluster1 = df[df['Cluster'] == 73]
cluster2 = df[df['Cluster'] == -1]

var = "F1"

fig = go.Figure()

# Full data set, faded, as background.
fig.add_trace(
    go.Scatter(
        x=df['Time'],
        y=df[var],
        mode='markers',
        marker=dict(size=2, opacity=0.4),
        name='Original data',
    )
)

# Each subset is plotted against its own time stamps. The x and y values
# of a trace must come from the same subset, otherwise the points are
# paired with unrelated times.
for label, subset in (('Cluster 1', cluster1), ('Cluster 2', cluster2)):
    fig.add_trace(
        go.Scatter(
            x=subset['Time'],
            y=subset[var],
            mode='markers',
            marker=dict(size=2),
            name=label,
        )
    )

fig.show()

## Checking T3, T4, T5

In [12]:
# Summary statistics for the frame currently bound to `df`.
df.describe()


Unnamed: 0,F1,D1,T1,P2,T2,T3,T4,T5,P6,T7,F8,D8,T9,T10,SRD
count,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0,21894.0
mean,1.825203,1095.48877,46.929695,1.444231,106.810997,93.529877,101.91877,104.214111,0.885619,119.218117,2.130471,1027.800781,71.37011,25.685984,3.891976
std,0.316277,6.166871,3.147577,0.10434,1.959276,3.70884,1.952656,3.554724,0.080953,1.763365,0.272815,7.38758,5.35615,6.300364,0.961009
min,0.067001,1041.672363,23.370068,0.839493,71.6353,14.843035,51.803734,69.266319,0.291982,81.412819,0.357544,1009.221375,30.040361,11.346413,0.070981
25%,1.64103,1090.596527,44.944853,1.43436,105.986967,92.290932,101.340199,102.667849,0.898906,118.961432,1.978549,1022.99762,69.01819,18.892883,3.771261
50%,1.812628,1095.373962,46.48996,1.448445,106.550156,93.717636,101.918541,103.724018,0.900358,119.860928,2.068832,1027.149475,72.464329,29.498544,3.832877
75%,2.065185,1099.678406,48.348411,1.487149,108.116116,95.107573,102.790236,105.817825,0.901791,120.011168,2.311112,1033.097015,75.242456,30.167948,3.901071
max,2.557114,1114.453491,61.029945,2.238056,112.460258,113.577332,116.803635,118.533478,1.503647,127.563393,2.930947,1093.196899,100.083435,46.28294,92.789858


In [11]:
# Count how often neighbouring temperature readings are in "reversed" order
# and report the share of affected rows in percent.
df = pd.read_pickle("../data/processed/SRD_Lysekil.pkl")
total_rows = len(df)

# Rows where T7 is below T5
count = int((df['T7'] < df['T5']).sum())
print(100*count/total_rows)

# Rows where T5 is below T4 (absolute count and percentage)
count = int((df['T5'] < df['T4']).sum())
print(count, 100*count/total_rows)

# Rows where T4 is below T3
count = int((df['T4'] < df['T3']).sum())
print(100*count/total_rows)

# Same T5 < T4 check, restricted to rows whose Status is "Stable"
noe = df[df['Status'] == "Stable"]
count = int((noe['T5'] < noe['T4']).sum())
print(count, 100*count/len(noe))

0.0
4687 21.407691605005937
0.0
3799 24.346321456036915


In [7]:
# One scatter trace per temperature column against the "Time" column.
fig = go.Figure()

for column, label in (
    ("T2", 'T2 - Desorber inlet'),
    ("T3", 'T3 - Over water wash'),
    ("T4", 'T4 - Under water wash'),
    ("T5", 'T5 - Desorber packing'),
    ("T7", 'T7 - Reboiler'),
):
    fig.add_trace(
        go.Scatter(
            x=df["Time"],
            y=df[column],
            mode='markers',
            marker=dict(size=4),
            name=label,
        )
    )

# The SRD trace is disabled here: it is wrapped in a string literal, which
# is evaluated and discarded without adding anything to the figure.
"""fig.add_trace(
    go.Scatter(
        x=df["Time"], 
        y=30*3.6*df["U7"]/df["F10"], 
        mode='markers', 
        marker=dict(size=4),
        name='SRD',
    )
)"""

# Horizontal legend placed just above the plotting area.
legend_style = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.5,
)
fig.update_layout(
    title="Temperatures",
    legend=legend_style,
    xaxis_title="Time",
    yaxis_title="",
)

fig.show()