In [None]:
#imports
import glob
import os.path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from IMS_functions import load_raw_data, alignment, integrate_spectrum, baseline_correction


In [None]:
# Display names used as plot titles / legend entries, keyed by fermentation id.
fermentation = {"ferm3" : "Fermentation 3", "ferm6" : "Fermentation 6", "ferm7" : "Fermentation 7"}

# Load the metadata sheet (indexed by fermentation number) and build dicts that
# map "ferm<nr>" to the start / end timestamp of the IMS measurement.
metadata = pd.read_excel(r"C:\IMS\metadata_IMS.xlsx", index_col = "fermentation")

exp_numbers = [3, 6, 7]

start_dict = {"ferm{0}".format(nr): metadata["start_IMS"].loc[nr] for nr in exp_numbers}
end_dict = {"ferm{0}".format(nr): metadata["end_IMS"].loc[nr] for nr in exp_numbers}


# Load the offline HPLC ethanol data: key "ferm<nr>" -> DataFrame with the
# columns "ts" (timestamp) and "cE" (ethanol concentration).
path_Off = r'C:\IMS\Offline'
offline_files = [os.path.join(path_Off, "offline{0}.xlsx".format(i)) for i in exp_numbers]
offline_dict = {}

for f, i in zip(offline_files, exp_numbers):
    offline_dict["ferm{0}".format(i)] = pd.read_excel(f, usecols = [0,4], names =["ts","cE"] )

# Keep only samples inside the IMS measurement window and index them by timestamp.
# The selection and the re-indexing are chained so that a new frame is produced;
# calling set_index(..., inplace=True) on a filtered slice raises a
# SettingWithCopyWarning.
for ferm, df in offline_dict.items():
    in_window = (df["ts"] >= start_dict[ferm]) & (df["ts"] <= end_dict[ferm])
    offline_dict[ferm] = df.loc[in_window].set_index("ts")

offline_merged_df = pd.concat(list(offline_dict.values()))


# One directory of raw IMS files per fermentation.
path3 = r"C:\IMS\IMS_3"
path6 = r"C:\IMS\IMS_6"
path7 = r"C:\IMS\IMS_7"
path_list = [path3, path6, path7]


In [None]:
# Read the raw IMS measurements and align them in one go.
# load_raw_data returns a dictionary with key: timestamp, value: DataFrame (2D spectrum).
# alignment creates a common drift_time grid, sets the RIP peak to 1 and relates the other peaks to it.
load_kwargs = dict(
    path_list = path_list,
    S_ds = 1,
    N_ds = 1,
    start_dict = start_dict,
    end_dict = end_dict,
)
dat_wide_as_dict = alignment(load_raw_data(**load_kwargs))

# Note: alignment() can fail when a large amount of data is processed at once.
# Workaround: reduce the amount of data by downsampling with N_ds, or choose an earlier end timepoint so that less data is read.

In [None]:
# # Generate static images of the 2D spectra.
# # Executing this cell makes the rest of the notebook very slow, so run it last if it is needed at all.
# zmax = 1000 # max. value for color scale

# few_ts = [list(dat_wide_as_dict.keys())[i] for i in np.arange(0, len(dat_wide_as_dict), 60)]

# for ts in few_ts:
#     fig = px.imshow(dat_wide_as_dict[ts], labels={'y': 'Retention time [s]', 'x': 'Normalized drift time', 'z' : 'Signal intensity [a.u.]'},               #zaxis?
#                     aspect='auto', title=str(ts), origin='lower', zmin=0, zmax = zmax)
#     fig.show()
#     break
#     if ts >= pd.to_datetime("2020-12-03 12:00") & ts <= pd.to_datetime("2020-12-03 12:30"):
#         break


In [None]:
# Integrate at each drift_time over all retention times; every row of the
# resulting data frame is one integrated 1D spectrum.
col_sum_df = integrate_spectrum(dat_wide_as_dict)

# Keep only the columns whose drift-time label lies between 10% and 90% of the
# largest label. The discarded edges possibly contain interference signals;
# removing them is beneficial for the subsequent baseline correction.
min_max = {"min" : 0.1, "max" : 0.9}
max_drift = np.max(col_sum_df.columns)
# Named drift_mask (not `filter`) so that the builtin filter() is not shadowed.
drift_mask = (col_sum_df.columns >= min_max["min"] * max_drift) & (col_sum_df.columns <= min_max["max"] * max_drift)
col_sum_df = col_sum_df.loc[:, drift_mask]

In [None]:
# Show the first integrated spectrum before baseline correction.
for _, spectrum in col_sum_df.head(1).iterrows():
    fig = px.line(x = col_sum_df.columns, y = spectrum)
    fig.update_xaxes(title_text="Normalized drift time")
    fig.update_yaxes(title_text="Signal intensity [a.u.]")
    fig.show()

In [None]:
# Baseline correction of the integrated spectra.
# "Zhang" is handed to baseline_correction as its second argument — presumably the
# name of the correction method; confirm against IMS_functions.
# NOTE: col_sum_df is rebound to the returned frame, so re-running this cell without
# re-running the integration cell passes already corrected data in again.
col_sum_df = baseline_correction(col_sum_df, "Zhang")

In [None]:
# Show the first integrated spectrum after baseline correction.
drift_axis = col_sum_df.columns
for _, corrected_spectrum in col_sum_df.iloc[:1].iterrows():
    fig = px.line(x = drift_axis, y = corrected_spectrum)
    fig.update_xaxes(title_text="Normalized drift time")
    fig.update_yaxes(title_text="Signal intensity [a.u.]")
    fig.show()

In [None]:
# Drift-time span considered for the PCA; it includes the two ethanol peaks.
PCA_drift_borders = {"start" : 1.02, "end" : 1.18}

# Boolean column mask for the PCA span. Named pca_mask (not `filter`) so that
# the builtin filter() is not shadowed.
drift_values = col_sum_df.columns.values
pca_mask = (drift_values >= PCA_drift_borders["start"]) & (drift_values <= PCA_drift_borders["end"])
col_sum_df_filtered = col_sum_df.iloc[:, pca_mask]

# Standardise every drift-time column, then project the spectra onto the
# first two principal components.
scaled_spectra = StandardScaler().fit_transform(col_sum_df_filtered.values)

# random_state is fixed so the scores are reproducible on a re-run even when
# scikit-learn's "auto" solver selects the randomized SVD.
pca = PCA(n_components=2, random_state=0)
principalComponents = pca.fit_transform(scaled_spectra)

# One row of PC1/PC2 scores per spectrum, indexed like the spectra themselves.
principalDf = pd.DataFrame(
    data = principalComponents,
    columns = ['principal component 1', 'principal component 2'],
    index = col_sum_df_filtered.index,
)

fig = px.scatter(principalDf, x="principal component 1", y="principal component 2", color= principalDf.index)
fig.show()

In [None]:
# Show the drift-time section that enters the PCA, using the first spectrum as example.
for _, pca_section in col_sum_df_filtered.head(1).iterrows():
    fig = px.line(x = col_sum_df_filtered.columns, y = pca_section)
    fig.update_xaxes(title_text="Normalized drift time")
    fig.update_yaxes(title_text="Signal intensity [a.u.]")
    fig.show()

In [None]:
# For every HPLC sample find the IMS spectrum that is closest in time and
# attach the measured ethanol concentration and the experiment name to its
# PC scores.
PCA_dict = {}
for ferm, df in offline_dict.items():
    # Index.get_loc(key, 'nearest') was removed in pandas 2.0; get_indexer
    # performs the same nearest-label lookup for all timestamps at once.
    nearest_rows = principalDf.index.get_indexer(df.index, method="nearest")
    # assign() returns a new frame, so no columns are written into a slice of
    # principalDf (which raised a SettingWithCopyWarning).
    PCA_dict[ferm] = principalDf.iloc[nearest_rows].assign(
        cE=df["cE"].values,
        Fermentation=fermentation[ferm],
    )

In [None]:
# Per experiment: HPLC ethanol concentration against PC1 (left panel) and PC2 (right panel).
pc_panels = [
    # (subplot column, score column, trace name, marker colour, x-axis title)
    (1, "principal component 1", "HPLC vs. PC1  ", "green", "Principal component 1"),
    (2, "principal component 2", "HPLC vs. PC2  ", "red", "Principal component 2"),
]

for ferm, df in PCA_dict.items():
    # keep only samples with an HPLC ethanol concentration above 0.0
    df = df[df["cE"] > 0.0]

    fig = make_subplots(cols = 2)
    for panel_col, score_col, trace_name, colour, axis_title in pc_panels:
        fig.add_trace(
            go.Scatter(
                x = df[score_col],
                y = df["cE"],
                name = trace_name,
                mode = "markers",
                marker = dict(color = colour, size = 5),
                text = df.index,
            ),
            row = 1,
            col = panel_col,
        )
        fig.update_xaxes(title_text = axis_title, row = 1, col = panel_col)

    fig.update_yaxes(title_text="Ethanol HPLC [g/L]")
    fig.update_layout(title_text = fermentation[ferm], width = 1200)
    fig.show()

In [None]:
# Ethanol concentration vs PC1, all experiments pooled.
# Only samples with an HPLC ethanol concentration above 0 are kept.
PCA_df = pd.concat(list(PCA_dict.values())).loc[lambda pooled: pooled["cE"] > 0]

fig = px.scatter(
    PCA_df,
    x = "principal component 1",
    y = "cE",
    color = "Fermentation",
    hover_name = PCA_df.index,
    labels = {"Fermentation" : "Experiment"},
)
fig.update_xaxes(title_text="Principal component 1")
fig.update_yaxes(title_text="Ethanol HPLC [g/L]")
fig.show()

In [None]:
# Ethanol concentration vs PC2, all experiments pooled.
pooled_scores = pd.concat(list(PCA_dict.values()))
has_ethanol = pooled_scores["cE"] > 0   # samples with an HPLC ethanol concentration above 0
PCA_df = pooled_scores[has_ethanol]

scatter_options = dict(
    x = "principal component 2",
    y = "cE",
    color = "Fermentation",
    hover_name = PCA_df.index,
    labels = {"Fermentation" : "Experiment"},
)
fig = px.scatter(PCA_df, **scatter_options)
fig.update_yaxes(title_text="Ethanol HPLC [g/L]")
fig.update_xaxes(title_text="Principal component 2")
fig.show()

In [None]:
# Correlation between PC1 and the HPLC ethanol concentration:
# Pearson coefficient first, Spearman coefficient second.
pc1_scores = PCA_df["principal component 1"]
hplc_ethanol = PCA_df["cE"]

pearson_coeff, spearman_coeff = (
    pc1_scores.corr(hplc_ethanol, method = corr_method)
    for corr_method in ("pearson", "spearman")
)

print(pearson_coeff, spearman_coeff)

In [None]:
# Linear regression model that predicts the ethanol concentration from PC1.
# LinearRegression is imported in the notebook's import cell.

# Set to True to train without fermentation 3, i.e. only on samples taken
# from 2020-12-03 08:00 onwards.
exclude_ferm3 = False
train_df = PCA_df[PCA_df.index >= pd.to_datetime("2020-12-03 08:00")] if exclude_ferm3 else PCA_df

# Training data: PC1 score as the single feature, HPLC ethanol as target.
X = train_df[["principal component 1"]].values
Y = train_df["cE"].values

lr = LinearRegression()
lr.fit(X, Y)

# Predict the ethanol concentration for every IMS spectrum. A separate name
# is used for the prediction input so the training matrix X is not overwritten.
X_all = principalDf[["principal component 1"]].values
Y_pred = lr.predict(X_all)
prediction_df = principalDf.copy()
prediction_df["ethanol_IMS"] = Y_pred

# Time window (inclusive on both ends) that belongs to each fermentation.
prediction_windows = {
    "ferm3" : ("2020-11-18 13:37", "2020-11-19 13:00"),
    "ferm6" : ("2020-12-03 08:00", "2020-12-04 13:00"),
    "ferm7" : ("2020-12-09 08:00", "2020-12-10 13:00"),
}

# key: ferm Nr., value: predictions that fall into the respective window
IMS_dict = {
    ferm: prediction_df[
        (prediction_df.index >= pd.to_datetime(window_start))
        & (prediction_df.index <= pd.to_datetime(window_end))
    ]
    for ferm, (window_start, window_end) in prediction_windows.items()
}

In [None]:
# Plot the measured ethanol concentration (HPLC) together with the predicted
# ethanol concentration (IMS -> PCA / linear regression) for each experiment.
line_markers = "lines+markers"
for ferm, df1 in IMS_dict.items():
    # Pair the frames by fermentation key instead of relying on both dicts
    # having the same iteration order.
    df2 = offline_dict[ferm]

    # Hide predictions below 0 g/L. The rows are dropped from the frame so
    # that x (timestamps) and y (predictions) stay the same length; filtering
    # only y paired the remaining values with the wrong timestamps.
    df1 = df1[df1["ethanol_IMS"] >= 0]

    fig = make_subplots()
    fig.add_trace(go.Scatter(x= df1.index, y= df1["ethanol_IMS"], name = "Ethanol estimated", mode = line_markers))
    fig.add_trace(go.Scatter(x= df2.index, y= df2["cE"], name = "Ethanol HPLC", mode = line_markers))
    fig.update_xaxes(title_text="Day/Time")
    fig.update_yaxes(title_text="Ethanol [g/L]")
    fig.update_layout(title_text = fermentation[ferm])
    fig.show()

In [None]:
#conventional feature extraction attempt: finding respective peaks, integrate peak area and correlate it with HPLC measured ethanol concentration. 
from scipy.signal import find_peaks, peak_prominences, peak_widths
from IMS_functions import get_value, integrate_peaks

In [None]:
# Locate the peak maxima of every integrated spectrum with scipy's find_peaks algorithm.
# This cell only shows what the algorithm returns; it is not strictly needed in this workflow.
drift_axis = col_sum_df.columns.values

peak_list = []
for _, spectrum in col_sum_df.iterrows():
    peak_idx, peak_props = find_peaks(spectrum, height = 30e3, threshold = 10, distance = 1)
    peak_list.append([peak_props['peak_heights'], drift_axis[peak_idx]])

peak_df = pd.DataFrame(peak_list, columns = ["height", "peak_pos"], index = col_sum_df.index)

line_markers = "lines+markers"
line = "lines"

# display only every 20th spectrum
for i in range(0, len(col_sum_df), 20):
    print(i)
    signal_trace = go.Scatter(x = col_sum_df.columns, y = col_sum_df.iloc[i], name = "Signal", mode = line)
    peak_trace = go.Scatter(
        x = peak_df["peak_pos"].iloc[i],
        y = peak_df["height"].iloc[i],
        name = "Peak position",
        mode = "markers",
        marker = dict(color = "red", size = 4),
    )

    fig = make_subplots()
    fig.add_trace(signal_trace)
    fig.add_trace(peak_trace)
    fig.update_yaxes(title_text = "Signal intensity [a.u.]")
    fig.update_xaxes(title_text = "Normalized drift time")
    fig.update_layout(title_text = str(col_sum_df.index[i]))
    fig.show()

In [None]:
# Restrict the integrated spectra to the drift-time range used for peak extraction.
drift_borders =  {"start" : 0.9, "end" : 1.3}
# Named border_mask (not `filter`) so that the builtin filter() is not shadowed;
# the boolean array is passed to iloc directly instead of being copied into a list.
border_mask = (col_sum_df.columns >= drift_borders["start"]) & (col_sum_df.columns <= drift_borders["end"])
col_sum_df_filtered = col_sum_df.iloc[:, border_mask]


In [None]:
# Detect the peaks again, this time only inside the filtered drift-time region,
# and collect for every spectrum the peak positions, heights, widths and
# left/right borders in one data frame (peak_df, indexed by timestamp).

drift_axis = col_sum_df_filtered.columns

peak_list = []
width_list = []
for t, row in col_sum_df_filtered.iterrows():
    signal = row.values

    # peaks (height and distance may need to be optimized)
    peak_index, peak_props = find_peaks(signal, height = 15e3, threshold = 1, distance = 1)
    peak_pos = drift_axis.values[peak_index]
    peak_list.append([peak_pos, peak_props['peak_heights'], peak_index, t])

    # peak start and end: left/right borders evaluated at 80% relative peak height
    widths, h_eval, left_ips_index, right_ips_index = peak_widths(signal, peak_index, rel_height=0.8)
    # peak_widths returns fractional (interpolated) indices; truncate them to
    # integers to look up the drift time. astype(int) replaces np.int, which
    # was removed in NumPy 1.24.
    left_ips = np.array(drift_axis[left_ips_index.astype(int)])
    right_ips = np.array(drift_axis[right_ips_index.astype(int)])
    width_list.append([widths, h_eval, left_ips, left_ips_index, right_ips, right_ips_index])

peak_df = pd.DataFrame(peak_list, columns = ["position", "height", "pos_index", "t"]).set_index("t")

width_df = pd.DataFrame(
    width_list,
    columns = ["widths", "h_eval", "left_ips", "left_ips_index", "right_ips", "right_ips_index"],
    index = peak_df.index.values,
)
peak_df = pd.concat([peak_df, width_df], axis=1)

In [None]:
# For every spectrum, look up the left_ips and right_ips values that lie within
# the search borders of the ethanol 1 and the ethanol 2 peak.

# The numbers below were read off a plotted spectrum by eye and may not be accurate
# enough for a rel_height other than 0.8.
# "start"/"end": borders of the search window; "left"/"right": reference values handed to get_value.
Et_1 = {"start" : 1.026, "end" : 1.075, "left" : 1.034804, "right" : 1.051937}
Et_2 = {"start" : 1.10, "end" : 1.165, "left" : 1.120467, "right" : 1.146}            # same for the ethanol 2 peak

# (search window, side) in the order of the output columns
border_specs = [(Et_1, "left"), (Et_1, "right"), (Et_2, "left"), (Et_2, "right")]

ethanol_list = [
    [
        get_value(row[side + "_ips"], window["start"], window["end"], window[side])
        for window, side in border_specs
    ]
    for _, row in peak_df[["left_ips", "right_ips"]].iterrows()
]

ethanol_peak_df = pd.DataFrame(ethanol_list, columns = ["Et1_left", "Et1_right", "Et2_left", "Et2_right"], index = peak_df.index)

In [None]:
# Calculate the peak areas for each spectrum.

def _peak_areas(left_col, right_col):
    """Call integrate_peaks for every spectrum with the borders from the given ethanol_peak_df columns."""
    areas = []
    for i in range(len(col_sum_df_filtered)):
        left_border = ethanol_peak_df[left_col].iloc[i]
        right_border = ethanol_peak_df[right_col].iloc[i]
        areas.append(integrate_peaks(col_sum_df_filtered.iloc[i], left_border, right_border))
    return areas

Et1 = _peak_areas("Et1_left", "Et1_right")          # area of peak 1
Et2 = _peak_areas("Et2_left", "Et2_right")          # area of peak 2
Et1_to_Et2 = _peak_areas("Et1_left", "Et2_right")   # area from the start of peak 1 to the end of peak 2

x = np.array([Et1, Et2, Et1_to_Et2]).T
ethanol_area_df = pd.DataFrame(x, columns = ["Et1_area", "Et2_area", "Et1_to_Et2"], index = col_sum_df_filtered.index)

In [None]:
# Create dictionaries with key: ferm Nr., value: DataFrame holding, for every
# HPLC sample of that fermentation, the row of the respective frame that is
# closest in time.

def _rows_nearest_to(frame, timestamps):
    """Return the rows of `frame` whose index label is nearest to each of `timestamps`.

    Index.get_loc(key, 'nearest') was removed in pandas 2.0; get_indexer
    performs the same nearest-label lookup for all timestamps at once.
    """
    return frame.iloc[frame.index.get_indexer(timestamps, method="nearest")]


col_sum_dict_filtered = {}
peak_dict = {}
ethanol_area_dict = {}
for ferm, df in offline_dict.items():
    col_sum_dict_filtered[ferm] = _rows_nearest_to(col_sum_df_filtered, df.index)
    peak_dict[ferm] = _rows_nearest_to(peak_df, df.index)
    # assign() returns a new frame, so the HPLC concentration and the
    # experiment name are not written into a slice of ethanol_area_df
    # (which raised a SettingWithCopyWarning).
    ethanol_area_dict[ferm] = _rows_nearest_to(ethanol_area_df, df.index).assign(
        cE=df["cE"].values,
        Fermentation=fermentation[ferm],
    )

In [None]:
# Plot two example spectra per experiment to show the considered cutout.
few = [2,6]
for cutout_df in col_sum_dict_filtered.values():
    drift_axis = cutout_df.columns.values
    for i in few:
        spectrum = cutout_df.iloc[i]
        fig = px.line(x = drift_axis, y = spectrum.values, title = str(cutout_df.index[i]))
        fig.update_yaxes(title_text='Signal intensity [a.u.]')
        fig.update_xaxes(title_text='Normalized drift time')
        fig.show()

In [None]:
# Plot spectra together with the detected peak maxima and their left/right borders.
border_marker = dict(color = "green", size = 5, symbol = "x")
marker_traces = [
    # (x column, y column, trace name, marker)
    ("position", "height", "Peak position", dict(color = "red", size = 4)),
    ("left_ips", "h_eval", "left_w", border_marker),
    ("right_ips", "h_eval", "right_W", border_marker),
]

for ferm, df1 in col_sum_dict_filtered.items():
    df2 = peak_dict[ferm]
    # only every 50th spectrum; use a step of 1 to plot all of them
    for i in range(0, len(df1), 50):
        fig = make_subplots()
        fig.add_trace(go.Scatter(x = df1.columns.values, y = df1.iloc[i, :], name = "Signal", mode = line))
        for x_col, y_col, trace_name, marker in marker_traces:
            fig.add_trace(
                go.Scatter(
                    x = df2[x_col].iloc[i],
                    y = df2[y_col].iloc[i],
                    name = trace_name,
                    mode = "markers",
                    marker = marker,
                )
            )

        fig.update_yaxes(title_text = "Signal intensity [a.u.]")
        fig.update_xaxes(title_text = "Normalized drift time")
        # title is the timestamp of the spectrum; the legend is hidden
        fig.update_layout(title_text = str(df1.index[i]), showlegend = False)
        fig.show()

In [None]:
# Ethanol concentration (HPLC) vs peak area, all experiments pooled.
# Only samples with an HPLC ethanol concentration above 0 are kept.
area_vs_conc_df = pd.concat(list(ethanol_area_dict.values())).loc[lambda pooled: pooled["cE"] > 0]

# plot for the ethanol 2 peak
et2_labels = {'cE': 'Ethanol HPLC [g/L]', 'Et2_area': 'Area ethanol peak 2 [a.u.]', 'Fermentation' : 'Experiment'}
fig = px.scatter(
    area_vs_conc_df,
    x = "Et2_area",
    y = "cE",
    color = "Fermentation",
    hover_name = area_vs_conc_df.index,
    labels = et2_labels,
)

# Alternative: uncomment to plot the area integrated from the left border of peak 1
# to the right border of peak 2 instead (gives results similar to the PCA).
#fig = px.scatter(area_vs_conc_df, x= "Et1_to_Et2" , y= "cE", color= "Fermentation", hover_name= area_vs_conc_df.index, labels={ 'cE' : 'Ethanol HPLC [g/L]', 'Et1_to_Et2': 'Area from ethanol peak 1 to ethanol peak 2 [a.u.]' , 'Fermentation' : 'Experiment'})

fig.show()

In [None]:
# Correlation between the peak areas and the HPLC ethanol concentration.
# Index 1: area of ethanol peak 2; index 2: area from ethanol peak 1 to ethanol peak 2.
hplc_ethanol = area_vs_conc_df["cE"]
et2_area = area_vs_conc_df["Et2_area"]
et1_to_et2_area = area_vs_conc_df["Et1_to_Et2"]

# Pearson coefficients
pearson_coeff1 = et2_area.corr(hplc_ethanol, method = "pearson")
pearson_coeff2 = et1_to_et2_area.corr(hplc_ethanol, method = "pearson")

# Spearman coefficients
spearman_coeff1 = et2_area.corr(hplc_ethanol, method = "spearman")
spearman_coeff2 = et1_to_et2_area.corr(hplc_ethanol, method = "spearman")

print("p1 :", pearson_coeff1, "p2 :", pearson_coeff2, "s1 :", spearman_coeff1, "s2 :", spearman_coeff2)

In [None]:
# Per experiment: ethanol (HPLC) vs peak area for peak 1, peak 2 and the span from peak 1 to peak 2.
area_panels = [
    # (subplot column, area column, trace name, marker colour, x-axis title)
    (1, "Et1_area", "HPLC vs. Et1_area", "green", "Area ethanol peak 1 [a.u.]"),
    (2, "Et2_area", "HPLC vs. Et2_area", "red", "Area ethanol peak 2 [a.u.]"),
    (3, "Et1_to_Et2", "HPLC vs. Et1_to_Et2_area", "blue", "Area from ethanol peak 1 to ethanol peak 2 [a.u.]"),
]

for ferm, df in ethanol_area_dict.items():
    # keep only samples with an HPLC ethanol concentration above 0.0
    df = df[df["cE"] > 0.0]

    fig = make_subplots(cols = 3)
    for panel_col, area_col, trace_name, colour, _ in area_panels:
        fig.add_trace(
            go.Scatter(
                y = df["cE"],
                x = df[area_col],
                name = trace_name,
                mode = "markers",
                marker = dict(color = colour, size = 3),
                text = df.index,
            ),
            row = 1,
            col = panel_col,
        )

    fig.update_yaxes(title_text="Ethanol HPLC [g/L]")
    for panel_col, _, _, _, axis_title in area_panels:
        fig.update_xaxes(title_text = axis_title, row = 1, col = panel_col)
    fig.update_layout(title_text = fermentation[ferm], width = 1200)
    fig.show()

In [None]:
# Linear regression model that predicts the ethanol concentration from the
# area spanning ethanol peak 1 to ethanol peak 2 (Et1_to_Et2).
# LinearRegression is imported in the notebook's import cell.

# Samples of fermentation 3 excluded, i.e. only samples taken from
# 2020-12-03 08:00 onwards.
no_ferm3 = area_vs_conc_df[(area_vs_conc_df.index >= pd.to_datetime("2020-12-03 08:00"))]

# Set to False to train on all experiments, including fermentation 3.
exclude_ferm3 = True
train_df = no_ferm3 if exclude_ferm3 else area_vs_conc_df

# Training data: feature and target are taken from the same frame, so they
# always match.
X = train_df[["Et1_to_Et2"]].values
Y = train_df["cE"].values

# train
lr = LinearRegression()
lr.fit(X, Y)

# Predict for every IMS spectrum. A separate name is used for the prediction
# input so the training matrix X is not overwritten.
X_all = ethanol_area_df[["Et1_to_Et2"]].values
Y_pred = lr.predict(X_all)
prediction_df = ethanol_area_df.copy()
prediction_df["ethanol_predicted"] = Y_pred

# Time window (inclusive on both ends) that belongs to each fermentation.
prediction_windows = {
    "ferm3" : ("2020-11-18 13:37", "2020-11-19 13:00"),
    "ferm6" : ("2020-12-03 08:00", "2020-12-04 13:00"),
    "ferm7" : ("2020-12-09 08:00", "2020-12-10 13:00"),
}

# key: ferm Nr., value: predictions that fall into the respective window
IMS_peak_extraction_dict = {
    ferm: prediction_df[
        (prediction_df.index >= pd.to_datetime(window_start))
        & (prediction_df.index <= pd.to_datetime(window_end))
    ]
    for ferm, (window_start, window_end) in prediction_windows.items()
}

In [None]:
# Plot the linear regression prediction next to the HPLC measurement for each experiment.
for ferm, predicted in IMS_peak_extraction_dict.items():
    measured = offline_dict[ferm]
    # hide predictions below -1
    shown = predicted.loc[predicted["ethanol_predicted"] >= -1]

    traces = [
        go.Scatter(x = shown.index, y = shown["ethanol_predicted"], name = "Ethanol estimated", mode = "lines+markers"),
        go.Scatter(x = measured.index, y = measured["cE"], name = "Ethanol HPLC", mode = "lines+markers"),
    ]

    fig = make_subplots()
    for trace in traces:
        fig.add_trace(trace)
    fig.update_xaxes(title_text="Day/Time")
    fig.update_yaxes(title_text="Ethanol [g/L]")
    fig.update_layout(title_text = fermentation[ferm])
    fig.show()

In [None]:
# Linear regression model that predicts the ethanol concentration from the
# area of ethanol peak 2 (Et2_area).
# LinearRegression is imported in the notebook's import cell.

# Samples of fermentation 3 excluded, i.e. only samples taken from
# 2020-12-03 08:00 onwards.
no_ferm3 = area_vs_conc_df[(area_vs_conc_df.index >= pd.to_datetime("2020-12-03 08:00"))]

# Set to False to train on all experiments, including fermentation 3.
exclude_ferm3 = True
train_df = no_ferm3 if exclude_ferm3 else area_vs_conc_df

# Training data: feature and target are taken from the same frame, so they
# always match.
X = train_df[["Et2_area"]].values
Y = train_df["cE"].values

# train
lr = LinearRegression()
lr.fit(X, Y)

# Predict for every IMS spectrum. A separate name is used for the prediction
# input so the training matrix X is not overwritten.
X_all = ethanol_area_df[["Et2_area"]].values
Y_pred = lr.predict(X_all)
prediction_df = ethanol_area_df.copy()
prediction_df["ethanol_predicted"] = Y_pred

# Time window (inclusive on both ends) that belongs to each fermentation.
prediction_windows = {
    "ferm3" : ("2020-11-18 13:37", "2020-11-19 13:00"),
    "ferm6" : ("2020-12-03 08:00", "2020-12-04 13:00"),
    "ferm7" : ("2020-12-09 08:00", "2020-12-10 13:00"),
}

# key: ferm Nr., value: predictions that fall into the respective window
IMS_peak_extraction_dict = {
    ferm: prediction_df[
        (prediction_df.index >= pd.to_datetime(window_start))
        & (prediction_df.index <= pd.to_datetime(window_end))
    ]
    for ferm, (window_start, window_end) in prediction_windows.items()
}

In [None]:
# Plot the linear regression prediction next to the HPLC measurement for each experiment.
for ferm, estimated in IMS_peak_extraction_dict.items():
    hplc = offline_dict[ferm]
    # hide predictions below -1
    keep = estimated["ethanol_predicted"] >= -1
    estimated = estimated[keep]

    estimated_trace = go.Scatter(x = estimated.index, y = estimated["ethanol_predicted"], name = "Ethanol estimated")
    hplc_trace = go.Scatter(x = hplc.index, y = hplc["cE"], name = "Ethanol HPLC")

    fig = make_subplots()
    fig.add_trace(estimated_trace)
    fig.add_trace(hplc_trace)
    fig.update_yaxes(title_text="Ethanol [g/L]")
    fig.update_xaxes(title_text="Day/Time")
    fig.update_layout(title_text = fermentation[ferm])
    fig.show()