In [0]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
import plotly.express as px
from scipy.stats import iqr 
from itertools import permutations,product
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime
from dateutil import relativedelta
import warnings
warnings.filterwarnings("ignore")
import time

In [0]:
# Make /dbfs/mnt/datalake/stg/dcs the working directory; os.chdir is process-wide,
# so every relative path used after this cell resolves against that folder.
# NOTE(review): the os.getcwd() result is discarded (it is not the cell's last
# expression, so nothing is displayed) — confirm whether the call is still wanted.
os.getcwd()
os.chdir("/dbfs/mnt/datalake/stg/dcs")

In [0]:
def CreateDataSet(DataFilePath,EnergyFilePath,Plant="265 TPD",ExcludeFrom="2020-03-24",ExcludeTo="2020-04-15"):
    """
    Build the primary dataset by joining DCS readings with NCV records.

    Input :
        DataFilePath   - path to the DCS CSV; must contain a "DateTime" column.
        EnergyFilePath - path to the NCV CSV; must contain "Plant", "Date",
                         "Time" and "NCV Value" columns.
        Plant          - value of the "Plant" column to keep from the NCV file.
        ExcludeFrom / ExcludeTo - inclusive "YYYY-MM-DD" bounds of the date
                         window (on the shifted "Date" column) that is removed.
    Output : pandas dataframe sorted by "DateTime" with the added columns
             "DateTime_IST" (DateTime + 330 minutes), "Date", "Time_x" and a
             forward-filled "NCV Value".
    """
    dcs = pd.read_csv(DataFilePath)
    dcs["DateTime"] = pd.to_datetime(dcs["DateTime"])

    # Shift by +05:30 and derive the string keys used for filtering and joining.
    dcs["DateTime_IST"] = dcs["DateTime"] + pd.Timedelta(minutes=330)
    dcs["Date"] = dcs["DateTime_IST"].dt.date.map(str)
    dcs["Time"] = dcs["DateTime_IST"].dt.time.map(str)

    # ISO dates compare correctly as strings; keep only rows outside the window,
    # then drop any row with a missing reading.
    outside_window = (dcs["Date"] < ExcludeFrom) | (dcs["Date"] > ExcludeTo)
    dcs = dcs[outside_window].dropna().sort_values(by="DateTime", ascending=True)

    energy = pd.read_csv(EnergyFilePath)
    plant_ncv = energy.loc[energy.Plant == Plant, ["Date", "Time", "NCV Value"]]

    # "HH:MM:" + "00" truncates each reading to the start of its minute so it
    # can be matched against the NCV record times.
    dcs["minmarker"] = dcs["Time"].str[:6] + "00"
    dcs = dcs.merge(plant_ncv, how="left", left_on=["Date", "minmarker"], right_on=["Date", "Time"])

    # Readings without an NCV record at their minute inherit the latest earlier value.
    dcs["NCV Value"] = dcs["NCV Value"].ffill()

    # The merge suffixes the two "Time" columns; only the DCS one (Time_x) is kept.
    return dcs.drop(["minmarker", "Time_y"], axis=1)

In [0]:
def CreateModelVariables(dataset):
    """
    Derive the main DCS variables and switch to readable column names.

    Input : pandas dataframe produced by CreateDataSet (needs the raw tag
            columns plus "DateTime", "Date", "Time_x" and "NCV Value").
    Output : new pandas dataframe with the renamed columns and the derived
             "AirFlow", "air_to_fuel", "year", "month" and "yearmonth".
             The input dataframe is left unmodified.
    """
    # Raw tag name -> readable name, in output column order.
    renamed = {
        "DateTime": "DateTime",
        "Date": "Date",
        "Time_x": "Time",
        "TE204TC22": "MelterCrown",
        "AirFlow": "AirFlow",
        "MGFT221AI18": "FuelFlow",
        "FPT211AI23": "FurnacePressure",
        "TE207TC25": "MelterBottom",
        "TE101TC10": "FlueTemperature",
        "NCV Value": "NCV Value",
    }

    # AirFlow is the larger of the two air-flow tags on each row; assign()
    # returns a new frame so the caller's dataframe is not altered.
    with_airflow = dataset.assign(AirFlow=dataset[["CABFT211AI12", "CABFT212AI13"]].max(axis=1))

    # Explicit copy so the column assignments below act on an independent frame.
    model_data = with_airflow[list(renamed)].rename(columns=renamed).copy()

    model_data["air_to_fuel"] = model_data["AirFlow"] / model_data["FuelFlow"]

    # "Date" is a "YYYY-MM-DD" string: characters 0-3 are the year, 5-6 the month.
    model_data["year"] = model_data["Date"].str[:4]
    model_data["month"] = model_data["Date"].str[5:7]
    model_data["yearmonth"] = (model_data["year"] + model_data["month"]).astype(int)

    return model_data

In [0]:
def IdentifyOutlier(date,dataset):
    """
    Find the dominant FuelFlow range for one date.

    The day's FuelFlow values are cut into 10 equal-width bins and the most
    populated bin is returned; callers treat values outside it as outliers.

    Input : date (same type/format as the "Date" column) and a pandas
            dataframe with "Date" and "FuelFlow" columns.
    Output : pandas Interval of the most populated bin for that date.
    """
    day_fuel_flow = dataset.loc[dataset.Date == date, "FuelFlow"]

    # value_counts() orders bins by descending frequency, so the first index
    # entry is the modal bin.
    return pd.cut(day_fuel_flow, 10).value_counts().index[0]


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,SGDRegressor,Ridge,Lasso
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import r2_score
import statsmodels.api as sm

In [0]:
def CreateModel(train,train_lag,dataset):
    """
    Fit the fuel-flow change model and return its coefficients.

    Non-outlier rows are averaged into half-hour windows, consecutive windows
    are differenced, and a linear regression of the change in FuelFlow on the
    changes in NCV, MelterCrown and AirFlow is fitted on every window whose
    yearmonth is <= train. The training R2 is printed.

    Input :
        train     - last yearmonth (YYYYMM integer) included in training.
        train_lag - not used; kept so existing calls keep working.
        dataset   - pandas dataframe with "Outlier", "DateTime", "Date",
                    "Time", "yearmonth" and the process columns.
    Output : pandas dataframe with columns "Variable" and "Coef"; rows are
             del_NCV, del_Temp, del_AirFlow and intercept, in that order.
    Raises : ValueError when no training rows remain after filtering.
    """
    feature_cols = ["del_NCV", "del_Temp", "del_AirFlow"]

    clean = dataset[dataset.Outlier == 0].dropna().sort_values("DateTime", ascending=True)
    clean["Hour"] = clean["DateTime"].dt.hour
    clean["MinuteMarker"] = clean["DateTime"].dt.minute // 30

    half_hourly = clean.groupby(["Date", "Hour", "MinuteMarker"], as_index=False).agg({
        "Time": "max",
        "yearmonth": "mean",
        "MelterCrown": "mean",
        "FuelFlow": "mean",
        "NCV Value": "mean",
        "MelterBottom": "mean",
        "FlueTemperature": "mean",
        "AirFlow": "mean",
    })

    # groupby orders rows by (Date, Hour, MinuteMarker). Hour comes from
    # "DateTime" while "Date"/"Time" may come from a shifted clock, in which
    # case that order is not chronological and diff() would compare
    # non-adjacent windows. Sorting on Date/Time restores time order.
    half_hourly = half_hourly.sort_values(["Date", "Time"]).reset_index(drop=True)

    dcs_train = half_hourly[half_hourly.yearmonth <= train].dropna().copy()
    dcs_train["del_FF"] = dcs_train["FuelFlow"].diff(1)
    dcs_train["del_NCV"] = dcs_train["NCV Value"].diff(1)
    dcs_train["del_Temp"] = dcs_train["MelterCrown"].diff(1)
    dcs_train["del_AirFlow"] = dcs_train["AirFlow"].diff(1)

    # The first window has no predecessor, so its differences are NaN.
    dcs_train = dcs_train.dropna()
    if dcs_train.empty:
        raise ValueError("No training rows available for yearmonth <= {}".format(train))

    lm_model_FF = LinearRegression(fit_intercept=True).fit(X=dcs_train[feature_cols], y=dcs_train[["del_FF"]])
    print("FF Model : ", r2_score(dcs_train[["del_FF"]], lm_model_FF.predict(dcs_train[feature_cols])))

    # y was passed as a one-column frame, so coef_ is (1, n_features) and
    # intercept_ is (1,); take the single target's values.
    lm_coef = pd.DataFrame({
        "Variable": feature_cols + ["intercept"],
        "Coef": list(lm_model_FF.coef_[0]) + [lm_model_FF.intercept_[0]],
    })

    return lm_coef


In [0]:
# Build the modelling dataset, flag FuelFlow outliers per day, then fit the model.
# NOTE(review): the DCS file name says "255_tpd" while the energy file and the
# plant filter say 265 — confirm this is the intended input file.
dcs_temps = CreateDataSet(DataFilePath= "/dbfs/mnt/datalake/master/dcs/master_dcs_255_tpd.csv",EnergyFilePath= "/dbfs/mnt/datalake/exploratory/Furnace_Analytics/265TPD/265_EnergyRecords.csv")

dcs_temps = CreateModelVariables(dcs_temps)

# One modal FuelFlow bin per calendar date, joined back onto every reading.
daily_bins = dcs_temps[["Date"]].drop_duplicates()
daily_bins["OutlierBin"] = daily_bins["Date"].apply(lambda day: IdentifyOutlier(date = day, dataset = dcs_temps))
dcs_temps = dcs_temps.merge(daily_bins,how = "left",on = "Date")

# Outlier = 0 when the reading lies inside its day's modal bin, else 1.
dcs_temps["Outlier"] = [
    0 if fuel_flow in modal_bin else 1
    for fuel_flow, modal_bin in zip(dcs_temps["FuelFlow"], dcs_temps["OutlierBin"])
]
del daily_bins

# `tmp` holds the fitted coefficient table and is displayed as the cell output.
tmp = CreateModel(train = 202008,train_lag = 1,dataset = dcs_temps)
tmp

In [0]:
def CreateTestData(DataFilePath,EnergyFilePath,Last30M = "Y"):
    """
    Prepare half-hourly test data for predictions.

    Input :
        DataFilePath   - path to the DCS CSV ("DateTime" plus the raw tag columns).
        EnergyFilePath - path to the NCV CSV ("From" and
                         "STREAM2_INFERIOR_CV_NCV_METER_JBR" columns).
        Last30M        - "N" keeps every reading; any other value keeps only
                         readings newer than 30 minutes before the latest one.
    Output : pandas dataframe with one row per (Date, Hour, MinuteMarker)
             holding the latest "Time" and the mean of MelterCrown, FuelFlow,
             NCV Value, MelterBottom, FlueTemperature and AirFlow over the
             non-outlier readings of that half hour.
    """
    DCS = pd.read_csv(DataFilePath)
    NCV = pd.read_csv(EnergyFilePath)

    # Shift by +05:30 and derive the string keys used for joining.
    DCS["DateTime"] = pd.to_datetime(DCS["DateTime"])
    DCS["DateTime_IST"] = DCS["DateTime"] + pd.Timedelta(minutes=330)
    DCS["Date"] = DCS["DateTime_IST"].dt.date.map(str)
    DCS["Time"] = DCS["DateTime_IST"].dt.time.map(str)
    # "HH:MM:" + "00" truncates each reading to the start of its minute.
    DCS["minmarker"] = DCS["Time"].str[:6] + "00"
    DCS = DCS.sort_values(by = "DateTime_IST", ascending = True)
    # Row-wise larger of the two air-flow tags; a missing tag is ignored
    # instead of making the result depend on column order.
    DCS["AirFlow"] = DCS[["CABFT211AI12","CABFT212AI13"]].max(axis=1)

    NCV["DateTime"] = pd.to_datetime(NCV["From"])
    NCV["Date"] = NCV["DateTime"].dt.date.map(str)
    NCV["Time"] = NCV["DateTime"].dt.time.map(str)
    NCV = NCV[["Date","Time","STREAM2_INFERIOR_CV_NCV_METER_JBR"]]
    NCV = NCV.rename({"STREAM2_INFERIOR_CV_NCV_METER_JBR":"NCV"}, axis = 1)

    DCS = DCS.merge(NCV, how = "left" , left_on = ["Date", "minmarker"], right_on=["Date","Time"])
    # Readings without an NCV record at their minute inherit the latest earlier value.
    DCS["NCV"] = DCS["NCV"].ffill()
    DCS = DCS[["DateTime_IST","Date","Time_x","TE204TC22","AirFlow","MGFT221AI18","FPT211AI23","TE207TC25","TE101TC10","NCV"]]
    DCS.columns = ["DateTime","Date","Time","MelterCrown","AirFlow","FuelFlow","FurnacePressure","MelterBottom","FlueTemperature","NCV Value"]

    if Last30M != "N":
        cutoff = DCS["DateTime"].max() - pd.Timedelta(30,unit = "m")
        # copy() so later column assignments act on an independent frame.
        DCS = DCS[DCS["DateTime"] > cutoff].copy()

    # The modal FuelFlow bin is computed once per date rather than once per row.
    modal_bin_by_date = {
        day: IdentifyOutlier(date = day, dataset = DCS) for day in DCS["Date"].unique()
    }
    in_modal_bin = pd.Series(
        [fuel_flow in modal_bin_by_date[day] for fuel_flow, day in zip(DCS["FuelFlow"], DCS["Date"])],
        index = DCS.index,
        dtype = bool,
    )
    DCS = DCS.loc[in_modal_bin].copy()

    DCS["Hour"] = DCS["DateTime"].dt.hour
    DCS["MinuteMarker"] = DCS["DateTime"].dt.minute // 30

    DCS = DCS.groupby(["Date","Hour","MinuteMarker"], as_index = False).agg({
        "Time" : "max",
        "MelterCrown" : "mean",
        "FuelFlow" : "mean",
        "NCV Value" : "mean",
        "MelterBottom" : "mean",
        "FlueTemperature" : "mean",
        "AirFlow" : "mean",
    })

    return DCS

In [0]:
def PredictFF(data, coef, PreviousFilePath):
    """
    Predict fuel flow from the fitted change model.

    Input :
        data             - pandas dataframe with "NCV Value", "MelterCrown",
                           "AirFlow" and "FuelFlow" columns; it is modified in
                           place and also returned.
        coef             - coefficient table whose second column holds, by
                           position, the del_NCV, del_Temp and del_AirFlow
                           weights followed by the intercept.
        PreviousFilePath - CSV holding the previous predictions. When it
                           exists its "FF_Predict" column is the base value;
                           otherwise the previous row's FuelFlow is used.
                           The file is (over)written with the new result.
    Output : data with added columns del_NCV, del_Temp, del_AirFlow,
             del_FF_Predict and FF_Predict.
    """
    feature_cols = ["del_NCV", "del_Temp", "del_AirFlow"]

    # Read the previous predictions first so a bad file fails before data is touched.
    previous_predictions = None
    if os.path.exists(PreviousFilePath):
        previous_predictions = pd.read_csv(PreviousFilePath)["FF_Predict"]

    data["del_NCV"] = data["NCV Value"].diff(1)
    data["del_Temp"] = data["MelterCrown"].diff(1)
    data["del_AirFlow"] = data["AirFlow"].diff(1)

    # Linear model: intercept + weights . deltas, evaluated for all rows at once.
    weights = coef.iloc[:3, 1].to_numpy(dtype=float)
    intercept = coef.iloc[3, 1]
    data["del_FF_Predict"] = intercept + data[feature_cols].to_numpy(dtype=float) @ weights

    if previous_predictions is not None:
        # NOTE(review): this addition aligns on the row index, i.e. row i of the
        # stored file is paired with row i of `data` — confirm that is intended
        # rather than using only the most recent stored prediction.
        base = previous_predictions
    else:
        base = data["FuelFlow"].shift(1)
    data["FF_Predict"] = base + data["del_FF_Predict"]

    data.to_csv(PreviousFilePath, index = False)

    return data