# DATA PREPARATION 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw research dataset (CSV) from the project's GitHub repository.
#url = 'https://github.com/WisnuHanif/reactor_data/blob/main/reactor_data.csv'
prep0 = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/WisnuHanif/reactor_data/main/reactor_data.csv'
)
# Preview the first five rows.
prep0.head(n=5)

In [None]:
#Identity variables name
prep0.columns

In [None]:
#Removing variable description & 'NO', 'Time' column
prep1 = prep0.iloc[1:, :].drop(['Running_cycle','Time'], axis=1)
prep1.head()

In [None]:
prep1.dtypes

In [None]:
#Convert timestamp object data to numerical
prep2 = prep1.apply(pd.to_numeric)
print(prep2.dtypes, prep2.shape)

In [None]:
#Check if there's missing value
prep2.isnull().sum()

In [None]:
# Box plot of the raw-material flow rate 'FI-001'; per the author, loss of
# this flow is used to spot periods where the plant was not running.
import seaborn as sns
sns.boxplot(data=prep2, x='FI-001')

In [None]:
# Remove shut-down data by flagging 'FI-001' outliers with the
# interquartile-range (IQR) method.

from numpy import percentile
flow_a = prep2['FI-001']
# Compute the percentiles on non-missing readings only: a single NaN would
# turn both percentiles (and so both whiskers) into NaN, which makes every
# comparison below False and silently empties both subsets.
flow_valid_a = flow_a.dropna()
# calculate interquartile range
q25_a, q75_a = percentile(flow_valid_a, 25), percentile(flow_valid_a, 75)
iqr_a = q75_a - q25_a
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25_a, q75_a, iqr_a))
# calculate the outlier cutoff
cut_off_a = iqr_a * 1.5
lower_a, upper_a = q25_a - cut_off_a, q75_a + cut_off_a
print('Lower whisker=%.2f, Upper whisker=%.2f' % (lower_a, upper_a))
# identify outliers: readings strictly outside the whiskers
shut_down_data = prep2[(flow_a < lower_a) | (flow_a > upper_a)]
print('Shut down data: %d' % len(shut_down_data))
# remove outliers: keep readings on or inside the whiskers. The bounds are
# inclusive so that a reading exactly equal to a whisker is kept rather than
# falling into neither subset.
shut_down_removed = prep2[(flow_a >= lower_a) & (flow_a <= upper_a)]
print('Non-Shut down data: %d' % len(shut_down_removed))

In [None]:
#Check again if there's still outlier in 'FI-001'
sns.boxplot(data=shut_down_removed, x=shut_down_removed['FI-001'])

In [None]:
prep3 = shut_down_removed
prep3.shape

In [None]:
# Mask per-variable outliers (values outside the 1st-99th percentile band)
# as NaN while keeping every row intact, so they can be imputed afterwards.
lb = prep3.quantile(0.01)
ub = prep3.quantile(0.99)

# The bounds are inclusive: with strict comparisons any value equal to a
# percentile bound is masked too, which blanks out constant or heavily
# repeated columns (where the bound coincides with ordinary values).
prep4 = prep3[(prep3 >= lb) & (prep3 <= ub)]
prep4

In [None]:
prep4.info()

In [None]:
#Check deleted value position
import missingno as mno
mno.matrix(prep4, figsize = (20, 6))

In [None]:
#Correlation matrix between variables before missing value imputation
corr = prep4.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
corr.values[np.triu_indices_from(corr.values,1)].sum()

In [None]:
# Fill the values masked as outliers by linear interpolation.
# limit_direction='both' fills gaps at either end of a column as well:
# with 'backward' alone, NaNs after the last valid observation stay NaN
# and are later overwritten with the -999 sentinel before modelling.
prep5 = prep4.interpolate(method='linear', limit_direction='both')
prep5.head()

In [None]:
#plt.figure(figsize=(120, 60))
#heatmap = sns.heatmap(prep3.corr(), vmin=-1, vmax=1, annot=True, cmap='coolwarm')
#heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);

In [None]:
prep5.isnull().sum()

In [None]:
prep4.describe().transpose()

In [None]:
# Split the first 4000 rows into column groups of (up to) ten variables
# each, so that every group can be plotted on its own figure.
group_1 = prep5.iloc[:4000, 0:10]
group_2 = prep5.iloc[:4000, 10:20]
group_3 = prep5.iloc[:4000, 20:30]
group_4 = prep5.iloc[:4000, 30:40]
group_5 = prep5.iloc[:4000, 40:50]
group_6 = prep5.iloc[:4000, 50:60]
group_7 = prep5.iloc[:4000, 60:64]

In [None]:
#Plot for group 1
group_1.plot(subplots =True, sharex = True, figsize = (30,80))

In [None]:
#Plot for group 2
group_2.plot(subplots =True, sharex = True, figsize = (30,80))

In [None]:
#Plot for group 3
group_3.plot(subplots =True, sharex = True, figsize = (30,80))

In [None]:
#Plot for group 4
group_4.plot(subplots =True, sharex = True, figsize = (30,80))

In [None]:
#Plot for group 5
group_5.plot(subplots =True, sharex = True, figsize = (30,80))

In [None]:
#Plot for group 6
group_6.plot(subplots =True, sharex = True, figsize = (30,80))

In [None]:
#Plot for group 7
group_7.plot(subplots =True, sharex = True, figsize = (40,40))

In [None]:
#sns.boxplot(prep5['FI-001'])

In [None]:
# Correlation matrix (Pearson, the pandas default) after interpolation,
# rendered with a colour gradient.
corr2 = prep5.corr(method='pearson')
corr2.style.background_gradient(cmap='coolwarm')

In [None]:
#Data Scaling with normalization
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
# transform data
#scaled_data = pd.DataFrame(scaler.fit_transform(prep5), columns = prep5.columns)
#print(scaled_data)

In [None]:
# Dataset after preparation and cleaning.
# Take a copy rather than an alias: the modelling section modifies `data`
# in place (replace/fillna with inplace=True), and without the copy those
# edits would also rewrite `prep5`, changing the results of any preparation
# cell that is re-run afterwards.
data = prep5.copy()

# MACHINE LEARNING MODEL : RANDOM FOREST REGRESSION

In [None]:
# Turn infinite values into NaN, then overwrite every NaN with -999,
# modifying `data` in place.
# NOTE(review): -999 is a sentinel, not a measurement; any row that
# receives it will distort scaling and regression targets - confirm that
# no NaN is expected to reach this point.
data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
data.fillna(value=-999, inplace=True)

In [None]:
#import data for RamdomForrestRegressor

#features = data.drop('CONVERSION', axis = 1).values.astype(float).reshape(-1, 63)
#labels = data['CONVERSION'].values.astype(float)
features = data.drop('CONVERSION', axis = 1)
labels = data['CONVERSION']

# Convert to numpy array
features = np.array(features)

# Saving feature names for later use
feature_list = list(data.drop('CONVERSION', axis = 1).columns)

In [None]:
print(features.shape, labels.shape)

In [None]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [None]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

In [None]:
# Train the random forest and report how long the fit takes.
# The original bare `%timeit` line carried no statement to time, so the
# fit is timed explicitly here instead.
from time import perf_counter
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
fit_start = perf_counter()
regressor.fit(train_features, train_labels)
print('Fit time: %.2f s' % (perf_counter() - fit_start))
print(train_features.shape, train_labels.shape)

In [None]:
train_features

In [None]:
y_pred = regressor.predict(test_features)
y_pred

In [None]:
# evaluate predictions
from sklearn import metrics
from sklearn.metrics import accuracy_score

print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(test_labels, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test_labels, y_pred)))

In [None]:
# Calculate the absolute errors between predictions and test labels
errors = abs(y_pred - test_labels)
# Print out the mean absolute error (mae), rounded to two decimals.
# The value is in the units of the 'CONVERSION' label; the previous
# 'degrees.' suffix did not describe this target and has been removed.
print('Mean Absolute Error:', round(np.mean(errors), 2))

# MACHINE LEARNING MODEL : SUPPORT VECTOR REGRESSION

In [None]:
#1 Feature Scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaling statistics on the training rows only. Fitting on the whole
# dataset lets the test rows influence the mean/std used for training (data
# leakage). The row selection of train_test_split depends only on the number
# of rows, test_size and random_state, so the same settings as the x/y split
# performed below (test_size = 0.25, random_state = 42) pick the same
# training rows here - keep the two calls in sync.
scaler_train_rows, _ = train_test_split(data, test_size = 0.25, random_state = 42)
scaler.fit(scaler_train_rows)

# transform data (all rows, using the training-set statistics)
svr_data = pd.DataFrame(scaler.transform(data), columns = data.columns)

In [None]:
#2. Build the feature matrix and target vector for the SVR model

# Dropping the target column already yields a 2-D (rows, features) array, so
# no reshape is needed. The former hard-coded reshape(-1, 63) either raised
# or silently re-flowed values across rows whenever the number of feature
# columns was not exactly 63.
x = svr_data.drop('CONVERSION', axis = 1).values.astype(float)
y = svr_data['CONVERSION'].values.astype(float)
print(x.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

In [None]:
print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

In [None]:
from sklearn.svm import SVR
SVRModel = SVR(kernel = 'rbf')
SVRModel.fit(x_train, y_train)

In [None]:
svr_pred = SVRModel.predict(x_test)
svr_pred

In [None]:
# Evaluate the SVR predictions against the test targets.
from sklearn import metrics

svr_mae = metrics.mean_absolute_error(y_test, svr_pred)
svr_mse = metrics.mean_squared_error(y_test, svr_pred)
print('Mean Absolute Error:', svr_mae)
print('Mean Squared Error:', svr_mse)
print('Root Mean Squared Error:', np.sqrt(svr_mse))