# Thermoacoustic stability prediction using binary classification algorithms
### Part 1: Models

The objective of this project is to predict the overall thermoacoustic stability of a combustor using binary classification algorithms.

---

Data source: Artificial data generated with a modified version of OSCILOS_lite (freely available at https://github.com/MorgansLab/OSCILOS_lite)

---

Author: Renaud Gaudron\
Version: 5.0\
Latest update: 18/08/21

## Step 1: Preprocessing of the artificial data

In [1]:
# Importing the relevant libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

In [2]:
# Opening and printing to log file

sourceFile = open('Binary_Classification_log.txt', 'w')

print("----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Thermoacoustic stability prediction using binary classification algorithms\n".upper(),file=sourceFile)
print("----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Data source: Artificial data generated with a modified version of OSCILOS_lite (freely available at https://github.com/MorgansLab/OSCILOS_lite)\n",file=sourceFile)
print("Author: Renaud Gaudron\n",file=sourceFile)
print("Version: 5.0\n",file=sourceFile)
print("Latest update: 18/08/21\n",file=sourceFile)
print("----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Step 1 : Preprocessing of the artificial data".upper(),file=sourceFile)

In [3]:
# Loading the data

data = pd.read_csv("Cases_large.txt", sep="\t",usecols=list(range(20))) # Storing data as a Pandas framework

headers=["X0","X1","X2","X3","R0","R1","R2","R3","Gain","Phase","0_50Hz","50_100Hz","100_150Hz","150_200Hz","200_250Hz","250_300Hz","300_350Hz","350_400Hz","400_450Hz","450_500Hz"] 
data.columns = headers # Adding headers to the data

data["Stability"] = np.heaviside(data["0_50Hz"]+data["50_100Hz"]+data["100_150Hz"]+data["150_200Hz"]+data["200_250Hz"]+data["250_300Hz"]+data["300_350Hz"]+data["350_400Hz"]+data["400_450Hz"]+data["450_500Hz"],0).astype('int')

# The stability column indicates whether a combustor is practically stable (0) or a practically unstable (1).

# Let's drop the unused columns
data=data.drop(columns=['X0','R3',"0_50Hz","50_100Hz","100_150Hz","150_200Hz","200_250Hz","250_300Hz","300_350Hz","350_400Hz","400_450Hz","450_500Hz"])

# X represents the absolute axial position of the element. Let's compute Li=Xi-Xi-1 corresponding to the length of each element.

data['X3']=data['X3']-data['X2']
data['X2']=data['X2']-data['X1']

data.rename(columns={"X1":"L1"}, inplace=True) # We just need to change the heading for this one
data.rename(columns={"X2":"L2"}, inplace=True) # We just need to change the heading for this one
data.rename(columns={"X3":"L3"}, inplace=True) # We just need to change the heading for this one

#data=data[:100] # FOR VALIDATION ONLY - Select the first 100 examples

print("\nTop 10 lines of the pre-processed data:\n",file=sourceFile)
print(round(data.head(10),4).to_markdown(index = False, tablefmt="fancy_grid"),file=sourceFile)

data.head(10) # Display the top 10 lines

Unnamed: 0,L1,L2,L3,R0,R1,R2,Gain,Phase,Stability
0,0.215282,0.323424,0.170701,0.082752,0.053804,0.010102,0.885752,2.551514,1
1,0.140167,0.188399,0.423352,0.089381,0.08926,0.078452,0.518373,3.134972,1
2,0.166067,0.111135,0.155982,0.023319,0.084748,0.05522,0.61551,2.905676,1
3,0.159184,0.338507,0.276718,0.020077,0.042646,0.056869,0.875384,4.900099,1
4,0.488712,0.281809,0.156784,0.081084,0.035787,0.058883,0.765332,1.543422,1
5,0.229345,0.362074,0.343772,0.060946,0.033539,0.052616,0.829335,3.811601,1
6,0.172424,0.148255,0.52572,0.045043,0.078339,0.020235,0.982497,4.370344,1
7,0.631154,0.050756,0.200507,0.069525,0.018376,0.017569,0.756406,2.116734,1
8,0.51198,0.048393,0.31964,0.035053,0.02252,0.09856,0.73309,4.961723,1
9,0.502112,0.200094,0.108543,0.033783,0.069184,0.019358,0.757796,3.491583,1


In [4]:
data.dtypes # Checking the data types for each column

L1           float64
L2           float64
L3           float64
R0           float64
R1           float64
R2           float64
Gain         float64
Phase        float64
Stability      int64
dtype: object

In [5]:
print("\n{} stable configurations - {} unstable configurations.\n".format(data["Stability"].value_counts()[0],data["Stability"].value_counts()[1]),file=sourceFile)
print("\n{} stable configurations - {} unstable configurations.\n".format(data["Stability"].value_counts()[0],data["Stability"].value_counts()[1]))


149701 stable configurations - 401656 unstable configurations.



In [6]:
# Computing the Kendall correlation coefficients

Kendall_corr=data.corr(method='kendall') # Compute the correlation between the different variables
Kendall_corr.to_csv(r'Kendall_correlations_Binary.csv', index = False) # Storing to a data file

print("Correlations between the different variables:\n",file=sourceFile)
print(round(Kendall_corr,3).to_markdown(index = True, tablefmt="fancy_grid"),file=sourceFile)

## Step 2: Machine Learning setup

In [7]:
print("\n----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Step 2 : Machine Learning setup\n".upper(),file=sourceFile)

# Defining the feature matrix and response vectors

X=data[['L1','L2','L3','R0','R1','R2','Gain','Phase']].values # Feature Matrix
y=data["Stability"].values # Response vector

print(X.shape)
print(y.shape)

(551357, 8)
(551357,)


In [8]:
# Normalise the feature vector

X=preprocessing.StandardScaler().fit(X).transform(X)

In [9]:
# Train/test split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=3)
print('Size of training set Feature Matrix {}'.format(X_train.shape),file=sourceFile)
print('Size of testing set Feature Matrix {}'.format(X_test.shape),file=sourceFile)
print('Size of training set Feature Matrix {}'.format(X_train.shape))
print('Size of testing set Feature Matrix {}'.format(X_test.shape))

Size of training set Feature Matrix (441085, 8)
Size of testing set Feature Matrix (110272, 8)


## Step 3: K-nearest neighbours

In [10]:
print("\n----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Step 3 : K-nearest neighbours\n".upper(),file=sourceFile)

## Building the K nearest neighbours (KNN) models

t = time.time()

Ks=50 # Maximum number of neighbours

parameters_Kneigh = [
    {
        'n_neighbors': range(1,Ks+1),
    },
]

#KNeighborsClassifier().get_params()

search_Kneigh = GridSearchCV(KNeighborsClassifier(), parameters_Kneigh, scoring='accuracy', n_jobs=-1)
search_Kneigh.fit(X_train,y_train)

pred_Kneigh_test = search_Kneigh.predict(X_test)
mean_acc_test=metrics.accuracy_score(y_test,pred_Kneigh_test)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained for",search_Kneigh.best_params_['n_neighbors'], "neighbour(s)",file=sourceFile)
print("Elapsed time:",round((time.time() - t),2),"seconds",file=sourceFile)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained for",search_Kneigh.best_params_['n_neighbors'], "neighbour(s)")
print("\nElapsed time:",round((time.time() - t),2),"seconds")

result_Kneigh=search_Kneigh.cv_results_['mean_test_score']
np.savetxt("Results_Kneigh_Binary.csv", result_Kneigh, delimiter=",") # Saving results to a separate file

The highest testing accuracy is 99.295 % obtained for 1 neighbour(s)

Elapsed time: 747.94 seconds


## Step 4: Decision Trees

In [11]:
print("\n----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Step 4 : Decision Trees\n".upper(),file=sourceFile)

## Building the Decision Trees models

t = time.time()

Ks=50 # Maximum number of layers

parameters_Tree = [
    {
        'max_depth': range(1,Ks+1),
    },
]

search_Tree = GridSearchCV(DecisionTreeClassifier(criterion="entropy"), parameters_Tree, scoring='accuracy', n_jobs=-1)
search_Tree.fit(X_train,y_train)

pred_Tree_test = search_Tree.predict(X_test)
mean_acc_test=metrics.accuracy_score(y_test,pred_Tree_test)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained for",search_Tree.best_params_['max_depth'], "layer(s)",file=sourceFile)
print("Elapsed time:",round((time.time() - t),2),"seconds",file=sourceFile)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained for",search_Tree.best_params_['max_depth'], "layer(s)")
print("\nElapsed time:",round((time.time() - t),2),"seconds")

result_Tree=search_Tree.cv_results_['mean_test_score']
np.savetxt("Results_Tree_Binary.csv", result_Tree, delimiter=",") # Saving results to a separate file

The highest testing accuracy is 99.381 % obtained for 46 layer(s)

Elapsed time: 93.13 seconds


## Step 5: Random forests

In [12]:
print("\n----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Step 5 : Random forests\n".upper(),file=sourceFile)

## Building the Random forest models for various numbers of trees in the forest

t = time.time()

Ks=100 # Maximum number of trees in the forest

parameters_Forest = [
    {
        'n_estimators': range(1,Ks+1),
    },
]

search_Forest = GridSearchCV(RandomForestClassifier(criterion='entropy'), parameters_Forest, scoring='accuracy', n_jobs=-1)
search_Forest.fit(X_train,y_train)

pred_Forest_test = search_Forest.predict(X_test)
mean_acc_test=metrics.accuracy_score(y_test,pred_Forest_test)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained for",search_Forest.best_params_['n_estimators'], "tree(s)",file=sourceFile)
print("Elapsed time:",round((time.time() - t),2),"seconds",file=sourceFile)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained for",search_Forest.best_params_['n_estimators'], "tree(s)")
print("\nElapsed time:",round((time.time() - t),2),"seconds")

result_Forest=search_Forest.cv_results_['mean_test_score']
np.savetxt("Results_Forest_Binary.csv", result_Forest, delimiter=",") # Saving results to a separate file



The highest testing accuracy is 99.555 % obtained for 95 tree(s)

Elapsed time: 2530.31 seconds


## Step 6: Multilayer Perceptrons

In [13]:
print("\n----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)
print("Step 6 : Multilayer Perceptrons\n".upper(),file=sourceFile)

## Building the Multilayer Perceptron models for various hyperparameters

t = time.time()

parameters_MLP = [
    {
        'alpha': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3],  # Regularisation parameter
        'learning_rate_init': [0.0001, 0.0003, 0.001, 0.003, 0.006, 0.01], # Initial learning rate
    },
]

search_MLP = GridSearchCV(MLPClassifier(solver='adam', activation='relu', hidden_layer_sizes=(100,100,100),random_state=1, max_iter=10000, verbose=False, tol=1e-4, batch_size='auto'), parameters_MLP, scoring='accuracy', n_jobs=-1)
search_MLP.fit(X_train,y_train)

pred_MLP_test = search_MLP.predict(X_test)
mean_acc_test=metrics.accuracy_score(y_test,pred_MLP_test)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained with alpha=",search_MLP.best_params_['alpha'], "and initial learning rate=",search_MLP.best_params_['learning_rate_init'],file=sourceFile)
print("Elapsed time:",round((time.time() - t),2),"seconds",file=sourceFile)

print("The highest testing accuracy is",round(100*mean_acc_test,3),"% obtained with alpha=",search_MLP.best_params_['alpha'], "and initial learning rate=",search_MLP.best_params_['learning_rate_init'])
print("\nElapsed time:",round((time.time() - t),2),"seconds")
print("\n----------------------------------------------------------------------------------------------------------------------------\n",file=sourceFile)

The highest testing accuracy is 98.235 % obtained with alpha= 0.001 and initial learning rate= 0.001

Elapsed time: 28678.36 seconds


## Step 7: Closing the file

In [14]:
sourceFile.close()