# Verifying if the SSPL Column follows a Normal Distribution

If it does, we can use the standard deviation (SD) to define the acceptable range as Actual Value ± y·SD, where y is an appropriate scaling factor.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

# Read the airfoil self-noise measurements and keep the SSPL column, which
# is the variable whose distribution is examined below.
data = pd.read_csv("AirfoilSelfNoise.csv")
sspl_data = data['SSPL']

# Visual check 1: histogram of SSPL (30 bins, black bar edges).
hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
hist_ax.hist(sspl_data, bins=30, edgecolor='k')
hist_ax.set_xlabel('SSPL')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Histogram of SSPL')
plt.show()

# Visual check 2: Q-Q plot of SSPL against a normal distribution.
# probplot draws onto the axes it is given; the title is set afterwards so
# it replaces the default one probplot writes.
qq_fig, qq_ax = plt.subplots(figsize=(6, 6))
stats.probplot(sspl_data, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot of SSPL')
plt.show()

# Numerical check: Shapiro-Wilk test of normality on the same column.
shapiro_result = stats.shapiro(sspl_data)
statistic, p_value = shapiro_result
print("Shapiro-Wilk Test:")
print("Statistic:", statistic)
print("p-value:", p_value)

# Not a Normal Distribution and Trying IQR

Based on the histogram, we can conclude that the SSPL column is negatively skewed, so we cannot use the SD to define the range. Instead, we can try the interquartile range (IQR), which is more resistant to skew.

In [None]:
import numpy as np

# Work on the SSPL column of the dataset loaded earlier in the notebook.
sspl_data = data['SSPL']

# Spread of SSPL measured by the standard deviation.
std = np.std(sspl_data)
print("Standard Deviation:", std)

# Spread of SSPL measured by the interquartile range: the distance between
# the 25th and 75th percentiles, both obtained from one percentile call.
q1, q3 = np.percentile(sspl_data, [25, 75])
iqr = q3 - q1
print("Interquartile Range (IQR):", iqr)

# Finding a suitable factor for IQR

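Since the IQR is 9.8 (close to 10), we cannot use it as is. Decibels are a logarithmic unit: an increase of 10 dB corresponds to a tenfold increase in sound intensity, so a tolerance of a full IQR on either side of the actual value would be far too wide. I have therefore set the factor to 0.5, giving an acceptable range of Actual Value ± 0.5·IQR (about ±4.9 dB); if accuracy is of the highest priority, this factor can be decreased.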

# Starting with Model Creation

First comes data processing to ensure all the data is in an acceptable format, then the model itself, and finally an evaluation based on how many values in the test set fall within the acceptable range.

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten
from sklearn.metrics import mean_absolute_error, mean_squared_error
from keras.callbacks import EarlyStopping

In [None]:
# Step 1: Read the pre-split training, validation and test sets
train_data, val_data, test_data = (
    pd.read_csv(path)
    for path in ('training_set.csv', 'validation_set.csv', 'test_set.csv')
)

# Step 2: Separate the five input columns from the SSPL target in each split
feature_columns = ['f', 'alpha', 'c', 'U_infinity', 'delta']

X_train, y_train = train_data[feature_columns].values, train_data['SSPL'].values
X_val, y_val = val_data[feature_columns].values, val_data['SSPL'].values
X_test, y_test = test_data[feature_columns].values, test_data['SSPL'].values

In [None]:
# Step 3: Normalize the features with a StandardScaler.
# The scaler is fitted on the training split only; the validation and test
# splits are transformed with those same fitted parameters.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test = scaler.transform(X_val), scaler.transform(X_test)

# Insert a length-1 middle axis so each split has shape (samples, 1, 5),
# matching the input_shape=(1, 5) the model is built with.
X_train, X_val, X_test = (
    split.reshape(-1, 1, 5) for split in (X_train, X_val, X_test)
)

In [None]:
# Step 4: Build the model
# A bidirectional LSTM over the (1, 5) input, followed by a stack of ReLU
# dense layers that halves in width (64 -> 32 -> 16 -> 8) and a single
# linear output unit for the predicted SSPL value.
model = Sequential([
    Bidirectional(LSTM(128, activation='relu'), input_shape=(1, 5)),
    *(Dense(width, activation='relu') for width in (64, 32, 16, 8)),
    Dense(1),
])

In [None]:
# Step 5: Configure training -- mean squared error loss, Adam optimizer
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)

In [None]:
# Step 6: Train the model
# Stop once the validation loss has not improved for 10 consecutive epochs.
# restore_best_weights=True makes the callback put back the weights from the
# epoch with the lowest validation loss; without it the model keeps the
# weights of the last epoch run, i.e. `patience` epochs past the best one,
# and the test-set predictions below would come from that worse state.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Step 7: Evaluate the model
# Predict SSPL for the held-out test set; the result is compared against
# y_test in the next cell.
y_pred = model.predict(X_test)

In [None]:
# Count how many predictions fall within the range ActualValue +- IQR*0.5.
# `iqr` is the interquartile range of SSPL computed earlier in the notebook.
tolerance = iqr * 0.5

# Flatten the predictions so each one is a plain scalar; indexing y_pred per
# sample otherwise yields one-element arrays, which print as "[x]" and make
# the range check depend on single-element array truthiness.
# Assumes y_pred holds exactly one value per test sample -- TODO confirm.
predictions = y_pred.reshape(-1)

# Print actual vs predicted values
for actual, predicted in zip(y_test, predictions):
    print(f"Actual: {actual}, Predicted: {predicted}")

# Element-wise check of lower <= prediction <= upper for every test sample.
within_range = (y_test - tolerance <= predictions) & (predictions <= y_test + tolerance)
count_within_range = int(within_range.sum())

# Report the count and the fraction separately so the labels match the values;
# guard against an empty test set to avoid dividing by zero.
total = len(y_test)
fraction_within_range = count_within_range / total if total else 0.0

print("Count within range:", count_within_range)
print("Fraction within range:", fraction_within_range)

# Statistics of 10 Runs

Predictions within Range: 91,95,93,93,92,86,86,89,86,89
Mean, μ: 90
Standard Deviation, σ: 3.1304951684997

From this, we can conclude that the model performs well, with little variation between runs.