In [None]:
# Reproducibility setup: one seed value is reused for every random-number source below.
# (Different seed values could be used at each stage; a single one is kept for simplicity.)
seed_value= 0

# 5 steps that need to be done for the code to be reproducible.

# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value
# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so assigning it
# here presumably only affects child processes, not this already-running kernel - confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator to a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator to a fixed value
# (numpy is aliased as `nm` throughout this notebook, not the usual `np`)
import numpy as nm
nm.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator to a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)

# 5. Configure a new global `tensorflow` session limited to one intra-op and one inter-op thread
# NOTE(review): this uses the TF1 compat session API; whether these thread limits apply to
# Keras models built later in this notebook is not visible from here - confirm.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

# List the GPU devices TensorFlow can see (an empty list means no GPU is available)
print("GPU Available:", tf.config.list_physical_devices("GPU"))

# List the CPU devices TensorFlow can see
print("CPU Available:", tf.config.list_physical_devices("CPU"))

#Import all the necessary libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler 
from keras.models import Sequential #importing Sequential Model.It uses tensorflow backend and you need to have tensorflow setup. 
from keras.layers import Dense #Importing default dense layer
from sklearn.utils import shuffle
from keras.regularizers import l1, l2
from keras.optimizers import SGD
from sklearn.metrics import accuracy_score
import time
from matplotlib import pyplot

# Restore the variables this notebook needs from the IPython `%store` database.
# They must have been saved with `%store` beforehand (the comments below say they
# come from the random forest code); a missing entry leaves the name undefined.
# random_forest_data holds the chemistry and microalloy element compositions plus the
# modified output column (per the original author's note).
%store -r random_forest_data 
%store -r top_features_data
rows,cols=random_forest_data.shape # author's note: 161 statistical features and one output column - TODO confirm
# Bare expression: it is not the last line of the cell, so by default nothing is displayed;
# its only effect is a NameError if the variable was not restored.
top_features_data #coming from random forest code

%store -r feature_imp_RF 
# feature_imp_RF is described as a dataframe of features and their importance scores,
# produced by Random_Forest_Code; only its "Feature" column is read here.

total_features=feature_imp_RF["Feature"] # every feature listed by the random forest step (author's note: all 161)
total_features_list= total_features.tolist() # plain Python list, iterated by forward_selection

# Last expression of the cell: displays how many features were loaded.
total_features.shape

In [None]:
def training_data(new_features,n_batch):
    """Train a one-hidden-layer binary classifier on a feature subset and score it.

    The rows of `random_forest_data` are shuffled with a fixed seed, split 80/20
    into train and test parts, scaled with a `RobustScaler` fitted on the train
    part only, and used to fit a small Keras network (50 ReLU units followed by
    one sigmoid output, both L2-regularised) for 200 epochs.

    Parameters
    ----------
    new_features : list
        Column labels of `random_forest_data` to use as model inputs.
    n_batch : int
        Batch size passed to `fit`.

    Returns
    -------
    tuple
        `(new_accuracy, history)`: the accuracy on the 20% test part at a 0.5
        probability threshold, and the object returned by `fit`.

    Notes
    -----
    The test part is also passed to `fit` as `validation_data`, so the
    `val_*` curves in `history` and the returned accuracy describe the same rows.
    """
    #1 Shuffle the feature columns and the labels with the SAME permutation.
    # Shuffling only the features and then slicing the labels from the unshuffled
    # frame pairs each feature row with another row's label.
    new_data, labels = shuffle(
        random_forest_data.loc[:, new_features],
        random_forest_data['OUTPUT'],
        random_state=42,
    )

    #2 Split into train (first 80% of the shuffled rows) and test (the rest)
    new_train_size = int(new_data.shape[0] * 0.8)

    new_train_X = new_data.iloc[:new_train_size].to_numpy().astype(nm.float32)  # 2D array
    new_test_X = new_data.iloc[new_train_size:].to_numpy().astype(nm.float32)  # 2D array

    # Labels become column vectors (n_rows, 1) to match the single sigmoid output
    new_train_Y = labels.iloc[:new_train_size].to_numpy().astype(nm.float32).reshape(-1, 1)
    new_test_Y = labels.iloc[new_train_size:].to_numpy().astype(nm.float32).reshape(-1, 1)

    #3 Scale the data; the scaler is fitted on the train part only
    scaler = RobustScaler()
    new_train_X = scaler.fit_transform(new_train_X)
    new_test_X = scaler.transform(new_test_X)

    #4 Create a new model with the updated feature set
    new_model = Sequential()
    new_model.add(Dense(50, input_dim=len(new_features), activation='relu',
                        kernel_regularizer=l2(0.01), kernel_initializer='he_uniform'))  # one hidden layer
    new_model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))  # output layer

    #5 Compile the new model
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras releases reject.
    new_model.compile(loss='binary_crossentropy',
                      optimizer=SGD(learning_rate=0.01, momentum=0.9),
                      metrics=['accuracy'])

    #6 Train the new model
    history = new_model.fit(new_train_X, new_train_Y, epochs=200, batch_size=n_batch,
                            validation_data=(new_test_X, new_test_Y))

    #7 Evaluate the new model on the test part
    testPredict_probs_new = new_model.predict(new_test_X)
    threshold = 0.5  # Adjust this threshold as needed
    y_new_pred = (testPredict_probs_new > threshold).astype(int)
    new_accuracy = accuracy_score(new_test_Y, y_new_pred)

    return new_accuracy, history
    

In [None]:
def _save_learning_curve(history, n_batch, accuracy, feature, count):
    """Plot train/validation accuracy for one accepted feature and save it as a PNG."""
    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    pyplot.title('batch=' + str(n_batch) + ',Accuracy=' + str(round(accuracy * 100, 4)) + '\nFeature=' + str(feature), pad=-40, fontsize='10')
    pyplot.legend()
    pyplot.savefig(f'learning_curves_iteration{count}.png')
    pyplot.clf()  # clear the current figure so the next round starts from an empty plot


def forward_selection(n_batch):
    """Greedy forward feature selection driven by the accuracy from `training_data`.

    Each round trains one model per not-yet-selected feature (the selected
    features plus that candidate) and keeps the candidate with the highest
    accuracy. The search stops when the best candidate does not strictly improve
    on the accuracy of the previous round, or when no candidate features remain.
    After every accepted feature a learning-curve plot is written to
    `learning_curves_iteration<k>.png`.

    Parameters
    ----------
    n_batch : int
        Batch size forwarded to `training_data`.

    Returns
    -------
    tuple
        `(selected_features, selected_features_accuracy, training_time,
        temp_accuracy_store, temp_model_history, history)` where
        `selected_features` and `selected_features_accuracy` list the accepted
        features and the accuracy reached when each was added, `training_time`
        is the elapsed wall-clock time in seconds, `temp_accuracy_store` and
        `temp_model_history` hold the per-candidate results of the last round
        that was evaluated, and `history` belongs to the last accepted feature
        (`None` if nothing was selected).
    """
    # Start the timer
    start_time = time.time()

    selected_features = []           # starting with an empty feature set
    selected_features_accuracy = []  # accuracy reached when each feature was added
    accuracy = -nm.inf               # any real accuracy beats this, so round one always selects
    count = 0                        # number of accepted features, used in the plot file name

    # Defined before the loop so the return statement is valid even if no round runs.
    temp_accuracy_store = []
    temp_model_history = []
    history = None

    while True:
        # Candidates change every round: everything that has not been selected yet.
        remaining_features = [f for f in total_features_list if f not in selected_features]

        # Nothing left to try (all features selected, or the list was empty).
        # Without this guard the round would fail on an empty candidate list.
        if not remaining_features:
            break

        temp_accuracy_store = []
        temp_model_history = []
        for feature in remaining_features:
            # Add one feature to the feature set
            new_features = selected_features + [feature]
            print(feature)
            new_accuracy, new_model_history = training_data(new_features, n_batch)
            temp_accuracy_store.append(new_accuracy)
            temp_model_history.append(new_model_history)

        # argmax returns the first best candidate when several tie
        max_idx = int(nm.argmax(temp_accuracy_store))
        maximum_accuracy = temp_accuracy_store[max_idx]

        # Check for convergence. Stopping when there is no improvement.
        if maximum_accuracy <= accuracy:
            break

        # Accept the best candidate of this round as part of the baseline set
        best_feature = remaining_features[max_idx]
        accuracy = maximum_accuracy
        selected_features.append(best_feature)
        selected_features_accuracy.append(accuracy)
        history = temp_model_history[max_idx]
        count = count + 1

        # plot learning curves only for the feature set selected each time
        _save_learning_curve(history, n_batch, accuracy, best_feature, count)

    # Stop the timer and calculate the elapsed time
    end_time = time.time()
    training_time = end_time - start_time
    return selected_features, selected_features_accuracy, training_time,temp_accuracy_store,temp_model_history,history

In [None]:
# Run the forward selection with a batch size of 472 and unpack its six results.
# NOTE(review): 472 is a magic number - presumably chosen relative to the training-set size; confirm.
# Every candidate feature in every round trains a 200-epoch model, so this cell can take a long time.
selected_features,selected_features_accuracy,training_time,temp_accuracy_store,temp_model_history,history=forward_selection(472)

In [None]:
# Per-epoch validation accuracy of the model trained when the last feature was accepted.
history.history['val_accuracy']

In [None]:
# The first feature picked by the forward selection.
selected_features[0]

In [None]:
# training_time is measured in seconds by forward_selection; dividing by 60 reports minutes.
print(training_time/60)

In [None]:
# Test accuracy reached with only the first selected feature.
selected_features_accuracy[0]

In [None]:
# Persist the selection results in the IPython `%store` database so that other
# notebooks or kernels can restore them with `%store -r`.
%store selected_features
%store selected_features_accuracy

# Last expression of the cell: displays the selected features in the order they were added.
selected_features

In [None]:
# NOTE(review): str() called with three positional arguments is str(object, encoding, errors),
# which only decodes bytes-like objects, so this call raises TypeError as written.
# It looks like leftover scratch work (perhaps '40', '59', 'max' were meant to be joined
# into one string) - confirm the intent, then fix or delete this cell so the notebook
# survives Restart & Run All.
str('40','59','max')