#  Preprocessing Steps
Preparing the data to ensure it's clean, consistent, and suitable for training an ANN model:

 Data Import
Dataset is read into a DataFrame for analysis.

 Initial Inspection
Data types, column names, and unique values are checked to understand the structure.

 Missing Values Check
Dataset is scanned for null values; none found, so no imputation is necessary.

 Categorical to Numerical Conversion
LabelEncoder is applied to transform categorical variables (like Position, Education, etc.) into numerical format for compatibility with the ANN.

 Feature Scaling
MinMaxScaler is used to normalize the features to a 0–1 range, which improves convergence and stability during neural network training.

 Splitting Features and Target
Data is divided into:

- X: Features

- y: Target variable (Salary)



In [2]:
import pandas as pd
import numpy as np

import optuna

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD, Adam, RMSprop, Adagrad, Adadelta, Nadam

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [3]:
data = pd.read_csv(r'Salary_Data.csv')
data

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [4]:
data.columns

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [5]:
data.describe(include = 'all')

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
count,6702.0,6702,6701,6702,6701.0,6699.0
unique,,3,7,193,,
top,,Male,Bachelor's Degree,Software Engineer,,
freq,,3674,2267,518,,
mean,33.620859,,,,8.094687,115326.964771
std,7.614633,,,,6.059003,52786.183911
min,21.0,,,,0.0,350.0
25%,28.0,,,,3.0,70000.0
50%,32.0,,,,7.0,115000.0
75%,38.0,,,,12.0,160000.0


In [6]:
data.isnull().sum()

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

In [7]:
data = data.dropna()

In [8]:
data

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [9]:
data['Education Level'].value_counts()

Education Level
Bachelor's Degree    2265
Master's Degree      1572
PhD                  1368
Bachelor's            756
High School           448
Master's              288
phD                     1
Name: count, dtype: int64

In [10]:
data['Education Level'] = np.where(data['Education Level'] == "Bachelor's", "Bachelor's Degree",
                   np.where(data['Education Level'] == "Master's", "Master's Degree",
                   np.where(data['Education Level'] == "phD", "PhD", data['Education Level'])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Education Level'] = np.where(data['Education Level'] == "Bachelor's", "Bachelor's Degree",


In [11]:
data

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's Degree,Software Engineer,5.0,90000.0
1,28.0,Female,Master's Degree,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's Degree,Sales Associate,7.0,60000.0
4,52.0,Male,Master's Degree,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [12]:
data['Education Level'].value_counts()

Education Level
Bachelor's Degree    3021
Master's Degree      1860
PhD                  1369
High School           448
Name: count, dtype: int64

In [13]:
data['avg_experience_by_job'] = data.groupby('Job Title')['Years of Experience'].transform('mean')
data['count_by_gender_edu'] = data.groupby(['Gender', 'Education Level'])['Age'].transform('count')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['avg_experience_by_job'] = data.groupby('Job Title')['Years of Experience'].transform('mean')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['count_by_gender_edu'] = data.groupby(['Gender', 'Education Level'])['Age'].transform('count')


In [14]:
data

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,avg_experience_by_job,count_by_gender_edu
0,32.0,Male,Bachelor's Degree,Software Engineer,5.0,90000.0,4.449807,1823
1,28.0,Female,Master's Degree,Data Analyst,3.0,65000.0,4.969697,1068
2,45.0,Male,PhD,Senior Manager,15.0,150000.0,17.500000,873
3,36.0,Female,Bachelor's Degree,Sales Associate,7.0,60000.0,1.428571,1198
4,52.0,Male,Master's Degree,Director,20.0,200000.0,20.000000,790
...,...,...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0,12.068182,496
6700,32.0,Male,High School,Sales Associate,3.0,50000.0,1.428571,185
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0,10.694030,1198
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0,9.454902,790


In [15]:
data.drop('Job Title',axis = 1,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('Job Title',axis = 1,inplace = True)


In [16]:
data = pd.get_dummies(data , drop_first = True,dtype = int)

In [17]:
data

Unnamed: 0,Age,Years of Experience,Salary,avg_experience_by_job,count_by_gender_edu,Gender_Male,Gender_Other,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,32.0,5.0,90000.0,4.449807,1823,1,0,0,0,0
1,28.0,3.0,65000.0,4.969697,1068,0,0,0,1,0
2,45.0,15.0,150000.0,17.500000,873,1,0,0,0,1
3,36.0,7.0,60000.0,1.428571,1198,0,0,0,0,0
4,52.0,20.0,200000.0,20.000000,790,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
6699,49.0,20.0,200000.0,12.068182,496,0,0,0,0,1
6700,32.0,3.0,50000.0,1.428571,185,1,0,1,0,0
6701,30.0,4.0,55000.0,10.694030,1198,0,0,0,0,0
6702,46.0,14.0,140000.0,9.454902,790,1,0,0,1,0


In [18]:
targets = data['Salary']

inputs = data.drop(['Salary'],axis=1)

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(inputs)

scaled = scaler.transform(inputs)

inputs_scaled = pd.DataFrame(scaled, columns=inputs.columns)

inputs_scaled

Unnamed: 0,Age,Years of Experience,avg_experience_by_job,count_by_gender_edu,Gender_Male,Gender_Other,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,-0.213129,-0.510769,-0.811518,1.395363,0.908059,-0.045766,-0.267731,-0.620045,-0.506849
1,-0.738393,-0.840811,-0.695782,-0.144758,-1.101250,-0.045766,-0.267731,1.612785,-0.506849
2,1.493980,1.139440,2.093664,-0.542537,0.908059,-0.045766,-0.267731,-0.620045,1.972973
3,0.312135,-0.180727,-1.484093,0.120429,-1.101250,-0.045766,-0.267731,-0.620045,-0.506849
4,2.413192,1.964544,2.650204,-0.711849,0.908059,-0.045766,-0.267731,1.612785,-0.506849
...,...,...,...,...,...,...,...,...,...
6693,2.019244,1.964544,0.884454,-1.311578,-1.101250,-0.045766,-0.267731,-0.620045,1.972973
6694,-0.213129,-0.840811,-1.484093,-1.945985,0.908059,-0.045766,3.735089,-0.620045,-0.506849
6695,-0.475761,-0.675790,0.578546,0.120429,-1.101250,-0.045766,-0.267731,-0.620045,-0.506849
6696,1.625296,0.974419,0.302696,-0.711849,0.908059,-0.045766,-0.267731,1.612785,-0.506849


In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(inputs_scaled, targets, test_size=0.2, random_state=42)

#  Modeling with ANN + Optuna Optimization
- Building and tuning an Artificial Neural Network using Optuna for automated hyperparameter search:

 Hyperparameter Optimization via Optuna

- An objective function is defined to construct a Keras Sequential ANN using variable parameters.

- Optuna runs multiple trials to find the best combination of:

- Number of hidden layers

- Neurons per layer

- Dropout rate

- Learning rate

- Activation functions

- The trial with the lowest validation loss is selected as the best configuration.

 Model Construction with Best Params

- A new ANN model is built using the best hyperparameters suggested by Optuna.

- Adam optimizer is configured with the tuned learning rate.

- Dropout layers are added to mitigate overfitting.

 Model Training & Evaluation

- Model is trained on the preprocessed dataset.

Evaluated using:

- Mean Absolute Error (MAE)

- R² score

- Training vs validation loss to monitor overfitting

In [21]:
def create_model(trial):
    # Building artificial neural network
    model = Sequential()

    # Add 3 hidden layers
    model.add(Dense(units=trial.suggest_int('units_layer1', 6, 128), activation='relu'))
    model.add(Dense(units=trial.suggest_int('units_layer2', 6, 128), activation='relu'))
    model.add(Dense(units=trial.suggest_int('units_layer3', 6, 128), activation='relu'))

    # Output layer for regression – no activation or activation='linear'
    model.add(Dense(units=1, activation='relu'))  # default activation='linear'

    # Suggest hyperparameters for the optimizer
    optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'sgd', 'rmsprop', 'adagrad'])
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    
    if optimizer_name == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_name == 'sgd':
        optimizer = SGD(learning_rate=learning_rate)
    elif optimizer_name == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    elif optimizer_name == 'adagrad':
        optimizer = Adagrad(learning_rate=learning_rate)

    # Compile model for regression
    model.compile(optimizer=optimizer, loss='mean_absolute_error', metrics=['mae'])
    
    return model


In [22]:

import optuna

def optimal(trial):
    # Hyperparameters
    epochs = trial.suggest_int('epochs', 10, 50)
    batch_size = trial.suggest_int('batch_size', 16, 64)
    
    model = create_model(trial)
    
    # Fit model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    
    # Predict
    y_pred = model.predict(X_test).flatten()  # flatten vacib ola bilər

    # Evaluation: Use R2 for regression
    r2 = r2_score(y_test, y_pred)
    
    return r2 

# Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(optimal, n_trials=10)

# Output
print(f"Best trial R2 score: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")


[I 2025-05-14 23:45:10,330] A new study created in memory with name: no-name-ca57b43f-fc34-4bc9-83f6-86065db5719e
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:45:20,763] Trial 0 finished with value: 0.7414145375927148 and parameters: {'epochs': 38, 'batch_size': 31, 'units_layer1': 37, 'units_layer2': 75, 'units_layer3': 88, 'optimizer': 'sgd', 'learning_rate': 0.00020069179392251317}. Best is trial 0 with value: 0.7414145375927148.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:45:28,086] Trial 1 finished with value: -4.614183603218622 and parameters: {'epochs': 28, 'batch_size': 35, 'units_layer1': 14, 'units_layer2': 32, 'units_layer3': 104, 'optimizer': 'sgd', 'learning_rate': 3.2315773464184205e-05}. Best is trial 0 with value: 0.7414145375927148.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:45:35,625] Trial 2 finished with value: 0.6106724844858638 and parameters: {'epochs': 33, 'batch_size': 53, 'units_layer1': 51, 'units_layer2': 73, 'units_layer3': 97, 'optimizer': 'rmsprop', 'learning_rate': 0.00038443287569014115}. Best is trial 0 with value: 0.7414145375927148.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:45:40,924] Trial 3 finished with value: -4.344488825112865 and parameters: {'epochs': 19, 'batch_size': 62, 'units_layer1': 119, 'units_layer2': 97, 'units_layer3': 74, 'optimizer': 'rmsprop', 'learning_rate': 0.00012251414688891761}. Best is trial 0 with value: 0.7414145375927148.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:45:49,359] Trial 4 finished with value: 0.7547619816575366 and parameters: {'epochs': 24, 'batch_size': 40, 'units_layer1': 122, 'units_layer2': 42, 'units_layer3': 84, 'optimizer': 'rmsprop', 'learning_rate': 0.009241337420716698}. Best is trial 4 with value: 0.7547619816575366.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:46:12,963] Trial 5 finished with value: -4.614224808301512 and parameters: {'epochs': 46, 'batch_size': 16, 'units_layer1': 39, 'units_layer2': 95, 'units_layer3': 97, 'optimizer': 'rmsprop', 'learning_rate': 1.769671780159041e-05}. Best is trial 4 with value: 0.7547619816575366.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


[I 2025-05-14 23:46:20,464] Trial 6 finished with value: -4.614196194857699 and parameters: {'epochs': 36, 'batch_size': 59, 'units_layer1': 52, 'units_layer2': 101, 'units_layer3': 82, 'optimizer': 'adagrad', 'learning_rate': 1.2682330581371137e-05}. Best is trial 4 with value: 0.7547619816575366.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:46:24,598] Trial 7 finished with value: 0.6889860568568553 and parameters: {'epochs': 13, 'batch_size': 40, 'units_layer1': 35, 'units_layer2': 46, 'units_layer3': 6, 'optimizer': 'rmsprop', 'learning_rate': 0.006564116200997292}. Best is trial 4 with value: 0.7547619816575366.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:46:44,252] Trial 8 finished with value: -4.611159601630598 and parameters: {'epochs': 47, 'batch_size': 20, 'units_layer1': 33, 'units_layer2': 8, 'units_layer3': 70, 'optimizer': 'adam', 'learning_rate': 1.01309653857388e-05}. Best is trial 4 with value: 0.7547619816575366.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


[I 2025-05-14 23:46:52,247] Trial 9 finished with value: -4.614224808301512 and parameters: {'epochs': 45, 'batch_size': 63, 'units_layer1': 51, 'units_layer2': 64, 'units_layer3': 128, 'optimizer': 'adam', 'learning_rate': 1.268434204223978e-05}. Best is trial 4 with value: 0.7547619816575366.


Best trial R2 score: 0.7547619816575366
Best hyperparameters: {'epochs': 24, 'batch_size': 40, 'units_layer1': 122, 'units_layer2': 42, 'units_layer3': 84, 'optimizer': 'rmsprop', 'learning_rate': 0.009241337420716698}


In [26]:
best_params = study.best_trial.params

best_params

{'epochs': 24,
 'batch_size': 40,
 'units_layer1': 122,
 'units_layer2': 42,
 'units_layer3': 84,
 'optimizer': 'rmsprop',
 'learning_rate': 0.009241337420716698}

In [44]:
best_model = Sequential()
best_model.add(Dense(units=best_params['units_layer1'], activation='relu'))
best_model.add(Dense(units=best_params['units_layer2'], activation='relu'))
best_model.add(Dense(1, activation='relu'))

In [45]:
if best_params['optimizer'] == 'adam':
    best_optimizer = Adam(learning_rate=best_params['learning_rate'])
elif best_params['optimizer'] == 'sgd':
    best_optimizer = SGD(learning_rate=best_params['learning_rate'])
elif best_params['optimizer'] == 'rmsprop':
    best_optimizer = RMSprop(learning_rate=best_params['learning_rate'])
elif best_params['optimizer'] == 'adagrad':
    best_optimizer = Adagrad(learning_rate=best_params['learning_rate'])


In [46]:
best_model.compile(optimizer=best_optimizer, loss='mean_absolute_error', metrics=['mae'])

In [47]:
def evaluate(model, X_train, y_train, X_test, y_test):

    model.fit(X_train, y_train, epochs=25, batch_size=best_params['batch_size'])

    '''Predictions and probabilities for the training set'''

    y_train_prob = model.predict(X_train)

    '''Predictions and probabilities for the test set'''

    y_test_prob = model.predict(X_test)

    '''Calculate metrics for the training set''' 

    mae_train = mean_absolute_error(y_train, y_train_prob)
    r2_train = r2_score(y_train, y_train_prob)


    '''Calculate metrics for the test set'''

    mae_test = mean_absolute_error(y_test, y_test_prob)
    r2_test = r2_score(y_test, y_test_prob)


    results = pd.DataFrame({
        'Dataset': ['Train', 'Test'],
        'MAE': [mae_train, mae_test],
        'R2': [r2_train, r2_test],

    })

    return results


In [48]:
evaluate(best_model, X_train, y_train, X_test, y_test)

Epoch 1/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 111977.3594 - mae: 111977.3594 
Epoch 2/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 75533.2578 - mae: 75533.2578
Epoch 3/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 30441.3262 - mae: 30441.3262
Epoch 4/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 26107.2012 - mae: 26107.2012
Epoch 5/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 23671.2031 - mae: 23671.2031
Epoch 6/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 22040.9980 - mae: 22041.0000
Epoch 7/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 20948.3359 - mae: 20948.3359
Epoch 8/25
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 20541.1680 - mae: 20541.1680
Epoch 9/25
[

Unnamed: 0,Dataset,MAE,R2
0,Train,19558.092249,0.721467
1,Test,19756.640833,0.728332


#  Deployment Steps
- Final steps to prepare the pipeline for real-world predictions:

 New DataFrame Creation
- A new input DataFrame (df1) is defined with relevant features like age, gender, education, job title, and experience.

 Feature Engineering

- Added a new column: avg_experience_by_job, which stores the average experience for each job title.

- Added count_by_gender_edu, representing how many individuals share the same gender and education level.

 Dropping Unused Columns
- Removed Job Title as it has been encoded indirectly through aggregated features.

 Encoding Categorical Variables
- Used get_dummies with drop_first=True to convert categorical columns into numeric format.

 Feature Scaling
- Applied the pre-fitted MinMaxScaler to transform the new input data (df1) to the same scale as the training data.

 Prediction with Trained Model
- Used the saved ANN model (best_model) to predict salaries based on the scaled features.

 Output Formatting
- Created a new DataFrame (df_test_scaled) containing the scaled inputs and the model’s predicted salary for each individual.

In [74]:
df = {
    "Age": [25, 30, 28, 35, 22],
    "Gender": ["Male", "Female", "Female", "Other", "Female"],
    "Education Level": ["Bachelor's Degree", "Master's Degree", "PhD", "High School", "Master's Degree"],
    "Job Title": ["Data Analyst", "Software Engineer", "Data Scientist", "Sales Executive", "Director"],
    "Years of Experience": [2, 5, 4, 10, 1]
}

df1 = pd.DataFrame(df)

df1

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience
0,25,Male,Bachelor's Degree,Data Analyst,2
1,30,Female,Master's Degree,Software Engineer,5
2,28,Female,PhD,Data Scientist,4
3,35,Other,High School,Sales Executive,10
4,22,Female,Master's Degree,Director,1


In [76]:
df1.describe(include='all')

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience
count,5.0,5,5,5,5.0
unique,,3,4,5,
top,,Female,Master's Degree,Data Analyst,
freq,,3,2,1,
mean,28.0,,,,4.4
std,4.949747,,,,3.507136
min,22.0,,,,1.0
25%,25.0,,,,2.0
50%,28.0,,,,4.0
75%,30.0,,,,5.0


In [78]:
df1['avg_experience_by_job'] = df1.groupby('Job Title')['Years of Experience'].transform('mean')
df1['count_by_gender_edu'] = df1.groupby(['Gender', 'Education Level'])['Age'].transform('count')

In [80]:
df1

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,avg_experience_by_job,count_by_gender_edu
0,25,Male,Bachelor's Degree,Data Analyst,2,2.0,1
1,30,Female,Master's Degree,Software Engineer,5,5.0,2
2,28,Female,PhD,Data Scientist,4,4.0,1
3,35,Other,High School,Sales Executive,10,10.0,1
4,22,Female,Master's Degree,Director,1,1.0,2


In [84]:
df1.drop('Job Title',axis = 1,inplace = True)

In [88]:
df1= pd.get_dummies(df1 , drop_first = True,dtype = int)
df1

Unnamed: 0,Age,Years of Experience,avg_experience_by_job,count_by_gender_edu,Gender_Male,Gender_Other,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,25,2,2.0,1,1,0,0,0,0
1,30,5,5.0,2,0,0,0,1,0
2,28,4,4.0,1,0,0,0,0,1
3,35,10,10.0,1,0,1,1,0,0
4,22,1,1.0,2,0,0,0,1,0


In [90]:
scaler.fit(df1)

scaled = scaler.transform(df1)

df_test_scaled = pd.DataFrame(scaled, columns=df1.columns)

df_test_scaled

Unnamed: 0,Age,Years of Experience,avg_experience_by_job,count_by_gender_edu,Gender_Male,Gender_Other,Education Level_High School,Education Level_Master's Degree,Education Level_PhD
0,-0.677631,-0.765092,-0.765092,-0.816497,2.0,-0.5,-0.5,-0.816497,-0.5
1,0.451754,0.191273,0.191273,1.224745,-0.5,-0.5,-0.5,1.224745,-0.5
2,0.0,-0.127515,-0.127515,-0.816497,-0.5,-0.5,-0.5,-0.816497,2.0
3,1.581139,1.785215,1.785215,-0.816497,-0.5,2.0,2.0,-0.816497,-0.5
4,-1.355262,-1.08388,-1.08388,1.224745,-0.5,-0.5,-0.5,1.224745,-0.5


In [94]:
df1['salary'] = best_model.predict(df_test_scaled)


df1

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


Unnamed: 0,Age,Years of Experience,avg_experience_by_job,count_by_gender_edu,Gender_Male,Gender_Other,Education Level_High School,Education Level_Master's Degree,Education Level_PhD,salary
0,25,2,2.0,1,1,0,0,0,0,103902.523438
1,30,5,5.0,2,0,0,0,1,0,139553.515625
2,28,4,4.0,1,0,0,0,0,1,143504.4375
3,35,10,10.0,1,0,1,1,0,0,60795.523438
4,22,1,1.0,2,0,0,0,1,0,102052.0625
