## Creating a deep learning model for salary prediction 

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras import layers, models

In [4]:
# Importing the data 
path = "Data/data_after_feature_eng.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Company Size,Job Title,min_experience,max_experience,average_experience,min_salary,max_salary,average_salary,Level,Company_size,...,Qualifications_MCA,Qualifications_PhD,Work Type_Contract,Work Type_Full-Time,Work Type_Intern,Work Type_Part-Time,Work Type_Temporary,Preference_Both,Preference_Female,Preference_Male
0,48990,Digital Marketing Specialist,5,15,10.0,59000,99000,79000.0,1,1,...,False,False,False,False,True,False,False,False,True,False
1,340,Web Developer,2,12,7.0,56000,116000,86000.0,0,2,...,False,False,False,False,True,False,False,False,True,False
2,106713,Operations Manager,0,12,6.0,61000,104000,82500.0,2,0,...,False,True,False,False,False,False,True,False,False,True
3,30240,Network Engineer,4,11,7.5,65000,91000,78000.0,2,2,...,False,True,False,True,False,False,False,False,True,False
4,76132,Event Manager,1,12,6.5,64000,87000,75500.0,1,0,...,False,False,False,False,True,False,False,False,True,False


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1610462 entries, 0 to 1610461
Data columns (total 68 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Company Size           1610462 non-null  int64  
 1   Job Title              1610462 non-null  object 
 2   min_experience         1610462 non-null  int64  
 3   max_experience         1610462 non-null  int64  
 4   average_experience     1610462 non-null  float64
 5   min_salary             1610462 non-null  int64  
 6   max_salary             1610462 non-null  int64  
 7   average_salary         1610462 non-null  float64
 8   Level                  1610462 non-null  int64  
 9   Company_size           1610462 non-null  int64  
 10  feature_1              1610462 non-null  float64
 11  feature_2              1610462 non-null  float64
 12  feature_3              1610462 non-null  float64
 13  feature_4              1610462 non-null  float64
 14  feature_5         

In [5]:
# Drop unnecessary columns
df = df.drop(columns=['min_salary', 'max_salary', 'Company_size'])

### Handling Categorical Variables

In [6]:
# Job Title is the only categorical feature that needs one-hot encoding
categorical_features = ['Job Title']
numeric_features = df.columns[df.columns.str.startswith('feature_')].tolist() + [
    'min_experience', 'max_experience', 'average_experience', 'Company Size', 'Level'
]

In [7]:
# Handling Numerical Variables and Creating Preprocessing Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

### Splitting Data

In [8]:
X = df.drop(columns=['average_salary'])
y = df['average_salary']

In [9]:
X_preprocessed = preprocessor.fit_transform(X)

In [15]:
# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2)

### Model Building

In [42]:
# Build the neural network model
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1)  # Single output for regression
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [43]:
# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [44]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, validation_split=0.2, batch_size=1024)

Epoch 1/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 82ms/step - loss: 977984576.0000 - mae: 18226.4551 - val_loss: 58041080.0000 - val_mae: 6530.4712
Epoch 2/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 88ms/step - loss: 57982876.0000 - mae: 6526.6133 - val_loss: 57768116.0000 - val_mae: 6518.2285
Epoch 3/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 95ms/step - loss: 57869552.0000 - mae: 6515.0156 - val_loss: 60295360.0000 - val_mae: 6619.5435
Epoch 4/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 97ms/step - loss: 57911552.0000 - mae: 6516.3174 - val_loss: 57918528.0000 - val_mae: 6526.0029
Epoch 5/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 95ms/step - loss: 58019920.0000 - mae: 6523.4565 - val_loss: 58122068.0000 - val_mae: 6535.0913


In [45]:
# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-Squared
r2 = r2_score(y_test, y_pred)
print(f"R-Squared (R²): {r2}")

[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 4ms/step - loss: 57902408.0000 - mae: 6520.3755
Test MAE: 6518.67578125
[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4ms/step
Mean Squared Error (MSE): 57921278.435510315
Root Mean Squared Error (RMSE): 7610.603027061017
R-Squared (R²): -0.024338994539614633


In [47]:
# Build the neural network model_2
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [46]:
history = model.fit(X_train, y_train, epochs=5, validation_split=0.2, batch_size=1024)

Epoch 1/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 93ms/step - loss: 58171020.0000 - mae: 6528.1011 - val_loss: 60102476.0000 - val_mae: 6612.6299
Epoch 2/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 113ms/step - loss: 58087672.0000 - mae: 6529.1738 - val_loss: 57684116.0000 - val_mae: 6515.7339
Epoch 3/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 113ms/step - loss: 57995584.0000 - mae: 6523.6289 - val_loss: 58215852.0000 - val_mae: 6535.2710
Epoch 4/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 94ms/step - loss: 57911000.0000 - mae: 6517.8945 - val_loss: 58086216.0000 - val_mae: 6533.4834
Epoch 5/5
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 100ms/step - loss: 57831652.0000 - mae: 6513.2549 - val_loss: 57819208.0000 - val_mae: 6521.5483


In [49]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [50]:
# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-Squared
r2 = r2_score(y_test, y_pred)
print(f"R-Squared (R²): {r2}")

[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 7ms/step - loss: 6865763840.0000 - mae: 82517.7734
Test MAE: 82496.875
[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 5ms/step
Mean Squared Error (MSE): 6862288089.032071
Root Mean Squared Error (RMSE): 82838.92858452523
R-Squared (R²): -120.35970529702196


In [61]:
# Build the neural network model_2
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(1)  
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [63]:
history = model.fit(X_train, y_train, epochs=7, validation_split=0.2, batch_size=1024)

Epoch 1/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - loss: 4041977088.0000 - mae: 55141.4961 - val_loss: 79843984.0000 - val_mae: 7378.1045
Epoch 2/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - loss: 72604776.0000 - mae: 7096.4946 - val_loss: 61214812.0000 - val_mae: 6653.3408
Epoch 3/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - loss: 60110144.0000 - mae: 6607.8193 - val_loss: 58142384.0000 - val_mae: 6534.3335
Epoch 4/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - loss: 57774040.0000 - mae: 6513.9980 - val_loss: 57425600.0000 - val_mae: 6507.0176
Epoch 5/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - loss: 57198192.0000 - mae: 6490.5103 - val_loss: 57215880.0000 - val_mae: 6498.4062
Epoch 6/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 40ms/step - loss: 56998884.0000 - mae: 6480

In [62]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [64]:
# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-Squared
r2 = r2_score(y_test, y_pred)
print(f"R-Squared (R²): {r2}")

[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - loss: 56936864.0000 - mae: 6479.9248
Test MAE: 6477.03076171875
[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step
Mean Squared Error (MSE): 56905464.829637855
Root Mean Squared Error (RMSE): 7543.571092634963
R-Squared (R²): -0.0063743101302815575


In [65]:
import math 

# Custom quadratic activation function
def quadratic_activation(x):
    return tf.math.square(x)

# Register the custom activation function so that it can be used with Keras models
tf.keras.utils.get_custom_objects().update({'quadratic_activation': layers.Activation(quadratic_activation)})

# Build the neural network model with a quadratic output layer
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(1, activation=quadratic_activation)  # Custom quadratic activation function
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Print the model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [66]:
history = model.fit(X_train, y_train, epochs=7, validation_split=0.2, batch_size=1024)

Epoch 1/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 38ms/step - loss: 1423126784.0000 - mae: 23841.0391 - val_loss: 58777284.0000 - val_mae: 6558.4883
Epoch 2/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 37ms/step - loss: 58368492.0000 - mae: 6536.9033 - val_loss: 58535816.0000 - val_mae: 6546.7798
Epoch 3/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 37ms/step - loss: 58288096.0000 - mae: 6532.2832 - val_loss: 59706804.0000 - val_mae: 6593.3276
Epoch 4/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - loss: 58506628.0000 - mae: 6541.1636 - val_loss: 58467860.0000 - val_mae: 6545.6772
Epoch 5/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - loss: 58434436.0000 - mae: 6535.5137 - val_loss: 58505912.0000 - val_mae: 6548.6470
Epoch 6/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - loss: 58570888.0000 - mae: 6545

In [67]:
# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-Squared
r2 = r2_score(y_test, y_pred)
print(f"R-Squared (R²): {r2}")

[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 58373912.0000 - mae: 6538.5522
Test MAE: 6535.275390625
[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step
Mean Squared Error (MSE): 58331051.22393701
Root Mean Squared Error (RMSE): 7637.476757669186
R-Squared (R²): -0.031585834689285


In [68]:
import math 

# Custom quadratic activation function
def quadratic_activation(x):
    return tf.math.square(x)

# Register the custom activation function so that it can be used with Keras models
tf.keras.utils.get_custom_objects().update({'quadratic_activation': layers.Activation(quadratic_activation)})

# Build the neural network model with a quadratic output layer
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(1, activation=quadratic_activation)  # Custom quadratic activation function
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Print the model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [69]:
history = model.fit(X_train, y_train, epochs=7, validation_split=0.2, batch_size=1024)

Epoch 1/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 32ms/step - loss: 4201751552.0000 - mae: 56224.8086 - val_loss: 73488416.0000 - val_mae: 7125.0347
Epoch 2/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 32ms/step - loss: 66532076.0000 - mae: 6859.6108 - val_loss: 58941100.0000 - val_mae: 6564.7524
Epoch 3/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33ms/step - loss: 58318056.0000 - mae: 6535.7568 - val_loss: 57758460.0000 - val_mae: 6520.2939
Epoch 4/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33ms/step - loss: 57493484.0000 - mae: 6503.6807 - val_loss: 57437472.0000 - val_mae: 6506.4478
Epoch 5/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 33ms/step - loss: 57333852.0000 - mae: 6498.0752 - val_loss: 57512580.0000 - val_mae: 6508.8330
Epoch 6/7
[1m1007/1007[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 36ms/step - loss: 57287496.0000 - mae: 6496

In [70]:
# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-Squared
r2 = r2_score(y_test, y_pred)
print(f"R-Squared (R²): {r2}")

[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - loss: 57136916.0000 - mae: 6490.4751
Test MAE: 6486.2109375
[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step
Mean Squared Error (MSE): 57109248.87607326
Root Mean Squared Error (RMSE): 7557.066155332588
R-Squared (R²): -0.009978235162103388


In [16]:

from tensorflow.keras.optimizers import Adam

# Custom quadratic activation function
def quadratic_activation(x):
    return tf.math.square(x)

# Register the custom activation function so that it can be used with Keras models
tf.keras.utils.get_custom_objects().update({'quadratic_activation': layers.Activation(quadratic_activation)})

# Build the neural network model with a quadratic output layer
model = models.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(512, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1024, activation='relu'),
    layers.Dense(1, activation=quadratic_activation)  # Custom quadratic activation function
])

# Compile the model


# Example: Reduce learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='mse', metrics=['mae'])


# Print the model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
# Train for more epochs
history = model.fit(X_train, y_train, epochs=5, validation_split=0.2, batch_size=2048)

Epoch 1/5
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 362ms/step - loss: 2458944000.0000 - mae: 38436.5352 - val_loss: 115412208.0000 - val_mae: 8625.6738
Epoch 2/5
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 879ms/step - loss: 92042504.0000 - mae: 7806.9277 - val_loss: 63678028.0000 - val_mae: 6751.6753
Epoch 3/5
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 243ms/step - loss: 61755804.0000 - mae: 6667.9126 - val_loss: 59560956.0000 - val_mae: 6588.8887
Epoch 4/5
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 162ms/step - loss: 59021152.0000 - mae: 6563.3027 - val_loss: 58569560.0000 - val_mae: 6548.7417
Epoch 5/5
[1m504/504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 152ms/step - loss: 58248260.0000 - mae: 6534.5151 - val_loss: 58060668.0000 - val_mae: 6528.5205


In [19]:

test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {test_mae}")

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-Squared
r2 = r2_score(y_test, y_pred)
print(f"R-Squared (R²): {r2}")

[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 4ms/step - loss: 58011980.0000 - mae: 6525.9419
Test MAE: 6518.4873046875
[1m10066/10066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 4ms/step
Mean Squared Error (MSE): 57908867.622697994
Root Mean Squared Error (RMSE): 7609.7876200783685
R-Squared (R²): -0.02354671165317579


In [33]:
import pickle

In [34]:
import pickle

# Assuming 'model' is your trained model
with open('Data/model_salary.pkl', 'wb') as file:
    pickle.dump(model, file)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1610462 entries, 0 to 1610461
Data columns (total 65 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Company Size           1610462 non-null  int64  
 1   Job Title              1610462 non-null  object 
 2   min_experience         1610462 non-null  int64  
 3   max_experience         1610462 non-null  int64  
 4   average_experience     1610462 non-null  float64
 5   average_salary         1610462 non-null  float64
 6   Level                  1610462 non-null  int64  
 7   feature_1              1610462 non-null  float64
 8   feature_2              1610462 non-null  float64
 9   feature_3              1610462 non-null  float64
 10  feature_4              1610462 non-null  float64
 11  feature_5              1610462 non-null  float64
 12  feature_6              1610462 non-null  float64
 13  feature_7              1610462 non-null  float64
 14  feature_8         

In [21]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [22]:
X = df.drop('average_salary', axis=1)  
y = df['average_salary'] 

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
import pandas as pd

# Assuming 'df' is your original dataframe
job_title_encoded = pd.get_dummies(df['Job Title'], prefix='Job_Title')

# Display the first few rows of the encoded 'Job Title' columns
print(job_title_encoded.head())

# If you want to combine this with your original dataframe
df_encoded = pd.concat([df, job_title_encoded], axis=1)

# Drop the original 'Job Title' column, as it's now encoded
df_encoded.drop('Job Title', axis=1, inplace=True)

# Display the first few rows of the updated dataframe
print(df_encoded.head())


   Job_Title_Account Director  Job_Title_Account Executive  \
0                       False                        False   
1                       False                        False   
2                       False                        False   
3                       False                        False   
4                       False                        False   

   Job_Title_Account Manager  Job_Title_Accountant  \
0                      False                 False   
1                      False                 False   
2                      False                 False   
3                      False                 False   
4                      False                 False   

   Job_Title_Administrative Assistant  Job_Title_Aerospace Engineer  \
0                               False                         False   
1                               False                         False   
2                               False                         False   
3                 

In [29]:
X = df_encoded.drop('average_salary', axis=1)  
y = df_encoded['average_salary'] 

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [31]:
from sklearn.linear_model import LinearRegression

# Initialize the linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

In [32]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Make predictions on the training or test data
y_pred = model.predict(X_test)

# Calculate R² score
r2 = r2_score(y_test, y_pred)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

print(f'R² score: {r2}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')


R² score: -0.0001589867462892247
Mean Absolute Error (MAE): 6476.823481633577
Mean Squared Error (MSE): 56690086.59440579
Root Mean Squared Error (RMSE): 7529.281944143531
