In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Read the raw employee records from the CSV file.
file_path = "Employee_Salary_Dataset.csv"  # Replace with the actual path
data = pd.read_csv(file_path)

# Discard the 'ID' column; it is not used as a feature.
data = data.drop("ID", axis=1)

# Fit a one-hot encoder on 'Gender'. drop="first" omits the first category's
# column, and sparse_output=False makes transform() return a dense array.
encoder = OneHotEncoder(sparse_output=False, drop="first").fit(data[["Gender"]])
gender_encoded = encoder.transform(data[["Gender"]])
gender_encoded_df = pd.DataFrame(
    data=gender_encoded,
    columns=encoder.get_feature_names_out(["Gender"]),
)

# Swap the raw 'Gender' column for its encoded counterpart.
data_encoded = pd.concat(
    [data.drop("Gender", axis=1), gender_encoded_df],
    axis="columns",
)

# 'Salary' is the regression target; every remaining column is a feature.
y = data_encoded["Salary"]
X = data_encoded.drop("Salary", axis=1)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a Random Forest regressor on the training portion.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)

# Report mean absolute error and R-squared on the held-out rows.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Persist the fitted model and encoder so they can be reloaded for deployment.
joblib.dump(model, "employee_salary_model.pkl")
joblib.dump(encoder, "gender_encoder.pkl")

print("Model and encoder saved successfully.")
# Predefined list of new employees to score with the fitted model.
new_employees = pd.DataFrame({
    "Experience_Years": [3, 10],
    "Age": [25, 45],
    "Gender": ["Male", "Female"]
})

# Preprocess: encode 'Gender' with the already-fitted encoder (transform only,
# so the category-to-column mapping is the one learned during training).
gender_encoded = encoder.transform(new_employees[["Gender"]])
# Reuse the input frame's index: pd.concat(axis=1) aligns rows by index label,
# so a fresh default RangeIndex would misalign (and introduce NaN rows) for
# any input frame whose index is not exactly 0..n-1.
gender_encoded_df = pd.DataFrame(
    gender_encoded,
    columns=encoder.get_feature_names_out(["Gender"]),
    index=new_employees.index,
)
new_employees_encoded = pd.concat(
    [new_employees.drop(columns=["Gender"]), gender_encoded_df], axis=1
)

# scikit-learn requires the feature columns to appear in the same order as at
# fit time. When the model recorded its feature names (it was fitted on a
# DataFrame with string column names), reorder explicitly rather than relying
# on the literal above matching the CSV's column order. Selecting by label
# raises KeyError if an expected feature is missing.
feature_order = getattr(model, "feature_names_in_", None)
if feature_order is not None:
    new_employees_encoded = new_employees_encoded[list(feature_order)]

# Predict
predicted_salaries = model.predict(new_employees_encoded)

# Display results alongside the original (un-encoded) inputs.
new_employees["Predicted_Salary"] = predicted_salaries
print(new_employees)




Mean Absolute Error: 402659.5714285714
R-squared: -38.166702312005455
Model and encoder saved successfully.
   Experience_Years  Age  Gender  Predicted_Salary
0                 3   25    Male      2.325500e+05
1                10   45  Female      3.583432e+06
