# Employee Salary Prediction - Final Project

This notebook contains the full workflow for predicting employee salaries using machine learning algorithms.

In [None]:
# 📦 Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# 📂 Read the employee salary CSV (path is relative to the working directory)
# into a DataFrame, then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="employee_salary_data.csv")
df.head(n=5)

## 🔍 Step 1: Exploratory Data Analysis

In [None]:
# Structural overview: df.info() prints its report directly, so it needs no
# explicit display call.
df.info()

# A notebook cell only renders its *last* bare expression. These summaries sit
# in the middle of the cell, so without display() their output is silently
# discarded. `display` is provided as a builtin by the IPython/Jupyter kernel.
display(df.describe())
display(df.isnull().sum())

# Visualizing categorical features
fig, ax = plt.subplots()
sns.countplot(data=df, x='Education', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title("Education Level Distribution")
plt.show()

# Salary distribution
fig, ax = plt.subplots()
sns.histplot(df['Salary'], kde=True, ax=ax)
ax.set_title("Salary Distribution")
plt.show()

## 🧹 Step 2: Data Preprocessing

In [None]:
# Encoding categorical variables
categorical_columns = ['Gender', 'Education', 'Job Title']

# Work on a cleaned copy instead of mutating `df` in place:
#   * re-running this cell stays idempotent (the encoders are always fitted on
#     the original string labels, never on already-encoded integers, so the
#     label -> code lookups used for user input keep working);
#   * earlier cells that displayed `df` are not left showing stale data.
# Rows with missing values are dropped because the linear model cannot be
# fitted on NaN features/targets; this is a no-op when the data is complete.
df_encoded = df.dropna().copy()

# One fitted encoder per categorical column, kept so that raw user input can
# be encoded with exactly the same mapping at prediction time.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# Feature and target selection: every column except 'Salary' is a feature.
X = df_encoded.drop(columns='Salary')
y = df_encoded['Salary']

# Splitting dataset (80% train / 20% test, seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: fit on the training split only, then apply the same transform to
# the test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 🤖 Step 3: Model Training and Evaluation

In [None]:
# Linear Regression — trained and evaluated on the standardized features.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
lr_preds = lr.predict(X_test_scaled)

# Decision Tree — trained on the unscaled features.
# random_state is fixed (matching the seed used for the train/test split) so
# the reported scores are reproducible across Restart & Run All.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)

# Random Forest — trained on the unscaled features.
# Bootstrap sampling and feature selection are random, so the seed is fixed
# here as well; otherwise every run yields a slightly different model.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Evaluation Function
def evaluate(model_name, y_true, y_pred):
    """Print the R² score and mean squared error for one set of predictions.

    Args:
        model_name: Label placed at the start of the printed line.
        y_true: Ground-truth target values.
        y_pred: Predicted target values compared against ``y_true``.

    Returns:
        None. Both metrics are only printed, formatted to two decimals.
    """
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    print(f"{model_name} - R² Score: {r2:.2f}, MSE: {mse:.2f}")

# Report every model against the same held-out targets, in training order.
for label, predictions in (
    ("Linear Regression", lr_preds),
    ("Decision Tree", dt_preds),
    ("Random Forest", rf_preds),
):
    evaluate(label, y_test, predictions)

## 📈 Step 4: Final Model & User Input Prediction

In [None]:
# Use the Random Forest as the final model.
# NOTE(review): this choice is hard-coded, not derived from the scores printed
# in Step 3 — confirm it is still the best performer after re-running.
final_model = rf


def _encode_category(column, value):
    """Encode one raw category label with the encoder fitted for ``column``.

    Args:
        column: Name of a categorical column present in ``label_encoders``.
        value: Raw label exactly as it appears in the training data.

    Returns:
        The integer code the fitted encoder assigns to ``value``.

    Raises:
        ValueError: If ``value`` was not seen when the encoder was fitted; the
            message lists the known labels so typos are easy to spot.
    """
    encoder = label_encoders[column]
    known_labels = list(encoder.classes_)
    if value not in known_labels:
        raise ValueError(
            f"Unknown {column} value {value!r}; expected one of {known_labels}"
        )
    return encoder.transform([value])[0]


# Sample user input
# NOTE(review): 'Master’s' uses a typographic apostrophe (U+2019); confirm the
# dataset spells the label the same way, otherwise encoding raises ValueError.
sample_data = {
    'Gender': _encode_category('Gender', 'Male'),
    'Education': _encode_category('Education', 'Master’s'),
    'Job Title': _encode_category('Job Title', 'Data Scientist'),
    'Years of Experience': 5
}

# Align the single-row frame with the exact feature columns (and order) the
# model was trained on; scikit-learn rejects mismatched or reordered names.
sample_df = pd.DataFrame([sample_data]).reindex(columns=X_train.columns)

# Any training feature absent from sample_data shows up as NaN after reindex;
# fail loudly instead of predicting from an incomplete row.
missing_features = sample_df.columns[sample_df.isna().any()].tolist()
if missing_features:
    raise ValueError(f"sample_data is missing value(s) for: {missing_features}")

predicted_salary = final_model.predict(sample_df)[0]
print(f"Predicted Salary: ₹{predicted_salary:,.2f}")