
# Employee Salary Prediction using Linear Regression (Large Dataset)

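This notebook generates a synthetic employee dataset (500 rows by default, controlled by `num_samples` in Step 2) and builds a salary prediction model using Linear Regression on experience, education, job title, and location.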


In [None]:

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

# Step 2: Generate Large Synthetic Dataset
# Seed NumPy's global RNG so a fresh "Restart & Run All" reproduces the same data.
np.random.seed(42)
num_samples = 500

# Draw the feature columns. The order of these calls fixes which random
# numbers each column receives, so do not reorder them.
experience = np.random.randint(0, 21, num_samples)  # whole years, 0-20 (upper bound is exclusive)
education_levels = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], num_samples)
job_titles = np.random.choice(['Data Analyst', 'Software Engineer', 'Data Scientist', 'Manager', 'HR'], num_samples)
locations = np.random.choice(['Bangalore', 'Delhi', 'Mumbai', 'Hyderabad', 'Chennai'], num_samples)

# Salary generation logic:
#   salary = 25000 + 3000 * experience + education bonus + job bonus + location bonus + noise
base_salary = 25000 + (experience * 3000)
education_bonus = {'High School': 0, 'Bachelor': 10000, 'Master': 20000, 'PhD': 30000}
job_bonus = {'Data Analyst': 5000, 'Software Engineer': 15000, 'Data Scientist': 25000, 'Manager': 20000, 'HR': 10000}
location_bonus = {'Bangalore': 5000, 'Delhi': 3000, 'Mumbai': 4000, 'Hyderabad': 3500, 'Chennai': 3200}

# Translate every row's category label into its bonus amount (one array per feature).
education_component = np.array([education_bonus[level] for level in education_levels])
job_component = np.array([job_bonus[title] for title in job_titles])
location_component = np.array([location_bonus[city] for city in locations])

# Gaussian noise (mean 0, std 5000), one value per row. Drawing them in a
# single call walks the seeded RNG stream in the same order as one draw per row.
noise = np.random.normal(0, 5000, num_samples)

# Sum the integer parts first, then add the float noise, and truncate toward
# zero (what int() does). .tolist() keeps `salaries` a plain list of Python ints.
salaries = (
    base_salary + education_component + job_component + location_component + noise
).astype(int).tolist()

df = pd.DataFrame({
    'Experience': experience,
    'Education': education_levels,
    'Job Title': job_titles,
    'Location': locations,
    'Salary': salaries
})

df.head()


In [None]:

# Step 3: Data Preprocessing
# Separate the target column from the predictor columns.
y = df['Salary']
X = df.drop(columns='Salary')

# Text-valued predictors that have to be turned into numbers before fitting.
categorical_cols = ['Education', 'Job Title', 'Location']

# One-hot encode the categorical columns. drop='first' omits the first category
# of each feature, so that category becomes the implicit baseline.
# remainder='passthrough' keeps every other column (here: Experience) as-is.
one_hot = OneHotEncoder(drop='first')
ct = ColumnTransformer(
    transformers=[('encoder', one_hot, categorical_cols)],
    remainder='passthrough',
)

X_encoded = ct.fit_transform(X)


In [None]:

# Step 4: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Step 5: Train the Model
# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

# Leave the fitted estimator as the last expression so the cell still displays it.
model


In [None]:

# Step 6: Predict and Evaluate
# Score the model on the held-out rows only.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R² Score:", r2)
print("Mean Squared Error:", mse)


In [None]:

# Step 7: Visualization
# Scatter of predicted vs. actual salary on the test set; points on the dashed
# red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test, y_pred, color='blue', alpha=0.6)

# Endpoints of the y = x reference line, spanning the observed actual salaries.
salary_min, salary_max = min(y_test), max(y_test)
ax.plot([salary_min, salary_max], [salary_min, salary_max], color='red', linestyle='--')

ax.set_xlabel('Actual Salary')
ax.set_ylabel('Predicted Salary')
ax.set_title('Actual vs Predicted Salary')
ax.grid(True)
plt.show()
