In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [None]:
# Read the salary data from a CSV file in the working directory.
salary_csv = "Salary_dataset.csv"
df = pd.read_csv(salary_csv)

In [None]:
# Preview the first five rows (five is also the default).
df.head(n=5)

In [None]:
# Print the frame's column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum()

Handle Categorical Features

In [None]:
# One-hot encode any categorical columns, dropping the first level of each;
# a frame with no categorical columns passes through unchanged.
df = pd.get_dummies(data=df, drop_first=True)


Features & Target

In [None]:
# Feature matrices. Both currently contain the same single column,
# 'YearsExperience' (an earlier draft referenced 'Experience' and a
# non-existent 'TestScore'), so the "multi" and "single" models trained
# from them see identical inputs.
# NOTE(review): if a genuine multi-feature comparison is intended, extend
# X_multi with additional columns — confirm which ones the dataset provides.
feature_cols = ['YearsExperience']
X_multi = df[feature_cols]
X_single = df[feature_cols]

# Regression target.
y = df['Salary']

Train/Test Split


In [None]:
# Split both feature sets and the target in ONE call so that every array is
# partitioned with the same shuffled row indices (80% train / 20% test).
# This replaces two separate calls that relied on repeating the same seed to
# stay aligned and that discarded the duplicate y splits into `_`, which
# shadows IPython's "last output" variable inside a notebook.
(
    X_train_multi, X_test_multi,
    X_train_single, X_test_single,
    y_train, y_test,
) = train_test_split(X_multi, X_single, y, test_size=0.2, random_state=42)


Train Linear Regression Models

In [None]:
# Fit one ordinary-least-squares model per feature set; `fit` returns the
# estimator itself, so construction and training can be chained.
multi_model = LinearRegression().fit(X_train_multi, y_train)
single_model = LinearRegression().fit(X_train_single, y_train)

# Keep the fitted single-feature estimator as the cell's displayed output.
single_model


Evaluate Models

In [None]:
def _rmse_and_r2(y_true, y_hat):
    """Return ``(RMSE, R²)`` for predictions ``y_hat`` against ``y_true``."""
    return np.sqrt(mean_squared_error(y_true, y_hat)), r2_score(y_true, y_hat)


# Score each fitted model on its held-out test split.
y_pred_multi = multi_model.predict(X_test_multi)
rmse_multi, r2_multi = _rmse_and_r2(y_test, y_pred_multi)

y_pred_single = single_model.predict(X_test_single)
rmse_single, r2_single = _rmse_and_r2(y_test, y_pred_single)

# Report both models side by side.
print(f"Multiple Feature Model -> RMSE: {rmse_multi:.2f}, R²: {r2_multi:.2f}")
print(f"Single Feature Model -> RMSE: {rmse_single:.2f}, R²: {r2_single:.2f}")


Compare and Save Best Model

In [None]:
# Pick the model with the strictly higher test R²; a tie goes to the
# single-feature model.
multi_wins = r2_multi > r2_single
best_model = multi_model if multi_wins else single_model
print("Best Model: Multiple Feature Model" if multi_wins else "Best Model: Single Feature Model")

# Persist the selected estimator to disk in the working directory.
joblib.dump(best_model, "BestSalaryModel.pkl")
