In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import PolynomialFeatures


In [None]:
# Read the salary dataset directly from the raw GitHub URL (needs network access)
# and preview the first five rows.
url = 'https://raw.githubusercontent.com/Swathi-2003/Employee-salary-prediction/main/Salary.csv'
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)


In [None]:
# Schema check: column names, dtypes and non-null counts for the loaded frame.
df.info()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


In [None]:
# Number of missing values in each column.
df.isna().sum()


In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()


In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding `df` instead of using inplace=True avoids mutating the object that
# earlier cells already displayed; the original row index labels are kept.
df = df.drop_duplicates()

# (rows, columns) after de-duplication.
df.shape


In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# numeric_only=True keeps the cell working if the CSV contains text columns:
# pandas >= 2.0 raises on non-numeric data instead of silently skipping it.
# For an all-numeric frame the result is identical to df.corr().
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


In [None]:
# Grid of pairwise scatter plots (with per-column distributions on the diagonal)
# for a visual check of the relationships between columns.
sns.pairplot(data=df)
plt.show()


In [None]:
# Baseline: ordinary least squares with years of experience as the only predictor.
X = df[['YearsExperience']]  # double brackets keep a 2-D frame, as the estimator expects
y = df['Salary']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out error (mean squared error) and coefficient of determination.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'MSE: {mse:.2f}', f'R2 Score: {r2:.2f}', sep='\n')


In [None]:
# Held-out points (blue) against the fitted regression line (red).
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
ax.set(
    title='Actual vs Predicted Salary',
    xlabel='Years of Experience',
    ylabel='Salary',
)
ax.legend()
plt.show()


In [None]:
# 5-fold cross-validated R² of the linear model over the full dataset.
# Passing a bare integer as `cv` makes scikit-learn split a regressor's data into
# contiguous, unshuffled folds. If the rows are ordered (e.g. sorted by
# experience — TODO confirm against the CSV), each fold is then scored on a
# range the model never trained on and R² is badly distorted. Shuffling with a
# fixed seed keeps the folds representative and reproducible, consistent with
# the shuffled hold-out split (random_state=42).
cv_folds = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')
print("Cross-Validation R² scores:", scores)
print("Average R²:", scores.mean())


In [None]:
# Residuals (actual minus predicted) on the held-out set. A patternless band
# around the dashed zero line suggests the linear fit is adequate.
residuals = y_test - y_pred

fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set(
    xlabel="Predicted Salary",
    ylabel="Residuals",
    title="Residual Plot",
)
plt.show()


In [None]:
# Degree-2 polynomial regression on the same single feature, for comparison
# with the straight-line fit.
#
# Split the raw feature first, then fit the feature expansion on the training
# rows only. PolynomialFeatures learns nothing from the data values, and the
# seed and test_size match the original split, so the rows and the score should
# be unchanged. The ordering matters once a data-dependent step (e.g. a scaler)
# is added: fitting it before the split would leak test information.
X_train_raw, X_test_raw, y_train_p, y_test_p = train_test_split(
    X, y, test_size=0.2, random_state=42
)

poly = PolynomialFeatures(degree=2)
X_train_p = poly.fit_transform(X_train_raw)
X_test_p = poly.transform(X_test_raw)

# Expanded matrix for the full dataset, kept for any cell that still refers to it.
X_poly = poly.transform(X)

poly_model = LinearRegression()
poly_model.fit(X_train_p, y_train_p)

y_poly_pred = poly_model.predict(X_test_p)

print("Polynomial Regression R2 Score:", r2_score(y_test_p, y_poly_pred))


In [None]:
# Persist the fitted linear model (not the polynomial one) to the working directory.
# The file is pickle-based: only load it back from a source you trust.
joblib.dump(value=model, filename="salary_prediction_model.pkl")
