# In [None]:
# ASS14. Develop a Linear Regression model to estimate IT professionals’ salaries based on experience,
# education, and skills. Evaluate performance using 5-Fold Cross-Validation.

# ================================================================
# IT Professionals' Salary Prediction using Linear Regression
# Features: Experience, Education, Skills
# Validation: 5-Fold Cross Validation
# ================================================================

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder

# ---------------------------------------------------------------
# 1. Load Dataset
# ---------------------------------------------------------------
df = pd.read_excel("salary.xlsx")
print("Dataset Loaded Successfully! Shape:", df.shape)
print(df.head())

# ---------------------------------------------------------------
# 2. Data Preprocessing
# ---------------------------------------------------------------
# Remove every row that has a missing value in any column.
df = df.dropna()

# Encode categorical columns (Gender, Education Level, Job Title).
# Each column gets its OWN fitted LabelEncoder, stored in `encoders`, so the
# label -> integer mapping learned here can be reused at prediction time.
# Re-fitting one shared encoder would throw the earlier mappings away.
# NOTE(review): integer codes carry no real ordering for nominal fields such
# as Job Title, yet LinearRegression treats them as magnitudes. One-hot
# encoding would be the more appropriate representation; left unchanged here
# because it would alter the reported results.
CATEGORICAL_COLUMNS = ["Gender", "Education Level", "Job Title"]
encoders = {}
for col in CATEGORICAL_COLUMNS:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Kept so that any code outside this view that still references `le` resolves.
le = encoders["Job Title"]


def encode_category(column: str, value: str) -> int:
    """Return the integer code that the fitted encoder for *column* gives *value*.

    Args:
        column: Name of a column listed in ``CATEGORICAL_COLUMNS``.
        value: Raw (un-encoded) label to translate.

    Returns:
        The integer code assigned to *value* when the column was encoded.

    Raises:
        KeyError: If no encoder was fitted for *column*.
        ValueError: If *value* was not among the labels seen during fitting.
    """
    encoder = encoders[column]
    if value not in encoder.classes_:
        raise ValueError(
            f"Unknown {column!r} value {value!r}; "
            f"known values: {list(encoder.classes_)}"
        )
    return int(encoder.transform([value])[0])


# ---------------------------------------------------------------
# 3. Define Features (X) and Target (y)
# ---------------------------------------------------------------
# We'll use Experience, Education, and Job Title (skill proxy) as predictors
X = df[["Years of Experience", "Education Level", "Job Title"]]
y = df["Salary"]

# ---------------------------------------------------------------
# 4. Train-Test Split
# ---------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------
# 5. Train Linear Regression Model
# ---------------------------------------------------------------
model = LinearRegression()
model.fit(X_train, y_train)

# ---------------------------------------------------------------
# 6. Predict and Evaluate on Test Data
# ---------------------------------------------------------------
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\n--- Model Evaluation (Single Train-Test Split) ---")
print(f"R² Score : {r2:.4f}")
print(f"MAE      : {mae:.2f}")
print(f"RMSE     : {rmse:.2f}")

# ---------------------------------------------------------------
# 7. 5-Fold Cross Validation
# ---------------------------------------------------------------
# One cross_validate call scores all three metrics on the same five folds,
# so each fold's model is fitted once instead of once per metric.
# The estimator is cloned per fold, so `model` itself stays fitted on the
# training split from step 5.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    model,
    X,
    y,
    cv=kf,
    scoring={
        "r2": "r2",
        "mae": "neg_mean_absolute_error",
        "mse": "neg_mean_squared_error",
    },
)
cv_r2 = cv_results["test_r2"]
# The error scorers are negated by scikit-learn; flip the sign back.
cv_mae = -cv_results["test_mae"]
cv_rmse = np.sqrt(-cv_results["test_mse"])

print("\n--- 5-Fold Cross Validation Results ---")
print("R² per fold:", np.round(cv_r2, 4))
print("MAE per fold:", np.round(cv_mae, 2))
print("RMSE per fold:", np.round(cv_rmse, 2))

print("\nAverage CV Results:")
print(f"Average R²  : {np.mean(cv_r2):.4f}")
print(f"Average MAE : {np.mean(cv_mae):.2f}")
print(f"Average RMSE: {np.mean(cv_rmse):.2f}")

# ---------------------------------------------------------------
# 8. Visualization — Actual vs Predicted Salaries
# ---------------------------------------------------------------
plt.figure(figsize=(6, 5))
plt.scatter(y_test, y_pred, color='blue')
# Dashed diagonal marks where predicted salary equals actual salary.
salary_range = [y_test.min(), y_test.max()]
plt.plot(salary_range, salary_range, 'r--')
plt.title("Actual vs Predicted Salaries (IT Professionals)")
plt.xlabel("Actual Salary")
plt.ylabel("Predicted Salary")
plt.grid(True)
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------
# 9. Forecast Example
# ---------------------------------------------------------------
# Translate the raw labels with the encoders fitted in step 2 so the codes
# match the ones the model was trained on.
new_data = pd.DataFrame({
    "Years of Experience": [10],
    "Education Level": [encode_category("Education Level", "Master's")],
    "Job Title": [encode_category("Job Title", "Software Engineer")],
})
forecast = model.predict(new_data)
print("\nPredicted Salary for 10 years experience (Master’s, Software Engineer):", round(forecast[0], 2))