In [4]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import streamlit as st

In [9]:
#1. Choose a Dataset
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Update the path to your CSV file
# Make sure the file 'kaggle datasets.csv' is in the root of your Google Drive
# Load the datasets
dataset = pd.read_csv('/content/drive/MyDrive/Employers_data.csv')
dataset.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Employee_ID,Name,Age,Gender,Department,Job_Title,Experience_Years,Education_Level,Location,Salary
0,1,Merle Ingram,24,Female,Engineering,Engineer,1,Master,Austin,90000
1,2,John Mayes,56,Male,Sales,Executive,33,Master,Seattle,195000
2,3,Carlos Wille,21,Male,Engineering,Intern,1,Bachelor,New York,35000
3,4,Michael Bryant,30,Male,Finance,Analyst,9,Bachelor,New York,75000
4,5,Paula Douglas,25,Female,HR,Analyst,2,Master,Seattle,70000


In [10]:
# Step 3: Data Preprocessing
# Check for nulls
print(dataset.isnull().sum())

# Encode categorical features
label_encoders = {}
for col in dataset.select_dtypes(include='object').columns:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])
    label_encoders[col] = le


Employee_ID         0
Name                0
Age                 0
Gender              0
Department          0
Job_Title           0
Experience_Years    0
Education_Level     0
Location            0
Salary              0
dtype: int64


In [15]:
# Scale numeric features
scaler = StandardScaler()
dataset[['Age', 'Experience_Years']] = scaler.fit_transform(dataset[['Age', 'Experience_Years']])

In [16]:
# Step 4: Feature Engineering (optional)
# Not required if dataset is clean and simple

# Step 5: Split Data
X = dataset.drop('Salary', axis=1)
y = dataset['Salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Step 6: Train Models
lr = LinearRegression()
lr.fit(X_train, y_train)

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)



In [18]:
# Step 7: Evaluate Models
def evaluate(model):
    y_pred = model.predict(X_test)
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2 Score:", r2_score(y_test, y_pred))

print("Linear Regression Evaluation")
evaluate(lr)

print("Random Forest Regressor Evaluation")
evaluate(rf)


Linear Regression Evaluation
MAE: 11535.57892774409
MSE: 216208726.72552112
RMSE: 14704.037769453706
R2 Score: 0.8974177418548799
Random Forest Regressor Evaluation
MAE: 3618.6
MSE: 20457642.5
RMSE: 4523.012546964688
R2 Score: 0.9902936796504067


In [19]:
# Step 8: Streamlit App
# Save this section as app.py for deployment
"""
import streamlit as st
st.title("Salary Prediction App")

age = st.slider("Age", 20, 60)
experience = st.slider("Experience", 0, 40)
education = st.selectbox("Education Level", list(label_encoders['Education'].classes_))
skill = st.selectbox("Skill Level", list(label_encoders['Skill Level'].classes_))

# Convert input
input_df = pd.DataFrame({
    'Age': [age],
    'Experience': [experience],
    'Education': [label_encoders['Education'].transform([education])[0]],
    'Skill Level': [label_encoders['Skill Level'].transform([skill])[0]]
})

input_df[['Age', 'Experience']] = scaler.transform(input_df[['Age', 'Experience']])

prediction = rf.predict(input_df)[0]
st.success(f"Predicted Salary: ${prediction:,.2f}")
"""

# Step 9: Deployment
# Use 'render.yaml' or Render Dashboard to deploy the app


'\nimport streamlit as st\nst.title("Salary Prediction App")\n\nage = st.slider("Age", 20, 60)\nexperience = st.slider("Experience", 0, 40)\neducation = st.selectbox("Education Level", list(label_encoders[\'Education\'].classes_))\nskill = st.selectbox("Skill Level", list(label_encoders[\'Skill Level\'].classes_))\n\n# Convert input\ninput_df = pd.DataFrame({\n    \'Age\': [age],\n    \'Experience\': [experience],\n    \'Education\': [label_encoders[\'Education\'].transform([education])[0]],\n    \'Skill Level\': [label_encoders[\'Skill Level\'].transform([skill])[0]]\n})\n\ninput_df[[\'Age\', \'Experience\']] = scaler.transform(input_df[[\'Age\', \'Experience\']])\n\nprediction = rf.predict(input_df)[0]\nst.success(f"Predicted Salary: ${prediction:,.2f}")\n'