In [16]:
import pandas as pd

df = pd.read_csv("salaries.csv")
print(df.shape)
df.head()


(10000, 15)


Unnamed: 0,age,workclass,education,marital-status,occupation,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,years_experience,skills,salary_in_inr
0,29,Private,Associate,Divorced,Data Analyst,Black,Male,1679,1385,53,Australia,<=50K,13,"SQL, Python, Tableau",923132
1,36,Local-gov,Some-college,Never-married,Data Analyst,Other,Female,3611,919,48,Canada,>50K,0,"Excel, Tableau, Python",700000
2,31,Self-emp-not-inc,PhD,Never-married,ML Engineer,Amer-Indian-Eskimo,Male,5881,1735,41,Australia,<=50K,1,"Pandas, Python, Scikit-learn",1012582
3,57,Self-emp-inc,Associate,Widowed,Web Developer,Other,Male,1139,93,51,United States,>50K,2,"CSS, HTML, Node.js",638216
4,51,State-gov,PhD,Married-civ-spouse,Web Developer,Asian-Pac-Islander,Male,4374,1437,59,China,>50K,19,"CSS, Node.js, HTML",1077793


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Drop unused columns if needed
df.drop(columns=['income'], errors='ignore', inplace=True)

# One-hot encode categorical columns
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'race', 'gender', 'native-country']
df = pd.get_dummies(df, columns=categorical_cols)

# Extract skills
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))
skills_matrix = vectorizer.fit_transform(df['skills'])
skills_df = pd.DataFrame(skills_matrix.toarray(), columns=[f"skill_{s}" for s in vectorizer.get_feature_names_out()])

df.reset_index(drop=True, inplace=True)
skills_df.reset_index(drop=True, inplace=True)

df = pd.concat([df.drop('skills', axis=1), skills_df], axis=1)




In [18]:
X = df.drop(columns=["salary_in_inr"])
y = df["salary_in_inr"]


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "KNN": KNeighborsRegressor()
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results.append({"Model": name, "R2 Score": r2, "MAE": mae, "RMSE": rmse})

results_df = pd.DataFrame(results).sort_values(by="R2 Score", ascending=False)
print(results_df)


               Model  R2 Score            MAE           RMSE
0  Linear Regression  0.931964   49689.849670   66491.470134
2  Gradient Boosting  0.930499   50211.310797   67203.586224
1      Random Forest  0.925690   51031.817090   69490.012048
3      Decision Tree  0.858047   67456.544500   96044.081680
4                KNN -0.187821  230329.240200  277826.308187


In [21]:
import joblib

# Replace with best model name from above
best_model = LinearRegression()
best_model.fit(X_train, y_train)

joblib.dump(best_model, "salary_model.pkl")
joblib.dump(X.columns.tolist(), "model_features.pkl")


['model_features.pkl']

In [23]:
!pip install xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

search = RandomizedSearchCV(xgb, param_distributions=param_grid,
                             scoring='r2', n_iter=10, cv=3, verbose=1, random_state=42)
search.fit(X_train, y_train)

best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Best R2 Score: {r2_score(y_test, y_pred):.4f}")


Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/150.0 MB 693.0 kB/s eta 0:03:36
   ---------------------------------------- 0.5/150.0 MB 693.0 kB/s eta 0:03:36
   ---------------------------------------- 0.5/150.0 MB 693.0 kB/s eta 0:03:36
   ---------------------------------------- 0.8/150.0 MB 521.8 kB/s eta 0:04:46
   ---------------------------------------- 1.0/150.0 MB 607.1 kB/s eta 0:04:06
   ---------------------------------------- 1.0/150.0 MB 607.1 kB/s eta 0:04:06
   -------------

In [24]:
joblib.dump(best_model, "salary_model1.pkl")
joblib.dump(X.columns.tolist(), "model_features1.pkl")


['model_features1.pkl']