In [1]:
# Author: Priti Gupta
# Date: June 13th, 2023
# Description: Scraping data from Glassdoor to analyse salaries of data science positions in India
# GitHub: https://github.com/PritiG1/DS-SalaryPredictor

import numpy as np
import pandas as pd


# Read the salary dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='FE_naukri_salary.csv')

In [2]:
# Remove the cap on displayed rows so pandas never truncates a printed frame.
pd.set_option('display.max_rows', None)

# Keep only rows whose Location is not 'Misc' and whose Job title is not 'na'.
df = df.loc[df['Location'].ne('Misc') & df['Job title'].ne('na')]

# Dataset for model building: the two input columns followed by the salary column.
model_df = df.loc[:, ['Job title', 'avg Experience (yrs)', 'avg salary (L)']]

# Preview the first rows as the cell output.
model_df.head()

Unnamed: 0,Job title,avg Experience (yrs),avg salary (L)
0,Data Scientist,6.0,9.5
1,Data Scientist,11.5,24.0
2,Data Scientist,9.5,19.0
3,Data Scientist,6.0,16.0
4,Data Scientist,3.5,20.0


In [3]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Dividing the dataset: every column except the last is a feature,
# the last column is the target.
X = model_df.iloc[:, :-1].values
y = model_df.iloc[:, -1].values

# Encoding categorical data: one-hot encode column 0 and pass the remaining
# column(s) through unchanged, placed after the encoded columns.
#
# ColumnTransformer only returns a sparse matrix when the stacked output's
# density is below `sparse_threshold` (default 0.3); otherwise it returns a
# dense ndarray, on which a trailing `.toarray()` raises AttributeError.
# Setting sparse_threshold=0 makes the output dense regardless of how many
# categories exist, so no sparse-to-dense conversion is needed.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# The input is a mixed string/number array, so the dense stack can come back
# with object dtype; cast explicitly to float for the downstream estimators.
X = np.asarray(ct.fit_transform(X), dtype=float)


## Multiple linear regression

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Fit a linear regression on the training portion (fit() returns the estimator).
regressor_mlr = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = regressor_mlr.predict(X_test)

# Show NumPy arrays with two decimals whenever they are printed.
np.set_printoptions(precision=2)

# R^2 on the test set; as the last expression it is displayed as the cell output.
r2_score(y_test, y_pred)

0.46686647570025164

## Polynomial linear regression

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Expand the training features into polynomial terms up to degree 4 and fit a
# linear regression on the expanded matrix.
# NOTE(review): X also holds the one-hot job-title columns, so the expansion
# generates their cross terms as well - consider expanding only the numeric
# experience column.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor_plr = LinearRegression().fit(X_poly, y_train)

# Apply the already-fitted expansion to the test rows before predicting.
y_pred = regressor_plr.predict(poly_reg.transform(X_test))

# Show NumPy arrays with two decimals whenever they are printed.
np.set_printoptions(precision=2)

# R^2 on the test set; as the last expression it is displayed as the cell output.
r2_score(y_test, y_pred)

0.36168648155264593

## Random forest regression

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Fit a 10-tree random forest; random_state pins the forest's randomness.
regressor_rf = RandomForestRegressor(n_estimators=10, random_state=0)
regressor_rf.fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = regressor_rf.predict(X_test)

# Show NumPy arrays with two decimals whenever they are printed.
np.set_printoptions(precision=2)

# R^2 on the test set; as the last expression it is displayed as the cell output.
r2_score(y_test, y_pred)

0.3929043157455375