In [1]:
# import the libraries and dependencies

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# import the CSV data
cities_df = pd.read_csv('C:/Users/Sikkim/Desktop/Python Project/cities.csv', header=0)
colleges_df = pd.read_csv('C:/Users/Sikkim/Desktop/Python Project/Colleges.csv', header=0)
employees_df = pd.read_csv('C:/Users/Sikkim/Desktop/Python Project/employees.csv', header=0)

In [4]:
cities_df.head()

Unnamed: 0,Metrio City,non-metro cities
0,Mumbai,Dehradun
1,Delhi,Durgapur
2,Kolkata,Asansol
3,Chennai,Rourkela
4,Bangalore,Kozhikode


In [5]:
colleges_df.head()

Unnamed: 0,Tier 1,Tier 2,Tier 3
0,IIT Bombay,IIIT Bangalore,"Ramaiah Institute of Technology, Bengaluru"
1,IIT Delhi,IIIT Delhi,TIET/Thapar University
2,IIT Kharagpur,IGDTUW,Manipal Main Campus
3,IIT Madras,NIT Calicut,VIT Vellore
4,IIT Kanpur,IIITM Gwalior,SRM Main Campus


In [6]:
employees_df.head()

Unnamed: 0,College,City,Role,Previous CTC,Previous job change,Graduation Marks,EXP (Month),CTC
0,SVNIT Surat,Asansol,Manager,55523,3,66,19,71406.58
1,NIT Bhopal,Ajmer,Executive,57081,1,84,18,68005.87
2,"IEM, Kolkata",Rajpur Sonarpur,Executive,60347,2,52,28,76764.02
3,"KIIT, Bhubaneswar",Ajmer,Executive,49010,2,81,33,82092.39
4,DTU,Durgapur,Executive,57879,4,74,32,73878.1


In [7]:
#convert the categorical data into numerical data such that 0 goes for non-metro and 1 for metro cities

In [8]:
city_dict = {}

for city in cities_df['Metrio City'].dropna():
    city_dict[city] = 1
for city in cities_df['non-metro cities'].dropna():
    city_dict[city] = 0

In [9]:
city_dict

{'Mumbai': 1,
 'Delhi': 1,
 'Kolkata': 1,
 'Chennai': 1,
 'Bangalore': 1,
 'Hyderabad': 1,
 'Ahmedabad': 1,
 'Pune': 1,
 'Surat': 1,
 'Dehradun': 0,
 'Durgapur': 0,
 'Asansol': 0,
 'Rourkela': 0,
 'Kozhikode': 0,
 'Rajpur Sonarpur': 0,
 'Kolhapur': 0,
 'Ajmer': 0}

In [10]:
# converting the college names into numerial formats 

In [11]:
college_dict = {}
for college in colleges_df['Tier 1'].dropna():
    college_dict[college] = 1
for college in colleges_df['Tier 2'].dropna():
    college_dict[college] = 2
for college in colleges_df['Tier 3'].dropna():
    college_dict[college] = 3

In [12]:
college_dict

{'IIT Bombay': 1,
 'IIT Delhi': 1,
 'IIT Kharagpur': 1,
 'IIT Madras': 1,
 'IIT Kanpur': 1,
 'IIT Roorkee': 1,
 'IIT Guwahati': 1,
 'IIIT Hyderabad': 1,
 'BITS Pilani (Pilani Campus)': 1,
 'IIT Indore': 1,
 'IIT Ropar': 1,
 'IIT BHU (Varanasi)': 1,
 'IIT ISM Dhanbad': 1,
 'DTU': 1,
 'NSUT Delhi (NSIT)': 1,
 'NIT Tiruchipally (Trichy)': 1,
 'NIT Warangal': 1,
 'NIT Surathkal (Karnataka)': 1,
 'Jadavpur University': 1,
 'BITS Pilani (Hyderabad Campus)': 1,
 'BITS Pilani (Goa Campus)': 1,
 'IIIT Allahabad': 1,
 'IIIT Bangalore': 2,
 'IIIT Delhi': 2,
 'IGDTUW': 2,
 'NIT Calicut': 2,
 'IIITM Gwalior': 2,
 'IIIT Lucknow': 2,
 'MNNIT Allahabad': 2,
 'Punjab Engineering College': 2,
 'DAIICT': 2,
 'MNIT Jaipur': 2,
 'NIT Durgapur': 2,
 'VNIT Nagpur': 2,
 'LNMIIT': 2,
 'BIT Mesra': 2,
 'SVNIT Surat': 2,
 'NIT Jalandhar': 2,
 'NIT Jamshedpur': 2,
 'NIT Kurukshetra': 2,
 'NIT Patna': 2,
 'NIT Raipur': 2,
 'NIT Bhopal': 2,
 'NIT Rourkela': 2,
 'NIT Silchar': 2,
 'NIT Sikkim': 2,
 'IIIT Jabalpur': 

In [13]:
employees_df['College'] = employees_df['College'].map(college_dict)

In [14]:
# convert the role into numurical values

In [15]:
employees_df['Role'] = employees_df['Role'].map({'Executive': 0, 'Manager': 1})

In [16]:
# convert cities also into numuric values

In [17]:
employees_df['City'] = employees_df['City'].map(city_dict)

In [18]:
employees_df.head()

Unnamed: 0,College,City,Role,Previous CTC,Previous job change,Graduation Marks,EXP (Month),CTC
0,2,0,1,55523,3,66,19,71406.58
1,2,0,0,57081,1,84,18,68005.87
2,3,0,0,60347,2,52,28,76764.02
3,3,0,0,49010,2,81,33,82092.39
4,1,0,0,57879,4,74,32,73878.1


In [20]:
employees_df.shape

(1589, 8)

In [21]:
employees_df.info

<bound method DataFrame.info of       College  City  Role  Previous CTC  Previous job change  \
0           2     0     1         55523                    3   
1           2     0     0         57081                    1   
2           3     0     0         60347                    2   
3           3     0     0         49010                    2   
4           1     0     0         57879                    4   
...       ...   ...   ...           ...                  ...   
1584        1     1     0         61285                    3   
1585        1     0     0         63140                    1   
1586        1     1     0         44907                    1   
1587        1     0     0         52054                    2   
1588        2     1     1         44353                    3   

      Graduation Marks  EXP (Month)       CTC  
0                   66           19  71406.58  
1                   84           18  68005.87  
2                   52           28  76764.02  
3      

In [None]:
# Drop rows with missing values

In [22]:
employees_df = employees_df.dropna()

In [23]:
employees_df.shape

(1589, 8)

In [24]:
# Training the data based on CTC

In [25]:
# Selecting features and target variable that is CTC
X = employees_df.drop('CTC', axis=1)
y = employees_df['CTC']  

In [35]:
# splitting the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
# Linear Regression

In [32]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [33]:
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

In [34]:
print(f'Linear Regression - Mean Squared Error: {mse_lr}')
print(f'Linear Regression - Root Mean Squared Error: {rmse_lr}')
print(f'Linear Regression - R-squared: {r2_lr}')

Linear Regression - Mean Squared Error: 77362774.9495656
Linear Regression - Root Mean Squared Error: 8795.611118595774
Linear Regression - R-squared: 0.593351795838508


In [None]:
# Random Forest Regression

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [38]:
rf_model = RandomForestRegressor(random_state=42)

In [39]:
# Grid Search for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [40]:
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [41]:
best_rf_model = grid_search.best_estimator_

In [43]:
# Predict and evaluate values
y_pred_rf = best_rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest - Mean Squared Error: {mse_rf}')
print(f'Random Forest - Root Mean Squared Error: {rmse_rf}')
print(f'Random Forest - R-squared: {r2_rf}')

Random Forest - Mean Squared Error: 50867855.85858002
Random Forest - Root Mean Squared Error: 7132.170487206544
Random Forest - R-squared: 0.6571886810415493


In [44]:
# save the model
import joblib
joblib.dump(best_rf_model, 'salary_prediction_model.pkl')

['salary_prediction_model.pkl']

In [None]:
# Load the model


In [45]:
model = joblib.load('salary_prediction_model.pkl')

In [46]:
new_employee_data =np.array([2, 0, 1, 10000, 1, 69, 6]).reshape(1, -1)

In [47]:
new_employee_scaled = scaler.transform(new_employee_data)



In [48]:
# Predict the salary for fuuture


In [49]:
predicted_salary = model.predict(new_employee_scaled)

In [50]:
print("Predicted salary:", predicted_salary)

Predicted salary: [72373.19672351]


In [51]:
 #1 Your views about the problem statement?
    """Predicting the future salary of an employee based on various factors such as city, current role, current total 
    compensation (CTC), years of experience, and education can provide valuable insights into potential salary trends. 
    By analyzing historical data of employees with similar profiles and tracking their career progression within the company
    or industry, it's possible to identify patterns and correlations that can predict the salary.

IndentationError: unexpected indent (1321275048.py, line 2)

In [None]:
#2 What will be your approach to solving this task?
""""" My approach involves data preprocessing, feature engineering, model selection, training, evaluation, and optimization
to predict employee salaries.""""

#3 What were the available ML model options you had to perform this task?
""""""The available options included Linear Regression, Ridge Regression, Lasso Regression, Decision Tree Regressor,
Random Forest Regressor, Gradient Boosting Regressor, and XGBoost Regressor, but i have used linear regression and random forest""""""

#4 Which model’s performance is best and what could be the possible reason for that?
"""""" Random Forest Regressor performed the best due to its ability to handle non-linear relationships and interactions 
between features, so i have used Random Forest to perform the task.

# 5 What steps can you take to improve this selected model’s performance even further?
"""""" Improvements can include more feature engineering, hyperparameter tuning with GridSearchCV, and using ensemble methods 
like stacking.

#6.	Steps required before training the model: 
Gather historical data on employee salaries, including features such as qualifications, experience, performance ratings, 
education level, past job experience, certifications, location, and role, which has been provided in CSV files.
After understanding the datasets and their relationships, load the CSV files into Jupyter Notebook. Identify missing
values in the dataset. Convert the 'college' field into numerical data using tiers.
