Importing Libraries

In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
import joblib
import warnings

warnings.filterwarnings('ignore')

Dataset Reading

In [11]:
# Load the student dataset into `data`.
# The path is a raw string so the Windows backslashes are taken literally
# instead of being parsed as (invalid) escape sequences such as "\W" or "\S".
# NOTE(review): the file carries an .xls extension but is parsed as CSV --
# confirm the file really is comma-separated text.
try:
    data = pd.read_csv(r'D:\WEB\Student\students.xls')
    print("Data read is done Succesfully....")
except FileNotFoundError:
    print("Check the dataset once....")
    # Re-raise: every later cell needs `data`, so carrying on would only
    # resurface as a confusing NameError further down the notebook.
    raise

Data read is done Succesfully....


In [12]:
# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0,college_tier,location,domain,gpa,problem_solving_skill,leetcode_score,programming_skill,technical_skills,project_count,internship_months,current_company_tier,current_companies,current_salary_lpa,expected_company_tier,expected_companies,expected_salary_lpa,extra_hours_needed_daily
0,Tier3,Hyderabad,Full Stack Development,7.77,7,144,6,"AWS, SQL, Python, Node.js",2,6,ServiceCompanies,"HCL, TCS, Infosys",6.28,ServiceCompanies,"Accenture, Cognizant, HCL",9.76,2.9
1,Tier1_Private,Bangalore,UI/UX Design,6.29,3,100,4,"Adobe XD, Figma, Sketch",0,3,ServiceCompanies,"HCL, Accenture",8.05,ServiceCompanies,"TCS, Cognizant, HCL",11.19,4.8
2,Tier3,Bangalore,Web Development,6.0,5,131,5,"MongoDB, HTML/CSS, React, Node.js",2,4,ServiceCompanies,"Accenture, Infosys",6.4,ServiceCompanies,"HCL, Infosys, Cognizant",9.48,3.8
3,Tier2,Other,Data Science,6.0,6,122,2,"Python, SQL, Tableau",2,0,ServiceCompanies,"HCL, TCS, Accenture",8.68,ServiceCompanies,"TCS, Cognizant",13.04,6.3
4,Tier3,Bangalore,Data Science,6.47,3,100,2,"Machine Learning, Python, Tableau",0,4,ServiceCompanies,"HCL, Wipro",6.43,ServiceCompanies,"Cognizant, Wipro",8.94,6.4


In [13]:
# Count missing values per column before any preprocessing.
data.isna().sum()

college_tier                0
location                    0
domain                      0
gpa                         0
problem_solving_skill       0
leetcode_score              0
programming_skill           0
technical_skills            0
project_count               0
internship_months           0
current_company_tier        0
current_companies           0
current_salary_lpa          0
expected_company_tier       0
expected_companies          0
expected_salary_lpa         0
extra_hours_needed_daily    0
dtype: int64

Data Preprocessing

In [14]:
def preprocess(data, label_encoder):
    """Return a cleaned, feature-enriched copy of ``data`` and the fitted encoder.

    All work happens on a copy, so the caller's frame is left untouched:

    1. ``label_encoder`` is fitted on ``college_tier`` and its codes are
       stored in a new ``college_tier_encoded`` column.
    2. Each numeric feature is clipped to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
       to tame outliers.
    3. ``skill_score`` is added as a weighted blend of problem-solving skill
       (0.4), programming skill (0.3) and ``leetcode_score / 25`` (0.3).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``college_tier`` and the six numeric feature columns.
    label_encoder : object
        Encoder exposing ``fit_transform``; it is (re)fitted here.

    Returns
    -------
    tuple
        ``(processed_frame, label_encoder)``.
    """
    frame = data.copy()
    frame['college_tier_encoded'] = label_encoder.fit_transform(frame['college_tier'])

    clipped_features = ('gpa', 'problem_solving_skill', 'programming_skill',
                        'leetcode_score', 'project_count', 'internship_months')
    for feature in clipped_features:
        values = frame[feature]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        spread = q3 - q1
        frame[feature] = np.clip(values, q1 - 1.5 * spread, q3 + 1.5 * spread)

    # Built from the already-clipped columns, summed left to right.
    problem_part = frame['problem_solving_skill'] * 0.4
    programming_part = frame['programming_skill'] * 0.3
    leetcode_part = frame['leetcode_score'] / 25 * 0.3
    frame['skill_score'] = problem_part + programming_part + leetcode_part

    return frame, label_encoder

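The function `mitigate_bias` reduces college-tier bias by resampling every tier that has more than 100 rows to exactly 150 rows (with replacement); smaller tiers are left unchanged.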

In [15]:
def mitigate_bias(data):
    """Rebalance ``data`` across the values of ``college_tier``.

    A tier with more than 100 rows is resampled with replacement to exactly
    150 rows (fixed ``random_state=42``); a tier with 100 rows or fewer is
    passed through unchanged. The per-tier pieces are concatenated in order
    of each tier's first appearance and given a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``college_tier`` column.

    Returns
    -------
    pandas.DataFrame
        The rebalanced frame.
    """
    def _balanced(group):
        # Small tiers are kept verbatim; only large ones are resampled.
        if len(group) <= 100:
            return group
        return resample(group, replace=True, n_samples=150, random_state=42)

    tiers = data['college_tier']
    pieces = [_balanced(data[tiers == tier]) for tier in tiers.unique()]
    return pd.concat(pieces, ignore_index=True)

Splitting the dataset into train and test sets

In [16]:
label_encoder = LabelEncoder()
scaler = StandardScaler()

data_preprocess, label_encoder = preprocess(data, label_encoder)
# NOTE(review): mitigate_bias resamples rows with replacement *before* the
# train/test split below, so copies of one row can land on both sides of the
# split and inflate the test metrics. Consider splitting first and
# rebalancing only the training rows.
data_removal_bias = mitigate_bias(data_preprocess)

feature_columns = ['gpa', 'college_tier_encoded', 'problem_solving_skill',
                   'programming_skill', 'leetcode_score', 'project_count',
                   'internship_months', 'skill_score']

x = data_removal_bias[feature_columns]
curr_y = data_removal_bias['current_salary_lpa']
exp_y = data_removal_bias['expected_salary_lpa']

# Split once, passing both targets, so features and both salary targets are
# guaranteed to refer to the same rows (no reliance on two calls happening to
# share a seed). The split is stratified by college tier.
(x_train_raw, x_test_raw,
 y_curr_train, y_curr_test,
 y_exp_train, y_exp_test) = train_test_split(
    x, curr_y, exp_y,
    test_size=0.2,
    random_state=42,
    stratify=data_removal_bias['college_tier_encoded'],
)

# Fit the scaler on the training rows only; fitting on the full matrix would
# leak test-set statistics into training.
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Full feature matrix scaled with the train-fitted scaler, kept for any later
# cell that still reads `x_scaled`.
x_scaled = scaler.transform(x)


Random Forest Model Training

In [17]:
def train_model(x, y, model_name):
    """Tune a ``RandomForestRegressor`` by exhaustive grid search.

    Parameters
    ----------
    x : array-like
        Training feature matrix.
    y : array-like
        Training target values.
    model_name : str
        Label used only in the progress messages.

    Returns
    -------
    RandomForestRegressor
        The estimator refitted on all of ``x``/``y`` with the parameter
        combination that achieved the best mean cross-validated R^2.
    """
    param = {
        'n_estimators': [100, 200, 300],
        'max_depth': [15, 20, 25, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    }

    rf = RandomForestRegressor(random_state=42)
    # The grid has 432 candidates; with 5 folds that is 2160 fits. n_jobs=-1
    # runs them on all available cores. The selected model is unchanged
    # because every forest is seeded with the same random_state.
    grid_search = GridSearchCV(rf, param, cv=5, scoring='r2', verbose=0, n_jobs=-1)
    print(f"Fitting Grid Search for the model - {model_name}")
    grid_search.fit(x, y)
    print("===== Completed ======")
    return grid_search.best_estimator_

In [18]:
# Tune one forest per salary target; both share the same training features.
curr_model = train_model(x=x_train, y=y_curr_train, model_name="Current Model")
exp_model = train_model(x=x_train, y=y_exp_train, model_name="Expected Model")

Fitting Grid Search for the model - Current Model
Fitting Grid Search for the model - Expected Model


Metrics for the Current Model and Expected Model

In [19]:
def checkScore(model, y_test, name, x_test):
    """Print R^2, MAE and RMSE of ``model`` on a held-out set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    y_test : array-like
        True target values for ``x_test``.
    name : str
        Label shown in the report header.
    x_test : array-like
        Held-out feature matrix.

    Returns
    -------
    None
        The metrics are only printed.
    """
    predictions = model.predict(x_test)
    print(f"\n=====Scores for the model - {name} ========")
    report_lines = [
        f"R-square: {r2_score(y_test, predictions)}",
        f"Mean absolute error: {mean_absolute_error(y_test, predictions)}",
        # RMSE is the square root of the mean squared error.
        f"Root mean square error: {np.sqrt(mean_squared_error(y_test, predictions))}",
    ]
    for line in report_lines:
        print(line)

In [20]:
# Score both tuned models against the same held-out feature matrix.
evaluations = (
    (curr_model, y_curr_test, "Current Model"),
    (exp_model, y_exp_test, "Expected Model"),
)
for model, y_test, name in evaluations:
    checkScore(model, y_test, name, x_test)


R-square: 0.711725080973312
Mean absolute error: 1.7239632835764556
Root mean square error: 2.594435908982028

R-square: 0.78563056743326
Mean absolute error: 2.2813384777814503
Root mean square error: 3.77363016810244


Prediction Based on the skills

In [21]:
def predict_companies(data, rng=None):
    """Suggest three current-level and three expected-level companies.

    A composite score is built from the student's skills and GPA, mapped to
    one of four company tiers, and three distinct names are drawn at random
    from the matching pools.

    Parameters
    ----------
    data : mapping
        Scalar values under the keys ``problem_solving_skill``,
        ``programming_skill``, ``leetcode_score`` and ``gpa`` (for example a
        plain dict). It is only read, never modified.
    rng : object, optional
        Source of randomness exposing ``choice(seq, size, replace=...)``,
        e.g. ``numpy.random.default_rng(seed)``. Defaults to NumPy's global
        ``np.random`` state, so results vary between calls unless that
        state is seeded.

    Returns
    -------
    tuple of str
        ``(current_companies, expected_companies)``, each a ``", "``-joined
        list of three company names.
    """
    # Kept local on purpose: writing 'skill_score' back into `data` would
    # silently modify the caller's dict.
    skill_score = (
        data['problem_solving_skill'] * 0.4
        + data['programming_skill'] * 0.3
        + data['leetcode_score'] / 25 * 0.3
    )
    # NOTE(review): these weights add up to 1.1, not 1.0 -- confirm that is
    # intended before changing the tier thresholds below.
    total_score = (
        skill_score * 0.5
        + data['problem_solving_skill'] * 0.3
        + data['programming_skill'] * 0.2
        + data['gpa'] * 0.1
    )

    top_tier = ["Google", "Microsoft", "Amazon", "Apple", "Meta", "Netflix"]
    product_tier = ["Adobe", "Salesforce", "Atlassian", "Nvidia", "Uber", "LinkedIn"]
    startup_tier = ["Zoho", "Freshworks", "Razorpay", "Postman", "Swiggy", "Zomato"]
    service_tier = ["Infosys", "TCS", "Wipro", "HCL", "Accenture", "Cognizant"]

    # Except at the two extremes, the expected pool sits one tier above the
    # current pool.
    if total_score > 8.5:
        curr_companies, exp_companies = top_tier, top_tier
    elif total_score > 7.5:
        curr_companies, exp_companies = product_tier, top_tier
    elif total_score > 6.5:
        curr_companies, exp_companies = startup_tier, product_tier
    else:
        curr_companies, exp_companies = service_tier, service_tier

    chooser = np.random if rng is None else rng
    curr_pick = chooser.choice(curr_companies, 3, replace=False)
    exp_pick = chooser.choice(exp_companies, 3, replace=False)

    # Separator is ", " (the previous " ," put the space on the wrong side).
    return ', '.join(curr_pick), ', '.join(exp_pick)

In [22]:
def extraTime(data, curr_sal, exp_sal):
    """Estimate the extra daily study hours needed to close the gap.

    The estimate starts at one hour and grows with two gaps: the combined
    shortfall against a programming skill of 8 and a LeetCode score of 180,
    and the difference between expected and current salary. Each gap is
    floored at zero, the total is rounded to one decimal place and the
    result is limited to the range 1 to 8.

    Parameters
    ----------
    data : mapping
        Must provide ``programming_skill`` and ``leetcode_score``.
    curr_sal : float
        Current salary.
    exp_sal : float
        Expected salary.

    Returns
    -------
    float or int
        Extra hours per day, between 1 and 8.
    """
    programming_shortfall = (8 - data['programming_skill']) * 10
    leetcode_shortfall = (180 - data['leetcode_score']) * 0.1
    # The floor applies to the combined shortfall, so a surplus in one
    # component can offset a deficit in the other.
    skill_gap = max(0, programming_shortfall + leetcode_shortfall)
    salary_gap = max(0, exp_sal - curr_sal)

    raw_hours = 1 + (skill_gap/100 * 6) + (salary_gap/10 * 0.5)
    extra_hours = round(raw_hours, 1)
    return min(8, max(1, extra_hours))

In [23]:
def report(data, curr_model, exp_model, label_encoder, scaler):
    """Build the salary / company / study-time report for one student.

    Parameters
    ----------
    data : mapping
        One student's raw inputs: ``gpa``, ``college_tier``,
        ``problem_solving_skill``, ``programming_skill``,
        ``leetcode_score``, ``project_count`` and ``internship_months``.
    curr_model, exp_model : object
        Fitted regressors (exposing ``predict``) for the current and the
        expected salary.
    label_encoder : object
        Fitted encoder used to turn ``college_tier`` into its numeric code.
        NOTE(review): presumably raises on a tier it was not fitted on --
        confirm and handle upstream if free-text tiers are possible.
    scaler : object
        Fitted scaler applied to the feature row before prediction.

    Returns
    -------
    dict
        Keys ``current_Salary``, ``expected_Salary``, ``current_Companies``,
        ``expected_Companies`` and ``Extra_Time_Required``.
    """
    new_data = pd.DataFrame([data])
    new_data['college_tier_encoded'] = label_encoder.transform(new_data['college_tier'])[0]

    feature_columns = ['gpa', 'college_tier_encoded', 'problem_solving_skill',
                       'programming_skill', 'leetcode_score', 'project_count',
                       'internship_months', 'skill_score']

    # Same weighted blend that preprocess() adds to the training data.
    new_data['skill_score'] = (
        new_data['problem_solving_skill'] * 0.4
        + new_data['programming_skill'] * 0.3
        + new_data['leetcode_score'] / 25 * 0.3
    )

    x_input = new_data[feature_columns]
    x_scaled = scaler.transform(x_input)

    curr_sal = curr_model.predict(x_scaled)[0]
    exp_sal = exp_model.predict(x_scaled)[0]

    # Keep the current salary within [5, 50].
    curr_salary = max(5, min(50, curr_sal))
    # Cap the expected salary at 60 and floor it on the *clamped* current
    # salary. Flooring on the raw prediction let the report show an expected
    # salary below the current one whenever the raw current prediction was
    # under 5.
    exp_salary = max(curr_salary, min(60, exp_sal))

    curr_companies, exp_companies = predict_companies(data)
    extra_time = extraTime(data, curr_salary, exp_salary)
    return {
        'current_Salary': round(curr_salary, 2),
        'expected_Salary': round(exp_salary, 2),
        'current_Companies': curr_companies,
        'expected_Companies': exp_companies,
        'Extra_Time_Required': extra_time
    }

New student Data for the prediction

In [24]:
# Raw inputs for one student, passed to report() below.
input_data = dict(
    gpa=8.8,
    college_tier='IIT',
    problem_solving_skill=8,
    programming_skill=7,
    leetcode_score=190,
    project_count=4,
    internship_months=6,
)

predicted_output = report(
    data=input_data,
    curr_model=curr_model,
    exp_model=exp_model,
    label_encoder=label_encoder,
    scaler=scaler,
)

# Print every report field as a "- key : value" bullet.
print("========== Report for the input features =========\n")
for key, value in predicted_output.items():
    print(f"- {key} : {value}")

print("\n ======== Prediction for the student is Done Successfully =========")


- current_Salary : 16.76
- expected_Salary : 25.01
- current_Companies : LinkedIn ,Salesforce ,Uber
- expected_Companies : Apple ,Netflix ,Google
- Extra_Time_Required : 2.0



Saving the trained models, label encoder, and scaler to .pkl files

In [25]:
# Persist both tuned models plus the fitted label encoder and scaler, i.e. the
# four objects report() needs to score a new student.
# joblib files are pickle-based: only load them back from a trusted source.
joblib.dump(curr_model, "Current_model.pkl")
joblib.dump(exp_model, "Expected_model.pkl")
joblib.dump(label_encoder, "Labels.pkl")
joblib.dump(scaler, "Scalers.pkl")

['Scalers.pkl']