In [15]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Location of the salary dataset. Set the SALARY_CSV_PATH environment variable
# to run the notebook on another machine; when it is unset, the original
# author's local path is used, so existing setups keep working.
SALARY_CSV_PATH = os.environ.get("SALARY_CSV_PATH", "E:/Telegram Bot/Salary.csv")

df = pd.read_csv(SALARY_CSV_PATH)

In [4]:
# Preview the first ten rows to sanity-check the loaded columns and values.
df.head(10)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0
5,29.0,Male,1,Marketing Analyst,2.0,55000.0,USA,Hispanic,0
6,42.0,Female,2,Product Manager,12.0,120000.0,USA,Asian,0
7,31.0,Male,1,Sales Manager,4.0,80000.0,China,Korean,0
8,26.0,Female,1,Marketing Coordinator,1.0,45000.0,China,Chinese,0
9,38.0,Male,3,Scientist,10.0,110000.0,Australia,Australian,1


In [5]:
# Show each column's dtype and non-null count for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6684 entries, 0 to 6683
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6684 non-null   float64
 1   Gender               6684 non-null   object 
 2   Education Level      6684 non-null   int64  
 3   Job Title            6684 non-null   object 
 4   Years of Experience  6684 non-null   float64
 5   Salary               6684 non-null   float64
 6   Country              6684 non-null   object 
 7   Race                 6684 non-null   object 
 8   Senior               6684 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 470.1+ KB


In [6]:
# Non-null count per column, checked before rows with missing values are dropped.
df.count()

Age                    6684
Gender                 6684
Education Level        6684
Job Title              6684
Years of Experience    6684
Salary                 6684
Country                6684
Race                   6684
Senior                 6684
dtype: int64

In [7]:
# Discard every row that contains a missing value and rebind df to the cleaned frame.
df = df.dropna()

In [8]:
# Re-check the per-column counts after dropna to see whether any rows were removed.
df.count()

Age                    6684
Gender                 6684
Education Level        6684
Job Title              6684
Years of Experience    6684
Salary                 6684
Country                6684
Race                   6684
Senior                 6684
dtype: int64

In [9]:
# List the distinct job titles; this text column is vectorized with TF-IDF further down.
df['Job Title'].unique()

array(['Software Engineer', 'Data Analyst', 'Manager', 'Sales Associate',
       'Director', 'Marketing Analyst', 'Product Manager',
       'Sales Manager', 'Marketing Coordinator', 'Scientist',
       'Software Developer', 'HR Manager', 'Financial Analyst',
       'Project Manager', 'Customer Service Rep', 'Operations Manager',
       'Marketing Manager', 'Engineer', 'Data Entry Clerk',
       'Sales Director', 'Business Analyst', 'VP of Operations',
       'IT Support', 'Recruiter', 'Financial Manager',
       'Social Media Specialist', 'Software Manager', 'Developer',
       'Consultant', 'Product Designer', 'CEO', 'Accountant',
       'Data Scientist', 'Marketing Specialist', 'Technical Writer',
       'HR Generalist', 'Project Engineer', 'Customer Success Rep',
       'Sales Executive', 'UX Designer', 'Operations Director',
       'Network Engineer', 'Administrative Assistant',
       'Strategy Consultant', 'Copywriter', 'Account Manager',
       'Director of Marketing', 'Help Des

In [10]:
# Model inputs: all of the columns listed by df.info() other than the target.
feature_columns = [
    'Age',
    'Gender',
    'Education Level',
    'Job Title',
    'Years of Experience',
    'Country',
    'Race',
    'Senior',
]
X = df[feature_columns]

# Regression target.
y = df['Salary']

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Job titles are free text, so they are converted to TF-IDF term weights.
text_transformer = TfidfVectorizer(max_features=250)  # Cap the vocabulary at the 250 most frequent terms

preprocessor = ColumnTransformer(
    transformers=[
        # 'Job Title' is given as a bare string (not a list) so the vectorizer
        # receives a 1-D column of strings rather than a 2-D frame.
        ('text', text_transformer, 'Job Title'),
        # One-hot encode the categorical columns; handle_unknown='ignore' encodes a
        # category that was not seen during fit as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Gender', 'Country', 'Race']),
    ],
    remainder='passthrough'  # Pass every other column (the numeric features) through unchanged
)

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
from sklearn.pipeline import Pipeline
import joblib

In [19]:
# End-to-end estimator: raw DataFrame rows go through `preprocessor`, and the
# encoded features are then fed to the random forest.
# random_state is pinned (same seed as the train/test split) because the forest's
# bootstrap and feature sampling are otherwise random, which would make the
# metrics and the saved model change on every retrain.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [20]:
# Fit the preprocessing steps and the regressor together, using the training split only.
model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [21]:
# Save the whole fitted pipeline (preprocessing + regressor) in one file so that
# raw, unencoded inputs can be scored after reloading it.
joblib.dump(model, "salary_prediction_pipeline.pkl")

['salary_prediction_pipeline.pkl']

In [34]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Score the fitted pipeline on the held-out test split.
y_pred = model.predict(X_test)

# MAE and RMSE are in the same units as Salary; R² is unitless.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(y_test, y_pred)

# One report line per metric, each rounded to two decimals.
print(
    f"Mean Absolute Error (MAE): {mae:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)


Mean Absolute Error (MAE): 3819.95
Root Mean Squared Error (RMSE): 8722.88
R² Score: 0.97


In [24]:
def predict_salary_with_pipeline(age, gender, edu_level, job_title, experience, country, race, senior,
                                 model_path="salary_prediction_pipeline.pkl"):
    """Predict the salary for one person with the saved pipeline.

    The arguments are placed in a one-row DataFrame whose column names match
    the feature columns used for training, so the pipeline's own preprocessing
    is applied to the raw values. The pipeline is reloaded from disk on every
    call.

    Parameters
    ----------
    age : numeric
        Stored under 'Age'.
    gender : str
        Stored under 'Gender'.
    edu_level : int
        Stored under 'Education Level' (integer-coded in the training data).
    job_title : str
        Stored under 'Job Title'.
    experience : numeric
        Stored under 'Years of Experience'.
    country : str
        Stored under 'Country'.
    race : str
        Stored under 'Race'.
    senior : int
        Stored under 'Senior' (0 or 1 in the rows previewed by df.head()).
    model_path : str, optional
        File the fitted pipeline is loaded from. Defaults to the file written
        by the training cell, "salary_prediction_pipeline.pkl".

    Returns
    -------
    The first (and only) element of ``pipeline.predict`` for the input row.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only point model_path at files you created yourself or otherwise trust.
    pipeline = joblib.load(model_path)

    # Column names must match the training features exactly: the
    # ColumnTransformer inside the pipeline selects columns by name.
    input_df = pd.DataFrame([{
        'Age': age,
        'Gender': gender,
        'Education Level': edu_level,
        'Job Title': job_title,
        'Years of Experience': experience,
        'Country': country,
        'Race': race,
        'Senior': senior,
    }])

    # predict returns one value per input row; there is exactly one row here.
    prediction = pipeline.predict(input_df)[0]
    return prediction

In [33]:
# Smoke test: score one hand-written profile through the saved pipeline.
predict_salary_with_pipeline(25, "Male", 1, "Software Engineer", 1, "UK", "White", 0)

55220.0