In [2]:
# data handling and visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# data preparation libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_regression

# model
from sklearn.ensemble import GradientBoostingRegressor

# evaluation
from sklearn import metrics

# Pipeline and transformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# random seed
np.random.seed(1111)

# loading the dataset
df=pd.read_csv('/content/drive/MyDrive/adult 3.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [9]:
# Encode the target variable
label_encoder = LabelEncoder()
df['income'] = label_encoder.fit_transform(df['income'])

X = df.drop('income', axis=1) # X contains all features EXCEPT 'income'
y = df['income']

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((34189, 14), (14653, 14), (34189,), (14653,))

In [12]:
# preprocessing transformer
ordinal_columns = ['occupation', 'workclass', 'marital-status', 'relationship', 'race', 'native-country'] # REMOVED 'income'
ordinal_columns_no_order = ['gender', 'education']
trans1 = ColumnTransformer(
                  [('ordinal-encoding', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_columns), # Using one OrdinalEncoder for all categorical features
                   ('ordinal-encoding-no-order', OrdinalEncoder(), ordinal_columns_no_order)], # Keep 'gender' and 'education' separate if needed, or combine
                   remainder='passthrough')

trans2 = SelectKBest(f_regression, k=5)

trans3 = GradientBoostingRegressor()

In [13]:
# creating a pipeline
pipeline = Pipeline(steps=[('preprocessing', trans1),
                           ('feature_selection', trans2),
                           ('model', trans3)])

In [14]:
pipeline.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [15]:
# parameters for hyperparameter tuning

param_dist = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'model__max_depth': [2, 3, 4, 5],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__subsample': [0.6, 0.8, 1.0]
}


In [19]:
# hyperparameter tuning using RandomizedSearchCV
random_search= RandomizedSearchCV(pipeline, param_dist, n_iter=100, cv=3, scoring= 'r2', n_jobs=-1)
random_search.fit(x_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [20]:
print('Best Parameters: ', random_search.best_params_)
print('Best Score: ', random_search.best_score_)

Best Parameters:  {'model__subsample': 1.0, 'model__n_estimators': 200, 'model__min_samples_split': 10, 'model__min_samples_leaf': 2, 'model__max_depth': 5, 'model__learning_rate': 0.05}
Best Score:  0.4538080486940985


In [21]:
# extracting the SelectKBest feature selection step from the best model
best_model = random_search.best_estimator_
select_k_best = best_model.named_steps['feature_selection']

In [22]:
# extracting the names of the best features selected through SelectKBest
mask = select_k_best.get_support()

feature_names = x_train.columns
selected_features = feature_names[mask]

In [23]:
# updating training and test datasets with only the selected best features
x_train_new = x_train[selected_features]
x_test_new = x_test[selected_features]

In [24]:
x_train_new.head()

Unnamed: 0,education,race,capital-gain,capital-loss,native-country
11785,HS-grad,White,0,0,United-States
26590,Assoc-voc,White,0,0,United-States
36997,5th-6th,White,0,0,?
21572,Masters,White,0,0,United-States
29566,HS-grad,White,6849,0,United-States


In [25]:
# creating the final pipeline
final_pipeline = Pipeline(steps=[('preprocessing', trans1),
                                 ('feature_selection', trans2), # Include SelectKBest in the final pipeline
                                 ('model', GradientBoostingRegressor())])
# extracting best paramteres
best_params= random_search.best_params_

# updating pipeline with the best parameters
final_pipeline.set_params(**best_params)

In [26]:
x_train_new

Unnamed: 0,education,race,capital-gain,capital-loss,native-country
11785,HS-grad,White,0,0,United-States
26590,Assoc-voc,White,0,0,United-States
36997,5th-6th,White,0,0,?
21572,Masters,White,0,0,United-States
29566,HS-grad,White,6849,0,United-States
...,...,...,...,...,...
25235,Assoc-voc,White,0,0,United-States
39203,Assoc-acdm,White,0,0,Mexico
42163,Some-college,Black,0,0,United-States
11700,HS-grad,White,0,0,United-States


In [27]:
final_pipeline.fit(x_train, y_train) # Fit with original x_train

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [28]:
import pickle
pickle.dump(final_pipeline, open('final-pipeline-salary-prediction.pkl', 'wb'))

In [29]:
# test_input for pipeline testing in the next notebook
num=np.random.randint(1000)
a = x_train_new.iloc[num,:]# random number for index value
{x : [value] for x, value in a.items()}

{'education': ['HS-grad'],
 'race': ['White'],
 'capital-gain': [np.int64(3781)],
 'capital-loss': [np.int64(0)],
 'native-country': ['United-States']}

In [30]:
y_train.iloc[num]

np.int64(0)

In [31]:
import streamlit as st
import pickle
import numpy as np
import pandas as pd

# load trained model
pipe=pickle.load(open('/content/final-pipeline-salary-prediction.pkl', 'rb'))

# app title
st.title('Salary Prediction App')


# getting user input
sex = st.selectbox('Gender', ['F', 'M'])
designation = st.selectbox('Designation', ['Analyst', 'Associate', 'Senior Analyst', 'Manager', 'Senior Manager', 'Director'])
unit = st.selectbox('Unit', ['Marketing', 'Finance', 'Web', 'IT', 'Operations'])
past_exp= st.number_input('Past Experince (in years)', min_value=0, step=1)
years_experience = st.number_input('Experience in current company (in years)', min_value=0, step=1)

# predict button
if st.button('Predict Salary!'):
    input_data = pd.DataFrame({'SEX' : [sex],
                              'DESIGNATION' : [designation],
                              'UNIT' : [unit],
                              'PAST EXP' : [past_exp],
                              'years_experience' : [years_experience]})


    prediction = pipe.predict(input_data)

    st.write(f'Predicted Salary: ${prediction[0]:.2f}')

ModuleNotFoundError: No module named 'streamlit'

In [22]:
%pip install streamlit

Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hI