<center><h2></b>Feature engineering</b></h2></center>

In [195]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
pd.pandas.set_option('display.max_columns', None)

In [196]:
df = pd.read_csv("Salary Prediction of Data Professions.csv")

In [197]:
df.columns

Index(['FIRST NAME', 'LAST NAME', 'SEX', 'DOJ', 'CURRENT DATE', 'DESIGNATION',
       'AGE', 'SALARY', 'UNIT', 'LEAVES USED', 'LEAVES REMAINING', 'RATINGS',
       'PAST EXP'],
      dtype='object')

In [198]:
# Changing the column names to lower case
feature_list = [feature.lower() for feature in df.columns]
feature_list

['first name',
 'last name',
 'sex',
 'doj',
 'current date',
 'designation',
 'age',
 'salary',
 'unit',
 'leaves used',
 'leaves remaining',
 'ratings',
 'past exp']

In [199]:
df.columns = feature_list 

In [200]:
# dropping the first and last name variables
df = df.drop(['first name', 'last name'], axis = 1)

In [201]:
df

Unnamed: 0,sex,doj,current date,designation,age,salary,unit,leaves used,leaves remaining,ratings,past exp
0,F,5-18-2014,01-07-2016,Analyst,21.0,44570,Finance,24.0,6.0,2.0,0
1,F,,01-07-2016,Associate,,89207,Web,,13.0,,7
2,F,7-28-2014,01-07-2016,Analyst,21.0,40955,Finance,23.0,7.0,3.0,0
3,F,04-03-2013,01-07-2016,Analyst,22.0,45550,IT,22.0,8.0,3.0,0
4,M,11-20-2014,01-07-2016,Analyst,,43161,Operations,27.0,3.0,,3
...,...,...,...,...,...,...,...,...,...,...,...
2634,F,6-28-2011,01-07-2016,Senior Manager,36.0,185977,Management,15.0,15.0,5.0,10
2635,F,1-14-2014,01-07-2016,Analyst,23.0,45758,IT,17.0,13.0,2.0,0
2636,F,1-23-2014,01-07-2016,Analyst,21.0,47315,Web,29.0,1.0,5.0,0
2637,F,3-17-2014,01-07-2016,Analyst,24.0,45172,Web,23.0,7.0,3.0,1


### Since we have only one doj missing value, dropping it will not cause severe loss of information  

In [202]:
df.drop(np.where(df["doj"].isna())[0], axis = 0,  inplace = True)

In [203]:
# Predictors and dependent variable
y = df['salary']
X = df.drop('salary', axis = 1)

In [204]:
# Training data and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [205]:
X_train.shape, X_test.shape

((2110, 10), (528, 10))

In [206]:
X_train.isna().sum()

sex                 0
doj                 0
current date        0
designation         0
age                 2
unit                0
leaves used         2
leaves remaining    2
ratings             1
past exp            0
dtype: int64

In [207]:
cat_features_with_nan = [feature for feature in X_train.columns if X_train[feature].isna().sum() > 0 and X_train[feature].dtypes == 'O' and feature not in ['doj', 'currebt date']] 
cat_features_with_nan

[]

In [208]:
num_features_with_nan = [feature for feature in X_train.columns if X_train[feature].isna().sum() > 0 and X_train[feature].dtypes != 'O' and feature not in ['doj', 'currebt date']] 
num_features_with_nan

['age', 'leaves used', 'leaves remaining', 'ratings']

In [209]:
for feature in num_features_with_nan:
    print(f"{feature} has {np.round(X_train[feature].isna().mean() * 100,4)} % missing values")

age has 0.0948 % missing values
leaves used has 0.0948 % missing values
leaves remaining has 0.0948 % missing values
ratings has 0.0474 % missing values


In [210]:
# Replacing numerical features missing values with median value
for feature in num_features_with_nan:
    median_val = X_train[feature].median()
    X_train[feature].fillna(median_val,inplace=True)

In [211]:
X_train[num_features_with_nan].isna().sum()

age                 0
leaves used         0
leaves remaining    0
ratings             0
dtype: int64

In [212]:
datetime_features_with_nan = [feature for feature in X_train.columns if X_train[feature].isna().sum() > 0 and feature not in num_features_with_nan and feature not in cat_features_with_nan]
datetime_features_with_nan

[]

In [213]:
X_train.isna().sum()

sex                 0
doj                 0
current date        0
designation         0
age                 0
unit                0
leaves used         0
leaves remaining    0
ratings             0
past exp            0
dtype: int64

### Creating new feature

In [214]:
X_train["overall_experience"] = (pd.to_datetime(X_train["current date"]) - pd.to_datetime(X_train["doj"])).dt.days + X_train["past exp"] * 365

In [215]:
X_train.head()

Unnamed: 0,sex,doj,current date,designation,age,unit,leaves used,leaves remaining,ratings,past exp,overall_experience
2395,F,1-17-2013,01-07-2016,Analyst,22.0,Management,18.0,12.0,2.0,0,1085
441,M,10-24-2014,01-07-2016,Analyst,21.0,IT,22.0,8.0,5.0,0,440
509,M,06-02-2014,01-07-2016,Analyst,25.0,IT,18.0,12.0,5.0,2,1314
77,M,06-03-2013,01-07-2016,Analyst,22.0,Marketing,21.0,9.0,2.0,0,948
523,F,01-06-2012,01-07-2016,Senior Manager,35.0,Marketing,29.0,1.0,4.0,10,5112


In [216]:
X_train.drop(["doj", "current date", "past exp"], axis = 1, inplace = True)

In [217]:
# categorical feature encoding
designation_encoding = {"Analyst" : 1, "Senior Analyst" : 2, "Associate" : 3, "Manager" : 4, "Senior Manager" : 5, "Director" : 6}
sex_encoding = {"F": 0, "M" : 1}
unit_encoding = {"Marketing" : 1, "Finance" : 2, "Web" : 3, "Management" : 4, "Operations" : 5, "IT" : 6}

In [218]:
X_train["designation"] = X_train["designation"].apply(lambda x: designation_encoding[x])
X_train["sex"] = X_train['sex'].apply(lambda x: sex_encoding[x])
X_train["unit"] = X_train["unit"].apply(lambda x: unit_encoding[x])

In [219]:
X_train.head()

Unnamed: 0,sex,designation,age,unit,leaves used,leaves remaining,ratings,overall_experience
2395,0,1,22.0,4,18.0,12.0,2.0,1085
441,1,1,21.0,6,22.0,8.0,5.0,440
509,1,1,25.0,6,18.0,12.0,5.0,1314
77,1,1,22.0,1,21.0,9.0,2.0,948
523,0,5,35.0,1,29.0,1.0,4.0,5112


In [220]:
# scaling the data
scaler = MinMaxScaler()
scaler.fit(X_train)

In [221]:
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)

In [222]:
X_train

Unnamed: 0,sex,designation,age,unit,leaves used,leaves remaining,ratings,overall_experience
0,0.0,0.0,0.041667,0.6,0.200000,0.800000,0.000000,0.088058
1,1.0,0.0,0.000000,1.0,0.466667,0.533333,1.000000,0.011717
2,1.0,0.0,0.166667,1.0,0.200000,0.800000,1.000000,0.115162
3,1.0,0.0,0.041667,0.0,0.400000,0.600000,0.000000,0.071843
4,0.0,0.8,0.583333,0.0,0.933333,0.066667,0.666667,0.564682
...,...,...,...,...,...,...,...,...
2105,1.0,0.2,0.291667,0.2,0.733333,0.266667,0.333333,0.198367
2106,0.0,0.2,0.208333,1.0,0.466667,0.533333,0.333333,0.071251
2107,1.0,0.0,0.166667,1.0,0.000000,1.000000,1.000000,0.114215
2108,0.0,0.0,0.083333,1.0,0.200000,0.800000,0.000000,0.046514


### Doing all the feature engineering for test data

In [223]:
for feature in num_features_with_nan:
    median_val = X_test[feature].median()
    X_test[feature].fillna(median_val,inplace=True)

In [224]:
X_test["overall_experience"] = (pd.to_datetime(X_test["current date"]) - pd.to_datetime(X_test["doj"])).dt.days + X_test["past exp"] * 365

In [225]:
X_test.drop(["doj", "current date", "past exp"], axis = 1, inplace = True)

In [226]:
X_test["designation"] = X_test["designation"].apply(lambda x: designation_encoding[x])
X_test["sex"] = X_test['sex'].apply(lambda x: sex_encoding[x])
X_test["unit"] = X_test["unit"].apply(lambda x: unit_encoding[x])

In [227]:
X_test

Unnamed: 0,sex,designation,age,unit,leaves used,leaves remaining,ratings,overall_experience
1558,0,1,24.0,2,23.0,7.0,4.0,1067
1090,0,1,22.0,1,24.0,6.0,4.0,608
2380,0,3,31.0,2,16.0,14.0,5.0,2633
2601,0,1,24.0,3,27.0,3.0,5.0,1079
742,0,1,24.0,5,29.0,1.0,2.0,1018
...,...,...,...,...,...,...,...,...
2137,0,3,29.0,4,28.0,2.0,4.0,1486
545,1,2,27.0,3,19.0,11.0,4.0,1466
1867,1,4,32.0,3,18.0,12.0,2.0,3127
2154,1,2,26.0,2,28.0,2.0,5.0,843


In [228]:
test_scaler = MinMaxScaler()
test_scaler.fit(X_test)

In [229]:
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [230]:
X_test

Unnamed: 0,sex,designation,age,unit,leaves used,leaves remaining,ratings,overall_experience
0,0.0,0.0,0.125000,0.2,0.533333,0.466667,0.666667,0.085927
1,0.0,0.0,0.041667,0.0,0.600000,0.400000,0.666667,0.031601
2,0.0,0.4,0.416667,0.2,0.066667,0.933333,1.000000,0.271275
3,0.0,0.0,0.125000,0.4,0.800000,0.200000,1.000000,0.087348
4,0.0,0.0,0.125000,0.8,0.933333,0.066667,0.000000,0.080128
...,...,...,...,...,...,...,...,...
523,0.0,0.4,0.333333,0.6,0.866667,0.133333,0.666667,0.135519
524,1.0,0.2,0.250000,0.4,0.266667,0.733333,0.666667,0.133152
525,1.0,0.6,0.458333,0.4,0.200000,0.800000,0.000000,0.329743
526,1.0,0.2,0.208333,0.2,0.866667,0.133333,1.000000,0.059415


<center><h2></b>Featute selection</b></h2></center>

In [231]:
# Using Lasso regularization
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [232]:
feature_select_model = SelectFromModel(Lasso(alpha=0.005, max_iter = 10000, random_state=42))
feature_select_model.fit(X_train, y_train)

In [233]:
feature_select_model.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True])

### All features are important in predicting salary

<center><h2></b>Model Building - Training, Evaluating, Selecting best model</b></h2></center>

In [238]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [235]:
# Custom model trainer and evaluater function
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return np.round(mse, 4), np.round(r2, 4)

In [239]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'KNN': KNeighborsRegressor(),
    'Extra Trees': ExtraTreesRegressor(random_state=42)
}

results = {}
# Model Selection
for name, model in models.items():
    mse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[name] = {'MSE': mse, 'R2': r2}
    print(f"{name} - MSE: {mse}, R2: {r2}")

print()
results_df = pd.DataFrame(results).T
print(results_df)

Linear Regression - MSE: 167487223.3748, R2: 0.8857
Decision Tree - MSE: 118536522.5455, R2: 0.9191
Random Forest - MSE: 66642584.7048, R2: 0.9545
Gradient Boosting - MSE: 57219883.5092, R2: 0.961
KNN - MSE: 142573415.9407, R2: 0.9027
Extra Trees - MSE: 60183790.4188, R2: 0.9589

                            MSE      R2
Linear Regression  1.674872e+08  0.8857
Decision Tree      1.185365e+08  0.9191
Random Forest      6.664258e+07  0.9545
Gradient Boosting  5.721988e+07  0.9610
KNN                1.425734e+08  0.9027
Extra Trees        6.018379e+07  0.9589


### Gradient Boosting Regression is the best model of all 