<a href="https://colab.research.google.com/github/Vishal-3600/Leader/blob/main/featureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Load the employee promotion dataset; the relative path is resolved against
# the notebook's current working directory.
df=pd.read_csv('employee_promotion_data.csv')

In [None]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion
0,50,32717,20,8,3,HR Manager,Bachelor,Hyderabad,0
1,36,89676,14,9,4,Data Analyst,PhD,Chennai,0
2,29,57952,2,1,3,Data Analyst,High School,Hyderabad,0
3,42,116444,9,2,5,Project Manager,Bachelor,Bangalore,1
4,40,72460,20,9,3,Project Manager,High School,Delhi,1


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 9)

# **Feature Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the target column. `assign` returns a new frame, so the raw
# `df` is left untouched and `df_encoded` carries the extra column.
label_encoded = LabelEncoder()
df_encoded = df.assign(
    Promotion_Enc=label_encoded.fit_transform(df["Promotion"])
)
df_encoded.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc
0,50,32717,20,8,3,HR Manager,Bachelor,Hyderabad,0,0
1,36,89676,14,9,4,Data Analyst,PhD,Chennai,0,0
2,29,57952,2,1,3,Data Analyst,High School,Hyderabad,0,0
3,42,116444,9,2,5,Project Manager,Bachelor,Bangalore,1,1
4,40,72460,20,9,3,Project Manager,High School,Delhi,1,1


In [None]:
# Ordinal encoding for Education: the position of each level in the list
# below is its integer code, lowest qualification first.
education_encoded = {
    level: rank
    for rank, level in enumerate(["High School", "Bachelor", "Master", "PhD"])
}
# Any Education value that is not a key of the mapping becomes NaN.
df_encoded["Education_Enc"] = df_encoded["Education"].map(education_encoded)
df_encoded.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc,Education_Enc
0,50,32717,20,8,3,HR Manager,Bachelor,Hyderabad,0,0,1
1,36,89676,14,9,4,Data Analyst,PhD,Chennai,0,0,3
2,29,57952,2,1,3,Data Analyst,High School,Hyderabad,0,0,0
3,42,116444,9,2,5,Project Manager,Bachelor,Bangalore,1,1,1
4,40,72460,20,9,3,Project Manager,High School,Delhi,1,1,0


In [None]:
# One-hot encoding for City: the original column is replaced by one
# indicator column per distinct city, each named "City_<value>".
# NOTE(review): this cell consumes the "City" column, so running it a second
# time on the already-encoded frame is expected to fail; re-run from the copy
# step if the cell needs to be repeated.
df_encoded = pd.get_dummies(
    data=df_encoded,
    columns=["City"],
    prefix="City",
)
df_encoded.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Bangalore,City_Chennai,City_Delhi,City_Hyderabad,City_Mumbai,City_Pune
0,50,32717,20,8,3,HR Manager,Bachelor,0,0,1,False,False,False,True,False,False
1,36,89676,14,9,4,Data Analyst,PhD,0,0,3,False,True,False,False,False,False
2,29,57952,2,1,3,Data Analyst,High School,0,0,0,False,False,False,True,False,False
3,42,116444,9,2,5,Project Manager,Bachelor,1,1,1,True,False,False,False,False,False
4,40,72460,20,9,3,Project Manager,High School,1,1,0,False,False,True,False,False,False


In [None]:
# Frequency encoding for JobTitle: each title is replaced by its relative
# frequency scaled by 1000. With a 1000-row frame this scaled value equals
# the raw occurrence count of the title.
job_title_freq = df_encoded["JobTitle"].value_counts(normalize=True).mul(1000)
df_encoded["JobTitle_Enc"] = df_encoded["JobTitle"].map(job_title_freq)
df_encoded.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Bangalore,City_Chennai,City_Delhi,City_Hyderabad,City_Mumbai,City_Pune,JobTitle_Enc
0,50,32717,20,8,3,HR Manager,Bachelor,0,0,1,False,False,False,True,False,False,208.0
1,36,89676,14,9,4,Data Analyst,PhD,0,0,3,False,True,False,False,False,False,199.0
2,29,57952,2,1,3,Data Analyst,High School,0,0,0,False,False,False,True,False,False,199.0
3,42,116444,9,2,5,Project Manager,Bachelor,1,1,1,True,False,False,False,False,False,206.0
4,40,72460,20,9,3,Project Manager,High School,1,1,0,False,False,True,False,False,False,206.0


In [None]:
# Target encoding for Education: every level is replaced by the mean of
# Promotion_Enc (i.e. the promotion rate) observed for that level.
# NOTE(review): the means are computed over every row of df_encoded, so each
# row's encoding includes its own target value; if this feature feeds a model,
# compute the means on the training split only.
education_target_mean = (
    df_encoded["Promotion_Enc"]
    .groupby(df_encoded["Education"])
    .mean()
    .to_dict()
)
df_encoded["Education_Target_Enc"] = df_encoded["Education"].map(education_target_mean)
df_encoded.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Bangalore,City_Chennai,City_Delhi,City_Hyderabad,City_Mumbai,City_Pune,JobTitle_Enc,Education_Target_Enc
0,50,32717,20,8,3,HR Manager,Bachelor,0,0,1,False,False,False,True,False,False,208.0,0.509158
1,36,89676,14,9,4,Data Analyst,PhD,0,0,3,False,True,False,False,False,False,199.0,0.549407
2,29,57952,2,1,3,Data Analyst,High School,0,0,0,False,False,False,True,False,False,199.0,0.522267
3,42,116444,9,2,5,Project Manager,Bachelor,1,1,1,True,False,False,False,False,False,206.0,0.509158
4,40,72460,20,9,3,Project Manager,High School,1,1,0,False,False,True,False,False,False,206.0,0.522267


In [None]:
# Scale Salary with two different scalers and keep both results side by side.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Separate frame so the encoded data stays free of the scaled columns.
df_scaled = df_encoded.copy()

standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Both scalers take a 2-D input, hence the double brackets around "Salary".
# Standard scaling: subtract the mean, divide by the standard deviation.
df_scaled["Salary_StandardScaled"] = standard_scaler.fit_transform(
    df_scaled[["Salary"]]
)
# Min-max scaling: map the observed minimum to 0 and the maximum to 1.
df_scaled["Salary_MinMaxScaled"] = minmax_scaler.fit_transform(
    df_scaled[["Salary"]]
)

# Show the raw and scaled values for the first few rows.
df_scaled.head()[["Salary", "Salary_StandardScaled", "Salary_MinMaxScaled"]]

Unnamed: 0,Salary,Salary_StandardScaled,Salary_MinMaxScaled
0,32717,-1.680658,0.022195
1,89676,0.001623,0.497999
2,57952,-0.935344,0.232994
3,116444,0.792215,0.721605
4,72460,-0.506851,0.354186


In [None]:
# Summary statistics for the raw and scaled salary columns: the standard-scaled
# column should have mean ~0 and the min-max column should span [0, 1].
# describe() reports the sample std (ddof=1), so the standard-scaled std comes
# out slightly above 1 rather than exactly 1.
df_scaled[["Salary","Salary_StandardScaled","Salary_MinMaxScaled"]].describe()



Unnamed: 0,Salary,Salary_StandardScaled,Salary_MinMaxScaled
count,1000.0,1000.0,1000.0
mean,89621.044,1.54543e-16,0.49754
std,33875.131043,1.0005,0.282974
min,30060.0,-1.759133,0.0
25%,59758.5,-0.8819888,0.248085
50%,88329.5,-0.03814569,0.486751
75%,117161.0,0.8133913,0.727594
max,149771.0,1.776526,1.0


In [3]:
import pandas as pd
# Load the retail customers dataset. This rebinds `df`, so the employee frame
# loaded earlier in the notebook is no longer reachable under that name.
# NOTE(review): the path is absolute (filesystem root) — confirm the file is
# really placed there in the target environment, or switch to a relative path.
df=pd.read_csv('/retail_customers_10000.csv')

In [4]:
# Preview the first rows of the retail customers frame.
display(df.head())

Unnamed: 0,Customer ID,Name,Age,Gender,Signup Date,Last Purchase,Total Purchases,Total Spent ($),Country,Email,Device Type,Is Subscribed,Feedback Score
0,1,Allison Hill,58,Male,2021-07-12,2022-02-13,2,3720.67,Germany,garzaanthony@robinson.org,Mobile,Yes,8.2
1,2,Cristian Santos,65,Male,2022-11-24,2024-09-05,44,3716.31,China,zlawrence@blake.biz,Mobile,No,7.3
2,3,Christopher Bernard,20,Male,2022-01-04,2022-06-11,6,1132.26,China,wdavis@baker.com,Tablet,Yes,8.5
3,4,Jamie Arnold,53,Male,2024-02-09,2025-06-11,46,3266.93,China,janetwilliams@gmail.com,Desktop,Yes,9.8
4,5,Victoria Wyatt,46,Female,2024-04-07,2024-06-24,1,3806.1,UK,shawn52@yahoo.com,Tablet,No,7.1


In [6]:
# Bucket Age into four labelled ranges. Intervals are right-closed, i.e.
# (10, 30], (30, 50], (50, 70] and (70, 90]; ages outside (10, 90] get NaN.
bins = [10, 30, 50, 70, 90]
labels = ["young", "Mid", "old", "Super Old"]
df["age_group"] = pd.cut(x=df["Age"], bins=bins, labels=labels, right=True)

# Show age next to its assigned group for the first 20 rows.
df[["Age", "age_group"]].iloc[:20]

Unnamed: 0,Age,age_group
0,58,old
1,65,old
2,20,young
3,53,old
4,46,Mid
5,39,Mid
6,42,Mid
7,64,old
8,58,old
9,67,old


In [10]:
# Parse both date columns into datetime values so the .dt accessor and date
# arithmetic can be used on them later.
date_columns = ['Signup Date', 'Last Purchase']
df[date_columns] = df[date_columns].apply(pd.to_datetime)


In [11]:
# Confirm the column dtypes, in particular that both date columns are now
# datetime64 rather than object.
df.dtypes


Unnamed: 0,0
Customer ID,int64
Name,object
Age,int64
Gender,object
Signup Date,datetime64[ns]
Last Purchase,datetime64[ns]
Total Purchases,int64
Total Spent ($),float64
Country,object
Email,object


In [15]:
# Time-based features derived from the two datetime columns.
df['signup_year'] = df['Signup Date'].dt.year
df['signup_month'] = df['Signup Date'].dt.month

# Take a single snapshot of "now" so both elapsed-day features are measured
# against the same instant (previously today() was sampled once per feature).
# NOTE: these two columns depend on the date the cell is executed; replace
# reference_ts with a fixed pd.Timestamp("YYYY-MM-DD") for reproducible output.
reference_ts = pd.Timestamp.today()
df['days_since_signup'] = (reference_ts - df['Signup Date']).dt.days
df['days_since_last_purchase'] = (reference_ts - df['Last Purchase']).dt.days

In [16]:
# Inspect the first 20 rows, including the newly added age and time features.
df.head(20)

Unnamed: 0,Customer ID,Name,Age,Gender,Signup Date,Last Purchase,Total Purchases,Total Spent ($),Country,Email,Device Type,Is Subscribed,Feedback Score,age_group,signup_year,signup_month,days_since_signup,days_since_last_purchase
0,1,Allison Hill,58,Male,2021-07-12,2022-02-13,2,3720.67,Germany,garzaanthony@robinson.org,Mobile,Yes,8.2,old,2021,7,1481,1265
1,2,Cristian Santos,65,Male,2022-11-24,2024-09-05,44,3716.31,China,zlawrence@blake.biz,Mobile,No,7.3,old,2022,11,981,330
2,3,Christopher Bernard,20,Male,2022-01-04,2022-06-11,6,1132.26,China,wdavis@baker.com,Tablet,Yes,8.5,young,2022,1,1305,1147
3,4,Jamie Arnold,53,Male,2024-02-09,2025-06-11,46,3266.93,China,janetwilliams@gmail.com,Desktop,Yes,9.8,old,2024,2,539,51
4,5,Victoria Wyatt,46,Female,2024-04-07,2024-06-24,1,3806.1,UK,shawn52@yahoo.com,Tablet,No,7.1,Mid,2024,4,481,403
5,6,Michael Miles,39,Female,2023-06-27,2024-11-11,10,1115.8,UAE,amandasanchez@gray-mayo.net,Mobile,Yes,7.1,Mid,2023,6,766,263
6,7,Frederick Tate,42,Male,2020-10-25,2022-10-06,23,4245.1,Brazil,richard13@walter.biz,Desktop,Yes,9.9,Mid,2020,10,1741,1030
7,8,Maria Thomas,64,Female,2021-03-04,2023-04-09,35,667.89,Australia,zhurst@yahoo.com,Mobile,No,8.7,old,2021,3,1611,845
8,9,Zachary Hicks,58,Female,2020-12-19,2021-05-14,37,1001.83,India,courtneyconner@carlson-cruz.org,Mobile,Yes,6.8,old,2020,12,1686,1540
9,10,Kendra Maddox DVM,67,Female,2020-08-17,2021-08-07,6,4283.82,India,daniel62@yahoo.com,Desktop,No,8.3,old,2020,8,1810,1455


In [17]:
# Name features, built with vectorised .str accessors instead of per-row
# lambdas. Missing names propagate as NaN and empty/blank names yield NaN for
# first/last name, where the lambda versions raised AttributeError/IndexError.
name_parts = df['Name'].str.split()  # split on runs of whitespace

df['first_name'] = name_parts.str[0]
# The last whitespace-separated token is used as the last name, so a trailing
# suffix (e.g. "Kendra Maddox DVM") ends up here instead of the surname.
df['last_name'] = name_parts.str[-1]
# Character count of the full name, spaces included.
df['name_length'] = df['Name'].str.len()

In [20]:
# Binary flag: 1 when the customer's total spend is strictly above 5000, else 0.
df['is_high_spender'] = df['Total Spent ($)'].gt(5000).astype(int)

In [21]:
# Display the frame with all engineered features (pandas truncates the view).
# NOTE(review): the saved output lists an `is_high_splender` column that no
# visible cell creates — presumably left over from an earlier run of a
# since-edited cell; a fresh Restart & Run All would not reproduce it.
df

Unnamed: 0,Customer ID,Name,Age,Gender,Signup Date,Last Purchase,Total Purchases,Total Spent ($),Country,Email,...,age_group,signup_year,signup_month,days_since_signup,days_since_last_purchase,first_name,last_name,name_length,is_high_splender,is_high_spender
0,1,Allison Hill,58,Male,2021-07-12,2022-02-13,2,3720.67,Germany,garzaanthony@robinson.org,...,old,2021,7,1481,1265,Allison,Hill,12,0,0
1,2,Cristian Santos,65,Male,2022-11-24,2024-09-05,44,3716.31,China,zlawrence@blake.biz,...,old,2022,11,981,330,Cristian,Santos,15,0,0
2,3,Christopher Bernard,20,Male,2022-01-04,2022-06-11,6,1132.26,China,wdavis@baker.com,...,young,2022,1,1305,1147,Christopher,Bernard,19,0,0
3,4,Jamie Arnold,53,Male,2024-02-09,2025-06-11,46,3266.93,China,janetwilliams@gmail.com,...,old,2024,2,539,51,Jamie,Arnold,12,0,0
4,5,Victoria Wyatt,46,Female,2024-04-07,2024-06-24,1,3806.10,UK,shawn52@yahoo.com,...,Mid,2024,4,481,403,Victoria,Wyatt,14,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,Beth Brooks,21,Female,2024-01-01,2024-06-18,20,1036.24,Brazil,vthompson@gmail.com,...,young,2024,1,578,409,Beth,Brooks,11,0,0
9996,9997,Brittany King,42,Female,2020-11-19,2021-04-16,31,3299.25,China,nbarber@thompson-johnson.biz,...,Mid,2020,11,1716,1568,Brittany,King,13,0,0
9997,9998,Craig Grant,55,Female,2022-11-18,2024-11-15,14,4293.94,UK,colonbobby@summers-mullins.com,...,old,2022,11,987,259,Craig,Grant,11,0,0
9998,9999,Christine Wilson,23,Female,2024-03-19,2025-05-26,50,1199.26,UAE,ramirezdonald@gmail.com,...,young,2024,3,500,67,Christine,Wilson,16,0,0
