In [1]:
import pandas as pd 
df = pd.read_csv('employee_promotion.csv')

In [2]:
df.head()

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion


In [3]:
df.shape

(5000, 9)

## Feature Encoding

In [6]:
from sklearn.preprocessing import LabelEncoder

# copying all the original dataset
df_encoded = df.copy()

# lable encoding for promotion(mostly target variable )
label_encoder = LabelEncoder()
df_encoded["Promotion_Enc"] = label_encoder.fit_transform(df_encoded["Promotion"])

df_encoded 


Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion,0
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion,1
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion,0
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion,1
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion,1
...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion,1
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion,0
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion,0
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion,1


In [10]:
# lable encoding with oreder for Eduction (for Feature having ordinal categories) 
education_encoded_dict = {'High School':0  ,"Bachelor's":1 , "Master's":2 , 'PhD': 3}
df_encoded["Education_Enc"] = df_encoded["Education"].map(education_encoded_dict)
df_encoded 

# here we did not make any object because map() method is inbuilt with pandas 


Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion,Promotion_Enc,Education_Enc
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion,0,0
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion,1,2
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion,0,3
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion,1,0
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion,1,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion,1,0
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion,0,1
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion,0,3
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion,1,1


In [11]:
# one-hot Encoding for City 

df_encoded = pd.get_dummies(df_encoded , columns = ["City"] , prefix= "City")
df_encoded 

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True


In [13]:
# Frequency Encoding for jobTitles 

job_title_counts = df_encoded["JobTitle"].value_counts().to_dict()
df_encoded["JobTitle_Freq_Enc"] = df_encoded["JobTitle"].map(job_title_counts)
df_encoded 

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle,JobTitle_Freq_Enc
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True,699
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False,705
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False,733
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False,699
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False,750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False,705
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False,705
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False,733
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True,705


In [17]:
# target Encoding for Education (Encoadig it based on the avg promotion rate )
education_target_mean = df_encoded.groupby("Education")["Promotion_Enc"].mean().to_dict()
df_encoded["Education_Target_Enc"] = df_encoded["Education"].map(education_target_mean)
df_encoded 

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,Promotion,Promotion_Enc,Education_Enc,City_Austin,City_Chicago,City_New York,City_San Francisco,City_Seattle,JobTitle_Freq_Enc,Education_Target_Enc
0,28,132612,9,2,1,Data Engineer,High School,Got Promotion,0,0,False,False,False,False,True,699,0.697692
1,41,116641,8,4,3,Data Scientist,Master's,No Promotion,1,2,False,True,False,False,False,705,0.718826
2,36,113811,11,1,6,Web developer,PhD,Got Promotion,0,3,False,True,False,False,False,733,0.702893
3,32,102160,2,2,6,Data Engineer,High School,No Promotion,1,0,True,False,False,False,False,699,0.697692
4,29,101313,7,4,7,Product Manager,Bachelor's,No Promotion,1,1,False,False,False,True,False,750,0.718593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,No Promotion,1,0,True,False,False,False,False,705,0.697692
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Got Promotion,0,1,False,True,False,False,False,705,0.718593
4997,34,81007,4,0,4,Web developer,PhD,Got Promotion,0,3,True,False,False,False,False,733,0.702893
4998,41,106947,12,2,10,Data Scientist,Bachelor's,No Promotion,1,1,False,False,False,False,True,705,0.718593


In [18]:
df

Unnamed: 0,Age,Salary,YearsExperience,NumCertifications,JobSatisfaction,JobTitle,Education,City,Promotion
0,28,132612,9,2,1,Data Engineer,High School,Seattle,Got Promotion
1,41,116641,8,4,3,Data Scientist,Master's,Chicago,No Promotion
2,36,113811,11,1,6,Web developer,PhD,Chicago,Got Promotion
3,32,102160,2,2,6,Data Engineer,High School,Austin,No Promotion
4,29,101313,7,4,7,Product Manager,Bachelor's,San Francisco,No Promotion
...,...,...,...,...,...,...,...,...,...
4995,38,127658,3,1,10,Data Scientist,High School,Austin,No Promotion
4996,39,84840,9,3,4,Data Scientist,Bachelor's,Chicago,Got Promotion
4997,34,81007,4,0,4,Web developer,PhD,Austin,Got Promotion
4998,41,106947,12,2,10,Data Scientist,Bachelor's,Seattle,No Promotion


## feature Scaling 

In [20]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Copying the dataset
df_scaled = df_encoded.copy()

# Standard Scaler
standard_scaler = StandardScaler()
df_scaled["Salary_StandardScaled"] = standard_scaler.fit_transform(df_scaled[["Salary"]])

# Min-Max Scaler
minmax_scaler = MinMaxScaler()
df_scaled["Salary_MinMaxScaled"] = minmax_scaler.fit_transform(df_scaled[["Salary"]])

# Displaying the first few rows
df_scaled[["Salary", "Salary_StandardScaled", "Salary_MinMaxScaled"]].head()

Unnamed: 0,Salary,Salary_StandardScaled,Salary_MinMaxScaled
0,132612,1.12006,0.826129
1,116641,0.572455,0.666363
2,113811,0.475421,0.638053
3,102160,0.075938,0.521503
4,101313,0.046897,0.51303


In [21]:
df_scaled[["Salary", "Salary_StandardScaled", "Salary_MinMaxScaled"]].describe()

Unnamed: 0,Salary,Salary_StandardScaled,Salary_MinMaxScaled
count,5000.0,5000.0,5000.0
mean,99945.2532,-2.192024e-16,0.499347
std,29168.083473,1.0001,0.291783
min,50028.0,-1.711537,0.0
25%,74622.5,-0.8682533,0.246031
50%,99449.0,-0.01701527,0.494383
75%,125889.5,0.8895628,0.758881
max,149993.0,1.716011,1.0


## Feature Extraction

In [22]:
import pandas as pd 
df = pd.read_csv('retail_customer.csv')
df.head()

Unnamed: 0,customer_id,name,age,gender,signup_date,last_purchase_date,total_purchases,total_spent,country,email,device_type,is_subscribed,feedback_score
0,5c7825f3-13bb-4843-9cf1-5b7a944def13,Andrew Miller,56,Male,2025-01-01,2024-06-12,18,6143.01,Bangladesh,qwilson@example.org,Mobile,1,2.7
1,ad1cf2e3-be49-4031-9770-c308b2d1292c,Kevin Ramos,69,Other,2025-05-16,2024-10-03,86,9498.14,Eritrea,shall@example.net,Mobile,1,1.6
2,33194def-ebb0-4d75-8942-9c83d7d579f2,John Smith,46,Other,2025-04-15,2025-04-27,34,4026.83,Russian Federation,leebrian@example.org,Mobile,0,3.7
3,6a1ce340-ec27-482e-a339-e89e2db11a01,Dustin Nolan,32,Male,2023-10-10,2024-10-20,8,9545.46,Serbia,sguzman@example.org,Mobile,1,1.9
4,3e529d86-76ed-4c85-83cf-c5ab446cb3c0,Amy Johnson,60,Female,2024-02-02,2025-03-03,40,1436.51,Mauritius,griffithsarah@example.org,Mobile,1,1.4


In [23]:
df.shape

(500, 13)

In [24]:
df.dtypes

customer_id            object
name                   object
age                     int64
gender                 object
signup_date            object
last_purchase_date     object
total_purchases         int64
total_spent           float64
country                object
email                  object
device_type            object
is_subscribed           int64
feedback_score        float64
dtype: object

In [26]:
# bining Age into categories 
bins = [10 , 30 , 50 , 70] #Age ranges 
labels = ["Young" , "Mid" , "Senior"] # Categories 
df["age_group"] = pd.cut(df["age"] , bins = bins , labels = labels)

# Displaying the first few rows 
df[["age" , "age_group"]].head(20)


Unnamed: 0,age,age_group
0,56,Senior
1,69,Senior
2,46,Mid
3,32,Mid
4,60,Senior
5,25,Young
6,38,Mid
7,56,Senior
8,36,Mid
9,40,Mid


In [28]:
# Convert columns to datetime format
df['signup_date'] = pd.to_datetime(df['signup_date'])
df['last_purchase_date'] = pd.to_datetime(df['last_purchase_date'])

# Time-based features
df['signup_year'] = df['signup_date'].dt.year
df['signup_month'] = df['signup_month'] = df['signup_date'].dt.month
df['days_since_signup'] = (pd.Timestamp.today() - df['signup_date']).dt.days
df['days_since_last_purchase'] = (pd.Timestamp.today() - df['last_purchase_date']).dt.days

In [29]:
# Name features
df['first_name'] = df['name'].apply(lambda x: x.split()[0])
df['last_name'] = df['name'].apply(lambda x: x.split()[-1])
df['name_length'] = df['name'].apply(len)

In [31]:
# High spendar flag 
df['is_high_spender'] = (df['total_spent'] > 5000).astype(int)

In [32]:
df

Unnamed: 0,customer_id,name,age,gender,signup_date,last_purchase_date,total_purchases,total_spent,country,email,...,age-group,age_group,signup_year,signup_month,days_since_signup,days_since_last_purchase,first_name,last_name,name_length,is_high_spender
0,5c7825f3-13bb-4843-9cf1-5b7a944def13,Andrew Miller,56,Male,2025-01-01,2024-06-12,18,6143.01,Bangladesh,qwilson@example.org,...,Senior,Senior,2025,1,222,425,Andrew,Miller,13,1
1,ad1cf2e3-be49-4031-9770-c308b2d1292c,Kevin Ramos,69,Other,2025-05-16,2024-10-03,86,9498.14,Eritrea,shall@example.net,...,Senior,Senior,2025,5,87,312,Kevin,Ramos,11,1
2,33194def-ebb0-4d75-8942-9c83d7d579f2,John Smith,46,Other,2025-04-15,2025-04-27,34,4026.83,Russian Federation,leebrian@example.org,...,Mid,Mid,2025,4,118,106,John,Smith,10,0
3,6a1ce340-ec27-482e-a339-e89e2db11a01,Dustin Nolan,32,Male,2023-10-10,2024-10-20,8,9545.46,Serbia,sguzman@example.org,...,Mid,Mid,2023,10,671,295,Dustin,Nolan,12,1
4,3e529d86-76ed-4c85-83cf-c5ab446cb3c0,Amy Johnson,60,Female,2024-02-02,2025-03-03,40,1436.51,Mauritius,griffithsarah@example.org,...,Senior,Senior,2024,2,556,161,Amy,Johnson,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,cae07587-71ec-4ad9-95d3-1733b04039f6,Natalie Clark,65,Male,2024-04-12,2023-09-30,40,3550.88,Christmas Island,coxelizabeth@example.com,...,Senior,Senior,2024,4,486,681,Natalie,Clark,13,0
496,91c6672b-6ace-4367-9e31-8bd1d8fea975,Jennifer Jones,42,Male,2024-07-18,2023-10-23,11,1676.95,Vanuatu,curtisjesse@example.org,...,Mid,Mid,2024,7,389,658,Jennifer,Jones,14,0
497,9521ecfa-af06-4865-9a4b-e70827ba2485,Jeremy Luna,57,Male,2025-04-20,2023-10-27,36,6089.04,Nauru,uwalker@example.org,...,Senior,Senior,2025,4,113,654,Jeremy,Luna,11,1
498,28da13d6-c253-4279-b3aa-e41599598e9f,Leah Williams,62,Female,2022-06-20,2024-08-20,59,3249.30,British Virgin Islands,perryjacob@example.com,...,Senior,Senior,2022,6,1148,356,Leah,Williams,13,0
