**Data Preprocessing**

1. Standardization
2. Label Encoding

**1. Standardization**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
# 1. Load the Data
df = pd.read_csv('/kaggle/input/data-preprocessing/customer_data.csv')
df.head()

Unnamed: 0,Customer_ID,Country,Join_Date,Annual_Income,Age,Spending_Score
0,1001,UK,2023-03-05,82450,64.0,0.5
1,1002,uk,"Jan 15, 2023",72926,61.0,0.03
2,1003,U.S.A.,20-02-2023,84715,,0.91
3,1004,United Kingdom,2023.02.20,97845,54.0,0.26
4,1005,india,2023-03-05,71487,24.0,0.66


In [4]:
# --- Categorical Standardization ---
# Map messy country names to a clean standard format
country_map = {
    'usa': 'USA', 'United States': 'USA', 'U.S.A.': 'USA',
    'uk': 'UK', 'United Kingdom': 'UK',
    'india': 'India', 'IN': 'India', 'india': 'India'
}
df['Country'] = df['Country'].replace(country_map)
# Fill any missing country values with the most common one (Mode)
df['Country'] = df['Country'].fillna(df['Country'].mode()[0])

In [5]:
df['Annual_Income'].std()

14400.478149956285

In [6]:
df['Age'].std()

15.532459880570125

In [7]:
# --- Date Standardization ---
# Convert mixed formats to standard datetime objects
# 'errors=coerce' turns unreadable dates into NaT (Not a Time) instead of crashing
df['Join_Date'] = pd.to_datetime(df['Join_Date'], errors='coerce')

In [8]:
# --- Statistical Standardization (Scaling) ---
# Step A: Impute missing numerical values (Scaler cannot handle NaNs)
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [9]:
# Step B: Apply StandardScaler
scaler = StandardScaler()
df[['Annual_Income_Scaled', 'Age_Scaled']] = scaler.fit_transform(df[['Annual_Income', 'Age']])

In [10]:
# View the clean data
print(df.head())

   Customer_ID Country  Join_Date  Annual_Income        Age  Spending_Score  \
0         1001      UK 2023-03-05          82450  64.000000            0.50   
1         1002      UK        NaT          72926  61.000000            0.03   
2         1003     USA        NaT          84715  40.578947            0.91   
3         1004      UK        NaT          97845  54.000000            0.26   
4         1005   India 2023-03-05          71487  24.000000            0.66   

   Annual_Income_Scaled  Age_Scaled  
0              0.713886    1.589443  
1              0.035338    1.385851  
2              0.875259    0.000000  
3              1.810720    0.910804  
4             -0.067185   -1.125111  


In [11]:
df['Annual_Income_Scaled'].std()

1.025978352085154

In [12]:
df['Age_Scaled'].std()

1.0259783520851542

**Label Encoding**

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [14]:
# 1. Load the Data
data = pd.read_csv('/kaggle/input/data-preprocessing/label_encoding_dataset.csv')
data.head()

Unnamed: 0,EmployeeID,Department,Education_Level,Performance_Rating,Satisfaction_Score
0,101,Sales,High School,Low,Unhappy
1,102,IT,Bachelor's,Average,Neutral
2,103,HR,Master's,High,Happy
3,104,Marketing,PhD,Exceptional,Very Happy
4,105,IT,Bachelor's,High,Happy


In [15]:
# 2. Initialize the encoder
le = LabelEncoder()

# 3. Apply Label Encoding to the categorical columns
columns_to_encode = ['Education_Level', 'Performance_Rating', 'Satisfaction_Score']

In [16]:
for col in columns_to_encode:
    # This replaces the text values with numbers in the same column
    data[col] = le.fit_transform(data[col])

In [17]:
print(data)

   EmployeeID Department  Education_Level  Performance_Rating  \
0         101      Sales                1                   3   
1         102         IT                0                   0   
2         103         HR                2                   2   
3         104  Marketing                3                   1   
4         105         IT                0                   2   
5         106      Sales                1                   3   
6         107         HR                2                   0   
7         108         IT                3                   1   
8         109  Marketing                0                   3   
9         110      Sales                2                   2   

   Satisfaction_Score  
0                   2  
1                   1  
2                   0  
3                   3  
4                   0  
5                   2  
6                   1  
7                   3  
8                   2  
9                   0  
