# Label Encoding
There are three ways to label-encode categorical columns (the first two use scikit-learn's built-in encoders, the last one is done manually):

1. Ordinal Encoder
2. Label Encoder
3. Manual Mapping

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [5]:
# Read the diabetes prediction CSV (path is relative to the working directory)
csv_path = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(csv_path)

In [6]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [7]:
# Preview the first 10 rows of the raw data
df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


# Ordinal Encoding

In [8]:
# List the distinct categories present in the gender column
df['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [9]:
# List the distinct categories present in the smoking_history column
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [10]:
# Convert the array of distinct gender labels into a plain Python list
df['gender'].unique().tolist()

['Female', 'Male', 'Other']

In [11]:
# Convert the array of distinct smoking_history labels into a plain Python list
df['smoking_history'].unique().tolist()

['never', 'No Info', 'current', 'former', 'ever', 'not current']

In [12]:
from sklearn.preprocessing import OrdinalEncoder

# Create the OrdinalEncoder with one explicit category list per column.
# Each list comes from Series.unique(), so the integer codes follow the order in
# which the categories first appear in the data (not a domain-defined ranking).
category_lists = [list(df[column].unique()) for column in ('gender', 'smoking_history')]
encoder = OrdinalEncoder(categories=category_lists)

In [13]:
 # Select only the categorical columns for encoding
categorical_columns = ['gender', 'smoking_history']
# Fit the encoder on both columns at once, then overwrite them with the resulting codes
encoded_values = encoder.fit_transform(df[categorical_columns])
df[categorical_columns] = encoded_values

In [14]:
# Preview the result: gender and smoking_history now hold float codes instead of strings
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0.0,80.0,0,1,0.0,25.19,6.6,140,0
1,0.0,54.0,0,0,1.0,27.32,6.6,80,0
2,1.0,28.0,0,0,0.0,27.32,5.7,158,0
3,0.0,36.0,0,0,2.0,23.45,5.0,155,0
4,1.0,76.0,1,1,2.0,20.14,4.8,155,0


# Label Encoder

In [15]:
# Reload the raw CSV so gender and smoking_history are strings again
# (the ordinal-encoding section overwrote them in place)
csv_path = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(csv_path)

In [16]:
# Confirm the reloaded frame contains the original string categories
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [17]:
from sklearn.preprocessing import LabelEncoder
# Create the LabelEncoder used by the cells below.
# This rebinds `encoder`, replacing the OrdinalEncoder from the previous section.
encoder = LabelEncoder()

In [18]:
# Fit on the gender column and get one integer code per row
gender = encoder.fit_transform(df['gender'])

# Show each learned label alongside the code it was assigned
gender_labels = encoder.classes_
print("Labels: ", gender_labels)
print("Encodings:", encoder.transform(gender_labels))

Labels:  ['Female' 'Male' 'Other']
Encodings: [0 1 2]


In [19]:
# Use a dedicated encoder for this column. Refitting the shared `encoder` here would
# discard the mapping it learned for gender, making it impossible to decode
# (inverse_transform) or re-apply the gender codes later.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target labels (y);
# OrdinalEncoder is the equivalent for feature columns.
smoking_encoder = LabelEncoder()
smoking_history = smoking_encoder.fit_transform(df['smoking_history'])

# Print the encoding values (the printed labels appear in alphabetical order)
print("Labels: ",smoking_encoder.classes_)
print("Encodings:",smoking_encoder.transform(smoking_encoder.classes_))

Labels:  ['No Info' 'current' 'ever' 'former' 'never' 'not current']
Encodings: [0 1 2 3 4 5]


In [20]:
# Replace the original string columns in the main df with their encoded labels
encoded_columns = {'gender': gender, 'smoking_history': smoking_history}
for column_name, codes in encoded_columns.items():
    df[column_name] = codes

In [23]:
# Check the last rows: gender and smoking_history now hold integer codes
df.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,0,80.0,0,0,0,27.32,6.2,90,0
99996,0,2.0,0,0,0,17.37,6.5,100,0
99997,1,66.0,0,0,3,27.83,5.7,155,0
99998,0,24.0,0,0,4,35.42,4.0,100,0
99999,0,57.0,0,0,1,22.43,6.6,90,0


# Manual Mapping

In [24]:
# Reload the raw CSV so gender and smoking_history are strings again
# (the label-encoding section overwrote them in place)
csv_path = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(csv_path)

In [25]:
# Confirm the reloaded frame contains the original string categories
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [26]:
# List the gender categories that the manual mapping has to cover
df['gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [27]:
# List the smoking_history categories that the manual mapping has to cover
df['smoking_history'].unique()

array(['never', 'No Info', 'current', 'former', 'ever', 'not current'],
      dtype=object)

In [28]:
# Manual mapping: each label's integer code is its position in the list below
gender_labels = ['Female', 'Male', 'Other']
smoking_history_labels = ['never', 'No Info', 'current', 'former', 'ever', 'not current']

gender_map = {label: code for code, label in enumerate(gender_labels)}
smoking_history_map = {label: code for code, label in enumerate(smoking_history_labels)}

In [29]:
# Replace each category string with its integer code from the manual maps.
# Series.map() silently produces NaN for any value missing from the dict, so verify
# that every category present has a code before overwriting the column. This also
# stops an accidental re-run of this cell from turning already-encoded columns into NaN.
column_maps = {'gender': gender_map, 'smoking_history': smoking_history_map}
for column_name, mapping in column_maps.items():
    # Missing values are left out of the check; map() passes them through as NaN.
    unmapped = set(df[column_name].dropna().unique()) - set(mapping)
    assert not unmapped, f"{column_name}: no code defined for {unmapped}"
    df[column_name] = df[column_name].map(mapping)

In [30]:
# Preview the result: gender and smoking_history now hold the manually mapped integer codes
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0,25.19,6.6,140,0
1,0,54.0,0,0,1,27.32,6.6,80,0
2,1,28.0,0,0,0,27.32,5.7,158,0
3,0,36.0,0,0,2,23.45,5.0,155,0
4,1,76.0,1,1,2,20.14,4.8,155,0


In [31]:
# Check the last 10 rows of the mapped frame as well
df.tail(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99990,1,39.0,0,0,1,27.32,6.1,100,0
99991,1,22.0,0,0,2,29.65,6.0,80,0
99992,0,26.0,0,0,0,34.34,6.5,160,0
99993,0,40.0,0,0,0,40.69,3.5,155,0
99994,0,36.0,0,0,1,24.6,4.8,145,0
99995,0,80.0,0,0,1,27.32,6.2,90,0
99996,0,2.0,0,0,1,17.37,6.5,100,0
99997,1,66.0,0,0,3,27.83,5.7,155,0
99998,0,24.0,0,0,0,35.42,4.0,100,0
99999,0,57.0,0,0,2,22.43,6.6,90,0


# END LABEL ENCODING