# Feature Engineering

In [27]:
import pandas as pd

In [28]:
# Load the dataset from disk and preview its first five rows.
# NOTE(review): the preview header shows an 'Unnamed: 0' column, which looks like
# a row index written out by an earlier save - confirm whether it should be kept.
final_df = pd.read_csv(filepath_or_buffer='../Dataset/final_df.csv')
final_df.head(n=5)

Unnamed: 0,Respondent ID,Gender,Age,BMI,Chromium [nmol/L],Cobalt [nmol/L],Ferritin [ng/mL],Iron frozen [umol/L],UIBC [umol/L],Total Iron Binding Capacity [umol/L],...,Cancer,Cancer_Type,Anemia,Heart_Attack,Hepatitis C,Gestational diabetes,Diabetes,Weak/Failing kidneys,Hepatitis A antibody,Hepatitis B core antibody
0,93703.0,0.0,2.0,17.5,6.787297,3.653375,133.394928,15.448058,43.34484,58.796572,...,,,2.0,,,,2.0,,,
1,93704.0,1.0,2.0,15.7,6.787297,3.653375,36.6,15.448058,43.34484,58.796572,...,,,2.0,,,,2.0,,1.0,
2,93705.0,0.0,66.0,31.7,5.58,3.39,28.7,16.5,41.73,58.21,...,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0
3,93706.0,1.0,18.0,21.5,6.787297,3.653375,284.0,29.4,27.76,57.13,...,,,2.0,,2.0,,2.0,,2.0,2.0
4,93707.0,1.0,13.0,18.1,6.787297,3.653375,49.3,16.3,60.89,77.19,...,,,2.0,,2.0,,2.0,,2.0,2.0


# Feature Engineering

Feature engineering is the art of transforming raw data into meaningful and informative features that empower machine learning algorithms to perform better. It's an iterative process that draws from domain knowledge and statistical insights to extract the most relevant information from your dataset, ultimately enhancing model accuracy and predictive power.

## A/G Ratio

In [29]:
# New features can be derived from columns that already exist in the dataset.
#
# Albumin/Globulin (A/G) ratio - a measure of liver and kidney function:
#   Globulin [g/dL] = Total Protein [g/dL] - Albumin [g/dL]
#   A/G Ratio       = Albumin [g/dL] / Globulin [g/dL]
# The globulin column of the frame is used directly as the denominator.

albumin = final_df['Albumin [g/dL]']
globulin = final_df['Globulin [g/dL]']
final_df['A/G Ratio'] = albumin.div(globulin)
final_df['A/G Ratio']

0       1.321159
1       1.321159
2       1.517241
3       1.629630
4       1.857143
          ...   
8892    1.846154
8893    1.142857
8894    1.500000
8895    1.205882
8896    1.482759
Name: A/G Ratio, Length: 8897, dtype: float64

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of the new column.
final_df.loc[:, 'A/G Ratio'].describe()

count    8897.000000
mean        1.341679
std         0.202990
min         0.403846
25%         1.266667
50%         1.321159
75%         1.419355
max         2.571429
Name: A/G Ratio, dtype: float64

## Cholesterol Ratios

### Non-HDL Cholesterol (stored in the `Non_HDL Ratio` column)

In [31]:
# Cholesterol-derived features help estimate the risk of stroke and heart attack.
#
# Non-HDL cholesterol may be a better risk predictor than total cholesterol or LDL.
# It is a difference, not a quotient:
#   Non-HDL [mg/dL] = Total Cholesterol [mg/dL] - HDL Cholesterol [mg/dL]
# The result is still stored in the column named 'Non_HDL Ratio'.
# Both inputs are in mmol/L; each is converted to mg/dL by dividing by 0.02586
# (the factor the notebook attributes to the NHANES documentation).

MMOL_L_PER_MG_DL = 0.02586

total_chol_mg_dl = final_df['Total Cholesterol [mmol/L]'].div(MMOL_L_PER_MG_DL)
hdl_mg_dl = final_df['Direct HDL-Cholesterol [mmol/L]'].div(MMOL_L_PER_MG_DL)
final_df['Non_HDL Ratio'] = total_chol_mg_dl.sub(hdl_mg_dl)
final_df['Non_HDL Ratio']

0       126.497004
1       126.497004
2        97.061098
3       100.928074
4       121.036350
           ...    
8892     59.164733
8893    133.023975
8894    117.942769
8895    116.009281
8896    129.156999
Name: Non_HDL Ratio, Length: 8897, dtype: float64

In [32]:
# Summary statistics of the non-HDL cholesterol feature (values in mg/dL).
final_df.loc[:, 'Non_HDL Ratio'].describe()

count    8897.000000
mean      126.497004
std        35.309817
min        22.041763
25%       104.795050
50%       126.497004
75%       139.984532
max       408.352668
Name: Non_HDL Ratio, dtype: float64

### LDL-HDL ratio

In [33]:
# LDL/HDL ratio = LDL cholesterol / HDL cholesterol.
# Used as one of the indicators of a person's risk of heart disease.
# Both inputs are in mmol/L, so the quotient is unitless.

ldl = final_df['LDL-Cholesterol(NIH2) [mmol/L]']
hdl = final_df['Direct HDL-Cholesterol [mmol/L]']
final_df['LDL_HDL Ratio'] = ldl.div(hdl)
final_df['LDL_HDL Ratio']

0       2.042228
1       2.042228
2       1.819441
3       2.311585
4       1.602348
          ...   
8892    0.717419
8893    2.220577
8894    1.995000
8895    3.204697
8896    2.468182
Name: LDL_HDL Ratio, Length: 8897, dtype: float64

In [34]:
# Summary statistics of the LDL/HDL ratio.
final_df.loc[:, 'LDL_HDL Ratio'].describe()

count    8897.000000
mean        2.156743
std         0.686117
min         0.205682
25%         1.819441
50%         2.042228
75%         2.427607
max        21.517857
Name: LDL_HDL Ratio, dtype: float64

### Total cholesterol HDL ratio

In [35]:
# Total-cholesterol/HDL ratio = total cholesterol / HDL cholesterol.
# Ratios above 5 indicate a higher risk of heart disease.
# Both inputs are in mmol/L, so the quotient is unitless.

total_chol = final_df['Total Cholesterol [mmol/L]']
hdl = final_df['Direct HDL-Cholesterol [mmol/L]']
final_df['Total Cholestrol_HDL Ratio'] = total_chol.div(hdl)
final_df['Total Cholestrol_HDL Ratio']

0       3.368882
1       3.368882
2       2.619355
3       3.139344
4       2.778409
          ...   
8892    1.987097
8893    3.708661
8894    3.178571
8895    4.409091
8896    4.795455
Name: Total Cholestrol_HDL Ratio, Length: 8897, dtype: float64

In [36]:
# Summary statistics of the total-cholesterol/HDL ratio.
final_df.loc[:, 'Total Cholestrol_HDL Ratio'].describe()

count    8897.000000
mean        3.536219
std         1.129474
min         1.268868
25%         2.901639
50%         3.368882
75%         3.862500
max        27.846154
Name: Total Cholestrol_HDL Ratio, dtype: float64

### Triglyceride HDL ratio 

In [37]:
# Triglyceride/HDL ratio = triglyceride level / HDL cholesterol.
# A high ratio (>6) indicates an elevated risk of heart attack and stroke.
# NOTE(review): the ratio below is computed from mmol/L values. Triglycerides and
# cholesterol may use different mmol/L <-> mg/dL factors, so confirm which units
# the >6 cut-off refers to before applying it to this column.

triglycerides = final_df['Triglyceride [mmol/L]']
hdl = final_df['Direct HDL-Cholesterol [mmol/L]']
final_df['Triglyceride_HDL Ratio'] = triglycerides.div(hdl)
final_df['Triglyceride_HDL Ratio']

0       0.883438
1       0.883438
2       0.787064
3       0.999958
4       0.693153
          ...   
8892    0.568387
8893    0.960589
8894    0.395000
8895    1.386305
8896    3.451136
Name: Triglyceride_HDL Ratio, Length: 8897, dtype: float64

In [38]:
# Summary statistics of the triglyceride/HDL ratio.
final_df.loc[:, 'Triglyceride_HDL Ratio'].describe()

count    8897.000000
mean        0.962990
std         0.829942
min         0.055941
25%         0.742143
50%         0.883438
75%         1.025167
max        40.402667
Name: Triglyceride_HDL Ratio, dtype: float64

## Complete Blood Count Ratios

### Lymphocyte to Monocyte Ratio (LMR)

In [39]:
# Many recent studies look at the proportions of different leukocyte types in
# various medical conditions. The neutrophil-to-lymphocyte ratio (NLR) and the
# lymphocyte-to-monocyte ratio (LMR) appear to be the most valuable parameters;
# both are prognostic factors in several cancers.
#
# LMR = absolute lymphocyte count / absolute monocyte count.
# The differential is stored as percentages, so each absolute count is derived as
#   WBC count [1000 cells/uL] * percentage / 100, rounded to one decimal
# (the notebook attributes this conversion to the NHANES documentation).

L = ((final_df['White blood cell count [1000 cells/uL]']*final_df['Lymphocyte [%]'])/100).round(1)
M = ((final_df['White blood cell count [1000 cells/uL]']*final_df['Monocyte [%]'])/100).round(1)

# Rounding to one decimal can turn a very small monocyte count into exactly 0.0,
# and dividing by it would produce +/-inf. Zero denominators are masked to NaN so
# an undefined ratio is recorded as missing rather than as an infinite value.
final_df['Lymphocyte_Monocyte Ratio'] = L / M.where(M != 0)
final_df['Lymphocyte_Monocyte Ratio']

0       4.166667
1       5.833333
2       5.666667
3       2.500000
4       6.000000
          ...   
8892    4.500000
8893    3.833333
8894    3.666667
8895    4.750000
8896    4.333333
Name: Lymphocyte_Monocyte Ratio, Length: 8897, dtype: float64

In [40]:
# Summary statistics of the lymphocyte-to-monocyte ratio.
final_df.loc[:, 'Lymphocyte_Monocyte Ratio'].describe()

count    8897.000000
mean        4.429318
std         2.361257
min         0.402985
25%         3.333333
50%         4.166667
75%         5.000000
max       128.142857
Name: Lymphocyte_Monocyte Ratio, dtype: float64

### Neutrophil Lymphocyte Ratio (NLR)

In [41]:
# Neutrophil-to-Lymphocyte Ratio (NLR)
# NLR = Absolute Neutrophil Count (ANC) / Absolute Lymphocyte Count (ALC)
# Each absolute count is WBC count [1000 cells/uL] * percentage / 100, rounded to
# one decimal, so ANC and ALC are expressed in 1000 cells/uL.
# ALC is left unmodified here because later cells reuse it.

ANC = ((final_df['White blood cell count [1000 cells/uL]']*final_df['Segmented neutrophils [%]'])/100).round(1)
ALC = ((final_df['White blood cell count [1000 cells/uL]']*final_df['Lymphocyte [%]'])/100).round(1)

# A lymphocyte count that rounds to exactly 0.0 would make the division return
# +/-inf (or NaN for 0/0). Only the denominator used in this division is masked,
# so an undefined ratio is stored as NaN while ALC itself keeps its values.
final_df['Neutrophil_Lymphocyte Ratio'] = ANC / ALC.where(ALC != 0)
final_df['Neutrophil_Lymphocyte Ratio']

0       1.600000
1       0.914286
2       1.235294
3       2.466667
4       1.452381
          ...   
8892    1.611111
8893    2.173913
8894    0.909091
8895    1.736842
8896    2.038462
Name: Neutrophil_Lymphocyte Ratio, Length: 8897, dtype: float64

In [42]:
# Summary statistics of the neutrophil-to-lymphocyte ratio.
final_df.loc[:, 'Neutrophil_Lymphocyte Ratio'].describe()

count    8897.000000
mean        1.840041
std         1.066613
min         0.093333
25%         1.269231
50%         1.600000
75%         2.166667
max        21.500000
Name: Neutrophil_Lymphocyte Ratio, dtype: float64

### Platelet to Lymphocyte Ratio

In [43]:
# Platelet-to-Lymphocyte Ratio (PLR) also acts as a prognostic factor in some
# cancers and cardiac diseases.
# PLR = platelet count / lymphocyte count.
# ALC is the absolute lymphocyte count (1000 cells/uL, rounded to one decimal)
# computed in the NLR cell; the platelet column is also in 1000 cells/uL.

# A lymphocyte count that rounds to exactly 0.0 would make the division return
# +/-inf, so zero denominators are masked to NaN and the ratio is stored as
# missing in that case.
final_df['Platelet_Lymphocyte Ratio'] = final_df['Platelet count [1000 cells/uL]'] / ALC.where(ALC != 0)
final_df['Platelet_Lymphocyte Ratio']

0       103.700053
1        68.285714
2        90.882353
3       155.333333
4        82.857143
           ...    
8892     89.444444
8893     98.695652
8894     65.909091
8895    100.526316
8896     61.153846
Name: Platelet_Lymphocyte Ratio, Length: 8897, dtype: float64

In [44]:
# Summary statistics of the platelet-to-lymphocyte ratio.
final_df.loc[:, 'Platelet_Lymphocyte Ratio'].describe()

count    8897.000000
mean      115.509183
std        42.853881
min         0.668896
25%        91.600000
50%       103.700053
75%       132.000000
max       775.000000
Name: Platelet_Lymphocyte Ratio, dtype: float64

### Platelet-to-White Blood Cell Ratio

In [45]:
# Platelet-to-White-Blood-Cell Ratio (PWR) = platelet count / white blood cell count.
# White blood cells and platelets play an important role in the pathogenesis of
# ischemic stroke, and a lower PWR is also a promising prognostic marker for
# HBV (Hepatitis B).
# Both inputs are in 1000 cells/uL, so the quotient is unitless.

platelets = final_df['Platelet count [1000 cells/uL]']
white_cells = final_df['White blood cell count [1000 cells/uL]']
final_df['Platelet_WBC Ratio'] = platelets.div(white_cells)
final_df['Platelet_WBC Ratio']

0       35.113980
1       32.297297
2       35.930233
3       38.196721
4       31.071429
          ...    
8892    31.568627
8893    27.349398
8894    28.431373
8895    33.508772
8896    17.666667
Name: Platelet_WBC Ratio, Length: 8897, dtype: float64

In [46]:
# Summary statistics of the platelet-to-WBC ratio.
final_df.loc[:, 'Platelet_WBC Ratio'].describe()

count    8897.000000
mean       37.221852
std        11.797269
min         0.600000
25%        30.106383
50%        35.113980
75%        42.000000
max       120.232558
Name: Platelet_WBC Ratio, dtype: float64

### Lymphocyte to C-Reactive Protein Ratio

In [47]:
# Lymphocyte-to-C-Reactive-Protein Ratio (LCR)
# LCR is a useful prognostic biomarker for some cancers such as breast, gastric
# and bladder cancers.
# Formula: lymphocyte count / CRP level [mg/dL].
# CRP is stored in mg/L, so it is divided by 10 to obtain mg/dL.
# NOTE(review): ALC (from the NLR cell) is derived from the WBC column in
# 1000 cells/uL, whereas the formula as originally written here calls for a
# lymphocyte count in cells/uL. The stored values are therefore 1/1000 of that
# definition - confirm the intended scale before comparing with published cut-offs.

crp_mg_dl = final_df['C-Reactive Protein [mg/L]'].div(10)
final_df['Lymphocyte_C-Reactive Protein Ratio'] = ALC.div(crp_mg_dl)
final_df['Lymphocyte_C-Reactive Protein Ratio']

0         7.268033
1       120.689655
2        12.500000
3        20.270270
4       131.250000
           ...    
8892     40.000000
8893     12.500000
8894     27.160494
8895      7.436399
8896      7.027027
Name: Lymphocyte_C-Reactive Protein Ratio, Length: 8897, dtype: float64

In [48]:
# Summary statistics of the lymphocyte-to-CRP ratio.
final_df.loc[:, 'Lymphocyte_C-Reactive Protein Ratio'].describe()

count    8897.000000
mean       27.976878
std        45.134550
min         0.076578
25%         7.125891
50%        10.843373
75%        31.818182
max      1318.000000
Name: Lymphocyte_C-Reactive Protein Ratio, dtype: float64

### Systemic Immune‐inflammation Index (SII) 

In [49]:
# Systemic Immune-inflammation Index (SII)
# Correlated with patient survival in various types of solid tumors and cancers.
#   SII = platelet count * neutrophil count / lymphocyte count
#       = platelet count * NLR
# The neutrophil/lymphocyte quotient is read from the
# 'Neutrophil_Lymphocyte Ratio' column of the frame.

PLT = final_df['Platelet count [1000 cells/uL]']
N_R = final_df['Neutrophil_Lymphocyte Ratio']

final_df['Systemic Immune‐inflammation Index (SII)'] = PLT.mul(N_R)
final_df['Systemic Immune‐inflammation Index (SII)']

0       414.800213
1       218.514286
2       381.705882
3       574.733333
4       505.428571
           ...    
8892    259.388889
8893    493.478261
8894    131.818182
8895    663.473684
8896    324.115385
Name: Systemic Immune‐inflammation Index (SII), Length: 8897, dtype: float64

In [50]:
# Summary statistics of the systemic immune-inflammation index.
final_df.loc[:, 'Systemic Immune‐inflammation Index (SII)'].describe()

count    8897.000000
mean      469.425515
std       293.877080
min        10.733333
25%       306.526316
50%       414.800213
75%       546.960000
max      4887.750000
Name: Systemic Immune‐inflammation Index (SII), dtype: float64

In [51]:
# (rows, columns) of the frame after the engineered columns have been added.
final_df.shape

(8897, 108)

In [52]:
# Persist the frame, now including the engineered features, without writing the
# pandas index.
# NOTE(review): this writes to the same path the notebook reads from at the top,
# so the input file is overwritten in place - confirm that is intended.

final_df.to_csv(path_or_buf='../Dataset/final_df.csv', index=False)