In [1]:
import pandas as pd

In [2]:
# Load the prepared dataset that the feature-engineering cells below extend.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which presumably is a
# row index written out by an earlier save -- confirm whether it should be kept.
# NOTE(review): the last cell of this notebook writes back to this same path,
# so a re-run reads a file that already contains the engineered columns.
final_df = pd.read_csv('../Dataset/final_df.csv')
final_df.head()

Unnamed: 0,Respondent ID,Gender,Age,BMI,25-hydroxyvitamin D2 + D3 [nmol/L],Fasting Glucose [mmol/L],Insulin [pmol/L],Triglyceride [mmol/L],LDL-Cholesterol(NIH2) [mmol/L],Vitamin C [umol/L],...,Jaundice,Cancer,Cancer_Type,Anemia,Heart_Attack,Weak/Failing kidneys,Gestational diabetes,Diabetes,Hepatitis A antibody,Hepatitis B core antibody
0,93703.0,0.0,2.0,17.5,66.912952,6.212328,85.453019,1.219948,2.820133,54.172769,...,,,,2.0,,,,2.0,,
1,93704.0,1.0,2.0,15.7,74.4,6.212328,85.453019,1.219948,2.820133,54.172769,...,,,,1.0,,,,2.0,1.0,
2,93705.0,0.0,66.0,31.7,89.9,6.212328,85.453019,1.219948,2.820133,73.8,...,2.0,2.0,,2.0,2.0,2.0,2.0,2.0,1.0,1.0
3,93706.0,1.0,18.0,21.5,53.8,6.212328,85.453019,1.219948,2.820133,63.6,...,2.0,,,2.0,,,,2.0,2.0,2.0
4,93707.0,1.0,13.0,18.1,58.2,6.212328,85.453019,1.219948,2.820133,27.4,...,2.0,,,2.0,,,,2.0,2.0,2.0


# Feature Engineering

## A/G Ratio

In [3]:
# Feature engineering: derive new predictors from columns already present.
#
# Albumin/Globulin (A/G) ratio -- a measure of liver and kidney function.
#   A/G = albumin [g/dL] / globulin [g/dL]
# where globulin = total protein - albumin (both in g/dL). The globulin
# column is read directly from the dataset rather than re-derived here.

albumin = final_df['Albumin [g/dL]']
globulin = final_df['Globulin [g/dL]']

final_df['A/G Ratio'] = albumin.div(globulin)
final_df['A/G Ratio']

0       1.321159
1       1.321159
2       1.517241
3       1.629630
4       1.857143
          ...   
9249    1.846154
9250    1.142857
9251    1.500000
9252    1.205882
9253    1.482759
Name: A/G Ratio, Length: 9254, dtype: float64

In [4]:
# Summary statistics (count, mean, std, min/max, quartiles) as a quick sanity
# check on the newly created A/G ratio column.
final_df['A/G Ratio'].describe()

count    9254.000000
mean        1.340888
std         0.199075
min         0.403846
25%         1.275862
50%         1.321159
75%         1.407407
max         2.571429
Name: A/G Ratio, dtype: float64

## Cholesterol Ratios

### Non-HDL Ratio

In [5]:
# Cholesterol-derived features help estimate the risk of stroke and heart
# attack attributable to cholesterol.
#
# Non-HDL cholesterol = total cholesterol - HDL cholesterol. It may be a better
# risk predictor than total cholesterol or even LDL alone.
# Both inputs are converted from mmol/L to mg/dL before subtracting, so the
# result is a concentration in mg/dL. Despite the column name it is a
# difference, not a ratio; the name is kept so existing uses keep working.

MMOL_L_PER_MG_DL = 0.02586  # cholesterol: value in mmol/L / 0.02586 = value in mg/dL

total_chol_mg_dl = final_df['Total Cholesterol [mmol/L]'] / MMOL_L_PER_MG_DL
hdl_chol_mg_dl = final_df['Direct HDL-Cholesterol [mmol/L]'] / MMOL_L_PER_MG_DL

final_df['Non_HDL Ratio'] = total_chol_mg_dl - hdl_chol_mg_dl
final_df['Non_HDL Ratio']

0       126.497004
1       126.497004
2        97.061098
3       100.928074
4       121.036350
           ...    
9249     59.164733
9250    133.023975
9251    117.942769
9252    116.009281
9253    129.156999
Name: Non_HDL Ratio, Length: 9254, dtype: float64

In [6]:
# Summary statistics for the non-HDL cholesterol column created above.
final_df['Non_HDL Ratio'].describe()

count    9254.000000
mean      126.497004
std        34.621954
min        22.041763
25%       105.955143
50%       126.497004
75%       138.051044
max       408.352668
Name: Non_HDL Ratio, dtype: float64

### LDL-HDL ratio

In [7]:
# LDL/HDL ratio: LDL cholesterol divided by HDL cholesterol.
# One of the measures used to assess a person's risk of heart disease.
# Both columns are in mmol/L, so the ratio is unitless.

ldl_chol = final_df['LDL-Cholesterol(NIH2) [mmol/L]']
hdl_chol = final_df['Direct HDL-Cholesterol [mmol/L]']

final_df['LDL_HDL Ratio'] = ldl_chol.div(hdl_chol)
final_df['LDL_HDL Ratio']

0       2.042228
1       2.042228
2       1.819441
3       2.311585
4       1.602348
          ...   
9249    0.717419
9250    2.220577
9251    1.995000
9252    3.204697
9253    2.468182
Name: LDL_HDL Ratio, Length: 9254, dtype: float64

In [8]:
# Summary statistics for the LDL/HDL ratio; the min/max rows make extreme
# values easy to spot.
final_df['LDL_HDL Ratio'].describe()

count    9254.000000
mean        2.152326
std         0.673112
min         0.205682
25%         1.843224
50%         2.042228
75%         2.369860
max        21.517857
Name: LDL_HDL Ratio, dtype: float64

### Total cholesterol HDL ratio

In [9]:
# Total-cholesterol/HDL ratio: total cholesterol divided by HDL cholesterol.
# Higher ratios (>5) mean a higher risk of heart disease.
# Both columns are in mmol/L, so the ratio is unitless.
# The column name keeps its existing spelling ("Cholestrol") so that anything
# already reading this column by name continues to work.

total_chol = final_df['Total Cholesterol [mmol/L]']
hdl_chol = final_df['Direct HDL-Cholesterol [mmol/L]']

final_df['Total Cholestrol_HDL Ratio'] = total_chol.div(hdl_chol)
final_df['Total Cholestrol_HDL Ratio']

0       3.368882
1       3.368882
2       2.619355
3       3.139344
4       2.778409
          ...   
9249    1.987097
9250    3.708661
9251    3.178571
9252    4.409091
9253    4.795455
Name: Total Cholestrol_HDL Ratio, Length: 9254, dtype: float64

In [10]:
# Summary statistics for the total-cholesterol/HDL ratio created above.
final_df['Total Cholestrol_HDL Ratio'].describe()

count    9254.000000
mean        3.529763
std         1.107940
min         1.268868
25%         2.930739
50%         3.368882
75%         3.815534
max        27.846154
Name: Total Cholestrol_HDL Ratio, dtype: float64

### Triglyceride HDL ratio 

In [11]:
# Triglyceride/HDL ratio: triglyceride level divided by HDL cholesterol.
# A high ratio indicates an elevated risk of heart attack and stroke.
# NOTE(review): the commonly quoted ">6" cut-off appears to be defined for
# mg/dL inputs. This cell divides the mmol/L columns, and triglycerides and
# cholesterol use different mmol/L <-> mg/dL factors, so that cut-off
# presumably corresponds to roughly 2.6 on this scale -- confirm before
# applying any threshold to this column.

triglyceride = final_df['Triglyceride [mmol/L]']
hdl_chol = final_df['Direct HDL-Cholesterol [mmol/L]']

final_df['Triglyceride_HDL Ratio'] = triglyceride.div(hdl_chol)
final_df['Triglyceride_HDL Ratio']

0       0.883438
1       0.883438
2       0.787064
3       0.999958
4       0.693153
          ...   
9249    0.568387
9250    0.960589
9251    0.395000
9252    1.386305
9253    3.451136
Name: Triglyceride_HDL Ratio, Length: 9254, dtype: float64

In [12]:
# Summary statistics for the triglyceride/HDL ratio created above.
final_df['Triglyceride_HDL Ratio'].describe()

count    9254.000000
mean        0.959921
std         0.813919
min         0.055941
25%         0.752464
50%         0.883438
75%         1.016429
max        40.402667
Name: Triglyceride_HDL Ratio, dtype: float64

## Complete Blood Count Ratios

### Lymphocyte to Monocyte Ratio (LMR)

In [13]:
# Many recent studies look at the proportions of different leukocyte types in
# various medical conditions. The neutrophil-to-lymphocyte ratio (NLR) and the
# lymphocyte-to-monocyte ratio (LMR) appear to be the most valuable of these;
# both are prognostic factors in several cancers.
#
# LMR = absolute lymphocyte count / absolute monocyte count.
# The dataset stores percentages, so each is converted to an absolute count
# (WBC count * percentage / 100) and rounded to one decimal before dividing.

wbc_count = final_df['White blood cell count [1000 cells/uL]']

L = (wbc_count * final_df['Lymphocyte [%]'] / 100).round(1)
M = (wbc_count * final_df['Monocyte [%]'] / 100).round(1)

# Rounding can turn a very small monocyte count into exactly 0.0, and dividing
# by it would put +/-inf into the feature. Mask zero denominators so those rows
# become NaN (missing) instead; every other row is unchanged.
final_df['Lymphocyte_Monocyte Ratio'] = L / M.where(M != 0)
final_df['Lymphocyte_Monocyte Ratio']

0       4.166667
1       5.833333
2       5.666667
3       2.500000
4       6.000000
          ...   
9249    4.500000
9250    3.833333
9251    3.666667
9252    4.750000
9253    4.333333
Name: Lymphocyte_Monocyte Ratio, Length: 9254, dtype: float64

In [14]:
# Summary statistics for the LMR column; compare `count` with the number of
# rows to see whether any values are missing, and check min/max for outliers.
final_df['Lymphocyte_Monocyte Ratio'].describe()

count    9254.000000
mean        4.419185
std         2.315810
min         0.402985
25%         3.400000
50%         4.166667
75%         5.000000
max       128.142857
Name: Lymphocyte_Monocyte Ratio, dtype: float64

### Neutrophil Lymphocyte Ratio (NLR)

In [15]:
# Neutrophil-to-Lymphocyte Ratio (NLR)
# NLR = Absolute Neutrophil Count (ANC) / Absolute Lymphocyte Count (ALC)
# Each absolute count is WBC count * percentage / 100, rounded to one decimal.
# ANC and ALC stay defined at notebook level; later cells reuse ALC.

wbc_count = final_df['White blood cell count [1000 cells/uL]']

ANC = (wbc_count * final_df['Segmented neutrophils [%]'] / 100).round(1)
ALC = (wbc_count * final_df['Lymphocyte [%]'] / 100).round(1)

# A lymphocyte count that rounds to exactly 0.0 would yield +/-inf on division.
# Mask zero denominators so those rows become NaN instead; ALC itself is left
# untouched for the cells that use it afterwards.
final_df['Neutrophil_Lymphocyte Ratio'] = ANC / ALC.where(ALC != 0)
final_df['Neutrophil_Lymphocyte Ratio']

0       1.600000
1       0.914286
2       1.235294
3       2.466667
4       1.452381
          ...   
9249    1.611111
9250    2.173913
9251    0.909091
9252    1.736842
9253    2.038462
Name: Neutrophil_Lymphocyte Ratio, Length: 9254, dtype: float64

In [16]:
# Summary statistics for the NLR column created above.
final_df['Neutrophil_Lymphocyte Ratio'].describe()

count    9254.000000
mean        1.830781
std         1.046856
min         0.093333
25%         1.294118
50%         1.600000
75%         2.130435
max        21.500000
Name: Neutrophil_Lymphocyte Ratio, dtype: float64

### Platelet to Lymphocyte Ratio

In [17]:
# Platelet-to-Lymphocyte Ratio (PLR) also acts as a prognostic factor in some
# cancers and cardiac diseases.
# PLR = platelet count / absolute lymphocyte count.
# ALC is the rounded absolute lymphocyte count computed in the NLR cell above,
# so that cell must have been run first.

platelet_count = final_df['Platelet count [1000 cells/uL]']

# ALC can be exactly 0.0 after rounding; mask zero denominators so the result
# is NaN for those rows rather than +/-inf.
final_df['Platelet_Lymphocyte Ratio'] = platelet_count / ALC.where(ALC != 0)
final_df['Platelet_Lymphocyte Ratio']

0       103.700053
1        68.285714
2        90.882353
3       155.333333
4        82.857143
           ...    
9249     89.444444
9250     98.695652
9251     65.909091
9252    100.526316
9253     61.153846
Name: Platelet_Lymphocyte Ratio, Length: 9254, dtype: float64

In [18]:
# Summary statistics for the PLR column created above.
final_df['Platelet_Lymphocyte Ratio'].describe()

count    9254.000000
mean      115.053611
std        42.080563
min         0.668896
25%        92.560055
50%       103.700053
75%       130.470779
max       775.000000
Name: Platelet_Lymphocyte Ratio, dtype: float64

### Platelet-to-White Blood Cell Ratio

In [19]:
# Platelet-to-White-Blood-Cell Ratio (PWR): platelet count divided by white
# blood cell count.
# White blood cells and platelets play an important role in the pathogenesis
# of ischemic stroke, and a lower PWR is also a promising prognostic marker
# for HBV (Hepatitis B).
# Both columns are in 1000 cells/uL, so the ratio is unitless.

platelet_count = final_df['Platelet count [1000 cells/uL]']
white_cell_count = final_df['White blood cell count [1000 cells/uL]']

final_df['Platelet_WBC Ratio'] = platelet_count.div(white_cell_count)
final_df['Platelet_WBC Ratio']

0       35.113980
1       32.297297
2       35.930233
3       38.196721
4       31.071429
          ...    
9249    31.568627
9250    27.349398
9251    28.431373
9252    33.508772
9253    17.666667
Name: Platelet_WBC Ratio, Length: 9254, dtype: float64

In [20]:
# Summary statistics for the PWR column created above.
final_df['Platelet_WBC Ratio'].describe()

count    9254.000000
mean       37.140535
std        11.574571
min         0.600000
25%        30.438027
50%        35.113980
75%        41.536199
max       120.232558
Name: Platelet_WBC Ratio, dtype: float64

### Lymphocyte to C-Reactive Protein Ratio

In [21]:
# Lymphocyte-to-C-Reactive-Protein Ratio (LCR)
# LCR is a useful prognostic biomarker for some cancers such as breast,
# gastric and bladder cancers.
# Documented formula: lymphocyte count (number/uL) / CRP level (mg/dL).
# ALC is the rounded absolute lymphocyte count computed in the NLR cell above,
# so that cell must have been run first.
# NOTE(review): ALC is derived from a WBC column in 1000 cells/uL, not
# cells/uL, so these values are presumably 1000x smaller than LCR figures
# quoted with the formula above -- confirm before comparing with published
# cut-offs. The scale is left unchanged here.

crp_mg_dl = final_df['C-Reactive Protein [mg/L]'] / 10  # mg/L -> mg/dL

# A CRP of exactly 0 would make the division produce +/-inf; mask zero
# denominators so those rows become NaN instead.
final_df['Lymphocyte_C-Reactive Protein Ratio'] = ALC / crp_mg_dl.where(crp_mg_dl != 0)
final_df['Lymphocyte_C-Reactive Protein Ratio']

0         7.268033
1       120.689655
2        12.500000
3        20.270270
4       131.250000
           ...    
9249     40.000000
9250     12.500000
9251     27.160494
9252      7.436399
9253      7.027027
Name: Lymphocyte_C-Reactive Protein Ratio, Length: 9254, dtype: float64

In [22]:
# Summary statistics for the LCR column created above.
final_df['Lymphocyte_C-Reactive Protein Ratio'].describe()

count    9254.000000
mean       27.177974
std        44.434658
min         0.076578
25%         7.268033
50%         9.884525
75%        30.144541
max      1318.000000
Name: Lymphocyte_C-Reactive Protein Ratio, dtype: float64

### Systemic Immune‐inflammation Index (SII) 

In [23]:
# Systemic Immune-inflammation Index (SII)
# Correlated with patient survival in various types of solid tumors and cancers.
#   SII = platelet count * neutrophil count / lymphocyte count
# which equals platelet count * NLR, so the NLR column computed earlier in this
# notebook is reused (that cell must have been run first).
# NOTE(review): the column name appears to contain a non-ASCII hyphen (U+2010)
# in "Immune‐inflammation" -- copy the name rather than retyping it.

sii_column = 'Systemic Immune‐inflammation Index (SII)'

PLT = final_df['Platelet count [1000 cells/uL]']
N_R = final_df['Neutrophil_Lymphocyte Ratio']

final_df[sii_column] = PLT.mul(N_R)
final_df[sii_column]

0       414.800213
1       218.514286
2       381.705882
3       574.733333
4       505.428571
           ...    
9249    259.388889
9250    493.478261
9251    131.818182
9252    663.473684
9253    324.115385
Name: Systemic Immune‐inflammation Index (SII), Length: 9254, dtype: float64

In [24]:
# Summary statistics for the SII column created above.
final_df['Systemic Immune‐inflammation Index (SII)'].describe()

count    9254.000000
mean      467.318185
std       288.344117
min        10.733333
25%       312.000000
50%       414.800213
75%       536.144796
max      4887.750000
Name: Systemic Immune‐inflammation Index (SII), dtype: float64

In [25]:
# (rows, columns) of the frame after all engineered features have been added.
final_df.shape

(9254, 108)

In [26]:
# Now that the engineered features have been added, persist the enlarged dataset.
# index=False keeps the DataFrame's row index out of the CSV.
# NOTE(review): this writes to the same path the notebook read from at the top,
# so the original input file is replaced by the version with the new columns --
# confirm that overwriting in place is intended.
final_df.to_csv('../Dataset/final_df.csv', index=False)