# Feature extraction
## Modality 4: Laboratory tests, anthropometry metrics and vital signs
Each modality combines features from the different CSV files that make up the database.
- CN_MUESAN_DETBIO_ADU
    - VALOR_GLU_SUERO: glucose_value
    - VALOR_HB1AC: hba1c_value
    - VALOR_ALBUM: albumin_value
    - VALOR_COL_HDL: hdl_chol_value
    - VALOR_COL_LDL: ldl_chol_value
    - VALOR_COLEST: total_chol_value
    - VALOR_CREAT: creatinine_value
    - VALOR_INSULINA: insulin_value
    - VALOR_TRIG: trig_value

- CN_MUESAN_HEMOGLOBINA 
    - P2: hemoglobin_value

- CN_MUESAN_HEPA_ADU
    - VALOR_HEPA_B: hepatitis_B
    - VALOR_HEPA_C: hepatitis_C


- CN_ANTROPOMETRIA
    - PESO1_1: weight1
    - PESO1_2: weight2
    - TALLA4_1: height1
    - TALLA4_2: height2
    - CIRCUNFERENCIA8_1: waist1
    - CIRCUNFERENCIA8_2: waist2
    - PESO12_1: weight1_elderly
    - PESO12_2: weight2_elderly
    - TALLA15_1: height1_elderly
    - TALLA15_2: height2_elderly
    - CINTURA21_1: waist1_elderly
    - CINTURA21_2: waist2_elderly
    - P27_1_1: bp1_sistolic
    - P27_2_1: bp2_sistolic
    - P27_1_2: bp1_diastolic
    - P27_2_2: bp2_diastolic

- CS_ACT_FIS_ADO
    - P1: days_intense_phy_act
    - P4: days_moderate_phy_act
    - P7: days_walking
    - P10: daily_hours_sitted_down

## Setup

### Libraries

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

### Paths

In [2]:
# Directory with the source CSV files (ensanut) read by the extraction cells
input_path = '../../0_source_csv/ensanut/'
# Directory from which sample_dataset.csv is loaded
input_path_sample = '../output'

# Directory where m4_feature_extraction.csv is written
output_path = '../output/'

# Global configuration script, executed in the "Set local variables" cell
glob_conf_path = '../../config/global_config_paper.py'

### Set local variables

In [3]:
# Run the shared configuration script in this notebook's namespace so that the
# names it defines are available to the cells below. The file is opened in a
# context manager so the handle is closed right away, and decoded as UTF-8
# (the default encoding of Python source) instead of the platform default.
# NOTE(review): exec() runs arbitrary code; glob_conf_path must only ever
# point at a trusted file.
with open(glob_conf_path, encoding="utf-8") as _glob_conf_file:
    exec(_glob_conf_file.read())

## Extract features

In [4]:
# Load the blood tests table, keep the selected measurements under their
# unique feature names, and replace the survey keys by composite IDs.
blood_dataset = (
    pd.read_csv(os.path.join(input_path, "CN_MUESAN_DETBIO_ADU.csv"), skipinitialspace=True)
    .loc[:, ["UPM", "VIV_SEL", "HOGAR", "NUMREN",
             "VALOR_GLU_SUERO", "VALOR_HB1AC", "VALOR_ALBUM", "VALOR_COL_HDL",
             "VALOR_COL_LDL", "VALOR_COLEST", "VALOR_CREAT", "VALOR_INSULINA",
             "VALOR_TRIG"]]
    .rename(columns={
        "VALOR_GLU_SUERO": "glucose_value",
        "VALOR_HB1AC": "hba1c_value",
        "VALOR_ALBUM": "albumin_value",
        "VALOR_COL_HDL": "hdl_chol_value",
        "VALOR_COL_LDL": "ldl_chol_value",
        "VALOR_COLEST": "total_chol_value",
        "VALOR_CREAT": "creatinine_value",
        "VALOR_INSULINA": "insulin_value",
        "VALOR_TRIG": "trig_value",
    })
    .assign(
        # Household key: UPM_VIV_SEL_HOGAR
        household_ID=lambda df: df["UPM"].astype(str) + '_' + df["VIV_SEL"].astype(str) + '_' + df["HOGAR"].astype(str),
        # Person key: household key followed by NUMREN
        person_ID=lambda df: df["household_ID"] + '_' + df["NUMREN"].astype(str),
    )
    # The raw key columns are no longer needed once the IDs exist
    .drop(columns=["UPM", "VIV_SEL", "HOGAR", "NUMREN"])
)

blood_dataset

Unnamed: 0,glucose_value,hba1c_value,albumin_value,hdl_chol_value,ldl_chol_value,total_chol_value,creatinine_value,insulin_value,trig_value,household_ID,person_ID
0,75,4.6,3.8,60,83,167,.54,6.5,120,1_4_1,1_4_1_4
1,91,5.2,4.5,52,124,197,.73,5.1,103,1_5_1,1_5_1_2
2,94,5.0,4.5,41,89,149,1.01,8.1,97,2_2_1,2_2_1_2
3,75,5.3,4.0,42,98,178,.69,26.4,191,3_2_1,3_2_1_1
4,145,6.4,4.2,41,120,209,.89,7.7,240,3_4_1,3_4_1_1
...,...,...,...,...,...,...,...,...,...,...,...
13215,88,5.3,4.4,60,142,211,.72,3.7,45,3938_5_1,3938_5_1_2
13216,87,5.2,4.2,42,113,180,.84,19.7,127,3938_8_1,3938_8_1_1
13217,143,5.2,4.7,48,,185,1.07,43.7,281,3938_10_1,3938_10_1_1
13218,92,5.5,4.4,37,,163,1.27,32.8,360,3938_15_1,3938_15_1_1


In [5]:
# Load the hemoglobin table, keep the selected measurement under its unique
# feature name, and replace the survey keys by composite IDs.
hemoglobin_dataset = (
    pd.read_csv(os.path.join(input_path, "CN_MUESAN_HEMOGLOBINA.csv"), skipinitialspace=True)
    .loc[:, ["UPM", "VIV_SEL", "HOGAR", "NUMREN", "EDAD", "P2"]]
    .rename(columns={"EDAD": "age", "P2": "hemoglobin_value"})
    .assign(
        # Household key: UPM_VIV_SEL_HOGAR
        household_ID=lambda df: df["UPM"].astype(str) + '_' + df["VIV_SEL"].astype(str) + '_' + df["HOGAR"].astype(str),
        # Person key: household key followed by NUMREN
        person_ID=lambda df: df["household_ID"] + '_' + df["NUMREN"].astype(str),
    )
    # Keep only rows whose age is at least 20
    .loc[lambda df: df["age"] >= 20]
    # The raw key columns and the age are no longer needed
    .drop(columns=["UPM", "VIV_SEL", "HOGAR", "NUMREN", "age"])
)

hemoglobin_dataset

Unnamed: 0,hemoglobin_value,household_ID,person_ID
0,14.7,1_1_1,1_1_1_2
1,12.9,1_4_1,1_4_1_4
4,14.0,1_5_1,1_5_1_2
5,16.1,2_2_1,2_2_1_2
6,15.0,3_2_1,3_2_1_1
...,...,...,...
30328,16.0,3938_15_1,3938_15_1_1
30330,18.4,3938_16_1,3938_16_1_4
30331,11.2,3938_17_1,3938_17_1_2
30334,15.0,3938_19_1,3938_19_1_1


In [6]:
# Load the hepatitis table, keep the selected results under their unique
# feature names, and replace the survey keys by composite IDs.
hepatitis_dataset = (
    pd.read_csv(os.path.join(input_path, "CN_MUESAN_HEPA_ADU.csv"), skipinitialspace=True)
    .loc[:, ["UPM", "VIV_SEL", "HOGAR", "NUMREN", "EDAD", "VALOR_HEPA_B", "VALOR_HEPA_C"]]
    .rename(columns={
        "EDAD": "age",
        "VALOR_HEPA_B": "hepatitis_B",
        "VALOR_HEPA_C": "hepatitis_C",
    })
    .assign(
        # Household key: UPM_VIV_SEL_HOGAR
        household_ID=lambda df: df["UPM"].astype(str) + '_' + df["VIV_SEL"].astype(str) + '_' + df["HOGAR"].astype(str),
        # Person key: household key followed by NUMREN
        person_ID=lambda df: df["household_ID"] + '_' + df["NUMREN"].astype(str),
    )
    # Keep only rows whose age is at least 20
    .loc[lambda df: df["age"] >= 20]
    # The raw key columns and the age are no longer needed
    .drop(columns=["UPM", "VIV_SEL", "HOGAR", "NUMREN", "age"])
)

hepatitis_dataset

Unnamed: 0,hepatitis_B,hepatitis_C,household_ID,person_ID
0,,2,2_2_1,2_2_1_2
1,,2,3_2_1,3_2_1_1
2,,2,3_4_1,3_4_1_1
3,,2,4_4_1,4_4_1_2
4,,2,4_5_1,4_5_1_1
...,...,...,...,...
12384,,2,3938_5_1,3938_5_1_2
12385,,2,3938_8_1,3938_8_1_1
12386,,2,3938_10_1,3938_10_1_1
12387,,2,3938_15_1,3938_15_1_1


In [7]:
# Load the anthropometry table, keep the selected weight, height, waist and
# blood-pressure readings under their unique feature names, and replace the
# survey keys by composite IDs.
anthropometry_dataset = (
    pd.read_csv(os.path.join(input_path, "CN_ANTROPOMETRIA.csv"), skipinitialspace=True)
    .loc[:, ["UPM", "VIV_SEL", "HOGAR", "NUMREN", "EDAD",
             "PESO1_1", "PESO1_2", "TALLA4_1", "TALLA4_2",
             "CIRCUNFERENCIA8_1", "CIRCUNFERENCIA8_2",
             "PESO12_1", "PESO12_2", "TALLA15_1", "TALLA15_2",
             "CINTURA21_1", "CINTURA21_2",
             "P27_1_1", "P27_2_1", "P27_1_2", "P27_2_2"]]
    .rename(columns={
        "EDAD": "age",
        "PESO1_1": "weight1",
        "PESO1_2": "weight2",
        "TALLA4_1": "height1",
        "TALLA4_2": "height2",
        "CIRCUNFERENCIA8_1": "waist1",
        "CIRCUNFERENCIA8_2": "waist2",
        "PESO12_1": "weight1_elderly",
        "PESO12_2": "weight2_elderly",
        "TALLA15_1": "height1_elderly",
        "TALLA15_2": "height2_elderly",
        "CINTURA21_1": "waist1_elderly",
        "CINTURA21_2": "waist2_elderly",
        "P27_1_1": "bp1_sistolic",
        "P27_2_1": "bp2_sistolic",
        "P27_1_2": "bp1_diastolic",
        "P27_2_2": "bp2_diastolic",
    })
    .assign(
        # Household key: UPM_VIV_SEL_HOGAR
        household_ID=lambda df: df["UPM"].astype(str) + '_' + df["VIV_SEL"].astype(str) + '_' + df["HOGAR"].astype(str),
        # Person key: household key followed by NUMREN
        person_ID=lambda df: df["household_ID"] + '_' + df["NUMREN"].astype(str),
    )
    # Keep only rows whose age is at least 20
    .loc[lambda df: df["age"] >= 20]
    # The raw key columns and the age are no longer needed
    .drop(columns=["UPM", "VIV_SEL", "HOGAR", "NUMREN", "age"])
)

anthropometry_dataset

Unnamed: 0,weight1,weight2,height1,height2,waist1,waist2,weight1_elderly,weight2_elderly,height1_elderly,height2_elderly,waist1_elderly,waist2_elderly,bp1_sistolic,bp2_sistolic,bp1_diastolic,bp2_diastolic,household_ID,person_ID
0,,,,,,,60.35,60.30,147.2,147.2,104.8,104.8,115.0,121.0,69.0,72.0,1_1_1,1_1_1_2
1,74.05,74.05,148.0,148.0,101.5,101.5,,,,,,,86.0,76.0,48.0,47.0,1_4_1,1_4_1_4
4,63.65,63.60,152.1,152.0,93.7,93.7,,,,,,,101.0,102.0,74.0,75.0,1_5_1,1_5_1_2
5,68.30,68.30,179.4,179.5,83.2,83.3,,,,,,,117.0,123.0,73.0,74.0,2_2_1,2_2_1_2
6,,,,,,,90.60,90.55,159.9,159.8,123.8,123.9,115.0,114.0,69.0,71.0,3_2_1,3_2_1_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33807,78.10,78.05,179.0,179.0,88.2,88.2,,,,,,,222.0,,222.0,,3938_16_1,3938_16_1_4
33808,69.90,69.90,156.7,156.6,96.5,95.7,,,,,,,131.0,119.0,85.0,80.0,3938_17_1,3938_17_1_2
33811,,,,,,,86.70,86.80,157.4,157.4,104.0,104.2,123.0,209.0,97.0,135.0,3938_18_1,3938_18_1_1
33812,62.85,62.85,174.0,173.9,77.0,77.2,,,,,,,138.0,118.0,66.0,54.0,3938_19_1,3938_19_1_1


Before extracting the physical activity dataset, the format of its features needs to be inspected, because the descriptions in the catalogue are not clear.

In [8]:
# Load the complete physical-activity table and list its columns with their
# non-null counts and dtypes
phy_act_csv = os.path.join(input_path, "CS_ACT_FIS_ADO.csv")
phy_act_dataset = pd.read_csv(phy_act_csv, skipinitialspace=True)
phy_act_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47659 entries, 0 to 47658
Data columns (total 27 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   UPM         47659 non-null  int64  
 1   VIV_SEL     47659 non-null  int64  
 2   HOGAR       47659 non-null  int64  
 3   NUMREN      47659 non-null  int64  
 4   P1          47659 non-null  int64  
 5   P2          19691 non-null  object 
 6   P3_1H       22 non-null     float64
 7   P3_1M       22 non-null     float64
 8   P4          47624 non-null  float64
 9   P5          34643 non-null  object 
 10  P6_1H       72 non-null     float64
 11  P6_1M       72 non-null     float64
 12  P7          47624 non-null  float64
 13  P8          40943 non-null  object 
 14  P9_1H       67 non-null     float64
 15  P9_1M       67 non-null     float64
 16  P10         47624 non-null  object 
 17  P11         206 non-null    object 
 18  EDAD        47659 non-null  int64  
 19  SEXO        47659 non-nul

In [9]:
# Display the unprocessed physical-activity table to inspect the value formats
phy_act_dataset

Unnamed: 0,UPM,VIV_SEL,HOGAR,NUMREN,P1,P2,P3_1H,P3_1M,P4,P5,...,P11,EDAD,SEXO,ENT,DOMINIO,REGION,EST_DIS,UPM_DIS,ESTRATO,F_ACFISADO
0,42,1,1,1,4,00:30,,,1.0,04:00,...,,37,1,1,1,2,6,42,3,548
1,42,4,1,2,0,,,,7.0,04:00,...,,37,1,1,1,2,6,42,3,548
2,42,5,1,1,0,,,,7.0,04:00,...,,64,2,1,1,2,6,42,3,274
3,43,3,1,1,2,01:00,,,0.0,,...,,36,1,1,1,2,7,43,4,590
4,43,4,1,2,1,01:00,,,3.0,01:00,...,,43,2,1,1,2,7,43,4,590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47654,6069,13,1,1,0,,,,7.0,00:25,...,,66,1,29,1,4,286,6069,2,441
47655,6069,14,1,1,0,,,,7.0,01:00,...,,60,2,29,1,4,286,6069,2,662
47656,6069,15,1,2,0,,,,0.0,,...,,24,1,29,1,4,286,6069,2,662
47657,6069,16,1,2,0,,,,7.0,08:00,...,,38,2,29,1,4,286,6069,2,441


In [10]:
# From the already loaded physical-activity table, keep the selected answers
# under their unique feature names and replace the survey keys by composite IDs.
phy_act_dataset = (
    phy_act_dataset
    .loc[:, ["UPM", "VIV_SEL", "HOGAR", "NUMREN", "EDAD", "P1", "P4", "P7", "P10"]]
    .rename(columns={
        "EDAD": "age",
        "P1": "days_intense_phy_act",
        "P4": "days_moderate_phy_act",
        "P7": "days_walking",
        "P10": "daily_hours_sitted_down",
    })
    .assign(
        # Household key: UPM_VIV_SEL_HOGAR
        household_ID=lambda df: df["UPM"].astype(str) + '_' + df["VIV_SEL"].astype(str) + '_' + df["HOGAR"].astype(str),
        # Person key: household key followed by NUMREN
        person_ID=lambda df: df["household_ID"] + '_' + df["NUMREN"].astype(str),
    )
    # Keep only rows whose age is at least 20
    .loc[lambda df: df["age"] >= 20]
    # The raw key columns and the age are no longer needed
    .drop(columns=["UPM", "VIV_SEL", "HOGAR", "NUMREN", "age"])
)

phy_act_dataset

Unnamed: 0,days_intense_phy_act,days_moderate_phy_act,days_walking,daily_hours_sitted_down,household_ID,person_ID
0,4,1.0,7.0,01:30,42_1_1,42_1_1_1
1,0,7.0,0.0,00:30,42_4_1,42_4_1_2
2,0,7.0,3.0,00:30,42_5_1,42_5_1_1
3,2,0.0,2.0,08:00,43_3_1,43_3_1_1
4,1,3.0,6.0,06:00,43_4_1,43_4_1_2
...,...,...,...,...,...,...
47654,0,7.0,1.0,03:00,6069_13_1,6069_13_1_1
47655,0,7.0,7.0,01:30,6069_14_1,6069_14_1_1
47656,0,0.0,1.0,02:00,6069_15_1,6069_15_1_2
47657,0,7.0,5.0,01:00,6069_16_1,6069_16_1_2


## Merge based on key

In [11]:
# Load the sample dataset that the extracted features will be merged into
sample_dataset_csv = os.path.join(input_path_sample, "sample_dataset.csv")
sample_dataset = pd.read_csv(sample_dataset_csv)

Since the blood tests dataset is the most relevant in terms of potential feature importance (contains laboratory tests for glucose, cholesterol, triglycerides, etc.), the concatenation will start from there. The concatenation order will be in terms of potential feature importance:
- blood tests
- anthropometry
- physical activity
- hemoglobin
- hepatitis

First merge the adult information into a single table, then merge that table into the sample dataset by household.

In [12]:
# Left-join every other adult table onto the blood tests by person_ID, in the
# order anthropometry, physical activity, hemoglobin, hepatitis. A column that
# the right table shares with the left one (household_ID) is kept from the
# left and its right-hand copy is tagged with the '_remove' suffix.
# NOTE(review): a left join adds rows whenever person_ID is repeated in a
# right table - confirm the keys are unique if the row count matters.
adult_biometrics_dataset = blood_dataset
for right_table in (anthropometry_dataset, phy_act_dataset, hemoglobin_dataset, hepatitis_dataset):
    adult_biometrics_dataset = adult_biometrics_dataset.merge(
        right_table, on="person_ID", how="left", suffixes=('', '_remove'))

adult_biometrics_dataset.shape

(13220, 38)

In [13]:
# Discard the repeated columns that the merges tagged with 'remove'
tagged_columns = [name for name in adult_biometrics_dataset.columns if 'remove' in name]
adult_biometrics_dataset = adult_biometrics_dataset.drop(columns=tagged_columns)
adult_biometrics_dataset.columns

Index(['glucose_value', 'hba1c_value', 'albumin_value', 'hdl_chol_value',
       'ldl_chol_value', 'total_chol_value', 'creatinine_value',
       'insulin_value', 'trig_value', 'household_ID', 'person_ID', 'weight1',
       'weight2', 'height1', 'height2', 'waist1', 'waist2', 'weight1_elderly',
       'weight2_elderly', 'height1_elderly', 'height2_elderly',
       'waist1_elderly', 'waist2_elderly', 'bp1_sistolic', 'bp2_sistolic',
       'bp1_diastolic', 'bp2_diastolic', 'days_intense_phy_act',
       'days_moderate_phy_act', 'days_walking', 'daily_hours_sitted_down',
       'hemoglobin_value', 'hepatitis_B', 'hepatitis_C'],
      dtype='object')

In [14]:
# Report how many rows are exact duplicates, then show the table dimensions
n_duplicated_rows = adult_biometrics_dataset.duplicated().sum()
print("Duplicated rows: ", n_duplicated_rows)
adult_biometrics_dataset.shape

Duplicated rows:  0


(13220, 34)

In [15]:
# Attach the adult biometrics to every sample row of the same household.
# When both tables have a column of the same name, the sample's column keeps
# its name and the adult one is tagged with the '_remove' suffix.
m4_dataset = pd.merge(
    sample_dataset,
    adult_biometrics_dataset,
    on="household_ID",
    how="left",
    suffixes=('', '_remove'),
)

# Discard the tagged duplicates
m4_dataset = m4_dataset.drop(columns=[name for name in m4_dataset.columns if 'remove' in name])
m4_dataset.columns

Index(['house_ID', 'household_ID', 'person_ID', 'region', 'strata',
       'locality_type', 'locality_size', 'age_months', 'age_years', 'sex',
       'BMI_SD', 'label_cat', 'label', 'glucose_value', 'hba1c_value',
       'albumin_value', 'hdl_chol_value', 'ldl_chol_value', 'total_chol_value',
       'creatinine_value', 'insulin_value', 'trig_value', 'weight1', 'weight2',
       'height1', 'height2', 'waist1', 'waist2', 'weight1_elderly',
       'weight2_elderly', 'height1_elderly', 'height2_elderly',
       'waist1_elderly', 'waist2_elderly', 'bp1_sistolic', 'bp2_sistolic',
       'bp1_diastolic', 'bp2_diastolic', 'days_intense_phy_act',
       'days_moderate_phy_act', 'days_walking', 'daily_hours_sitted_down',
       'hemoglobin_value', 'hepatitis_B', 'hepatitis_C'],
      dtype='object')

In [16]:
# Show the (rows, columns) dimensions of the merged modality-4 table
m4_dataset.shape

(10301, 45)

In [17]:
# Display the merged modality-4 table
m4_dataset

Unnamed: 0,house_ID,household_ID,person_ID,region,strata,locality_type,locality_size,age_months,age_years,sex,...,bp2_sistolic,bp1_diastolic,bp2_diastolic,days_intense_phy_act,days_moderate_phy_act,days_walking,daily_hours_sitted_down,hemoglobin_value,hepatitis_B,hepatitis_C
0,1_4,1_4_1,1_4_1_8,Centre,3rd_strata,urban,">100,000",122,10,female,...,76.0,48.0,47.0,0.0,0.0,3.0,04:00,12.9,,
1,3_2,3_2_1,3_2_1_5,Centre,2nd_strata,urban,">100,000",149,12,female,...,114.0,69.0,71.0,0.0,0.0,7.0,03:00,15.0,,2.0
2,4_1,4_1_1,4_1_1_4,Centre,2nd_strata,urban,">100,000",162,13,male,...,,,,,,,,,,
3,4_3,4_3_1,4_3_1_5,Centre,2nd_strata,urban,">100,000",197,16,male,...,,,,,,,,,,
4,4_5,4_5_1,4_5_1_5,Centre,2nd_strata,urban,">100,000",169,14,male,...,105.0,97.0,95.0,0.0,6.0,7.0,03:00,14.2,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10296,3938_17,3938_17_1,3938_17_1_3,Centre,2nd_strata,rural,"<2,500",130,10,male,...,,,,,,,,,,
10297,3938_17,3938_17_1,3938_17_1_4,Centre,2nd_strata,rural,"<2,500",119,9,female,...,,,,,,,,,,
10298,3938_19,3938_19_1,3938_19_1_3,Centre,2nd_strata,rural,"<2,500",170,14,male,...,118.0,66.0,54.0,1.0,7.0,7.0,00:30,15.0,,2.0
10299,3938_19,3938_19_1,3938_19_1_5,Centre,2nd_strata,rural,"<2,500",89,7,female,...,118.0,66.0,54.0,1.0,7.0,7.0,00:30,15.0,,2.0


## Export dataset

In [18]:
# Export the merged dataset as a CSV file with a header row and without the
# row index. `index` is documented as a bool, so pass False explicitly
# instead of relying on None being falsy.
m4_dataset.to_csv(os.path.join(output_path, 'm4_feature_extraction.csv'), index=False, header=True)