# import the necessary libraries

In [1]:
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression as lr
from sklearn.svm import SVR as svr
from sklearn.tree import DecisionTreeRegressor as dtr
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.neural_network import MLPRegressor as mlpr
from sklearn.neighbors import KNeighborsRegressor as knn

from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_absolute_error as MAE

# Demographic variables 2017

In [2]:
data_set_demo_2017 = pd.read_sas("C:/Users/user/Documents/veteran task/Demographic Variables_2017.XPT")

In [3]:
data_set_demo_2017.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,83732.0,9.0,2.0,1.0,62.0,,3.0,3.0,1.0,,...,5.0,1.0,3.0,134671.370419,135629.507405,1.0,125.0,10.0,10.0,4.39
1,83733.0,9.0,2.0,1.0,53.0,,3.0,3.0,1.0,,...,3.0,3.0,,24328.560239,25282.425927,1.0,125.0,4.0,4.0,1.32
2,83734.0,9.0,2.0,1.0,78.0,,3.0,3.0,2.0,,...,3.0,1.0,3.0,12400.008522,12575.838818,1.0,131.0,5.0,5.0,1.51
3,83735.0,9.0,2.0,2.0,56.0,,3.0,3.0,2.0,,...,5.0,6.0,,102717.995647,102078.634508,1.0,131.0,10.0,10.0,5.0
4,83736.0,9.0,2.0,2.0,42.0,,4.0,4.0,2.0,,...,4.0,3.0,,17627.674984,18234.736219,2.0,126.0,7.0,7.0,1.23


In [4]:
data_set_demo_2017.isnull().sum()

SEQN           0
SDDSRVYR       0
RIDSTATR       0
RIAGENDR       0
RIDAGEYR       0
RIDAGEMN    9276
RIDRETH1       0
RIDRETH3       0
RIDEXMON     427
RIDEXAGM    5911
DMQMILIZ    3822
DMQADFC     9444
DMDBORN4       0
DMDCITZN       2
DMDYRSUS    7735
DMDEDUC3    7324
DMDEDUC2    4252
DMDMARTL    4252
RIDEXPRG    8683
SIALANG        0
SIAPROXY       1
SIAINTRP       0
FIALANG      329
FIAPROXY     329
FIAINTRP     329
MIALANG     2994
MIAPROXY    2993
MIAINTRP    2993
AIALANGA    4009
DMDHHSIZ       0
DMDFMSIZ       0
DMDHHSZA       0
DMDHHSZB       0
DMDHHSZE       0
DMDHRGND       0
DMDHRAGE       0
DMDHRBR4     396
DMDHREDU     396
DMDHRMAR      62
DMDHSEDU    4745
WTINT2YR       0
WTMEC2YR       0
SDMVPSU        0
SDMVSTRA       0
INDHHIN2     345
INDFMIN2     329
INDFMPIR    1052
dtype: int64

# VARIABLE KEYS 


In [5]:
''''SEQN - Respondent sequence number 
RIAGENDR - Gender
RIDAGEYR - Age in years at screening
RIDRETH1 - Race/Hispanic origin
RIDRETH3 - Race/Hispanic origin w/ NH Asian
DMQMILIZ - Served active duty in US Armed Forces
DMDCITZN - Citizenship status
DMDEDUC2 - Education level - Adults 20+
DMDMARTL - Marital status
INDHHIN2 - Annual household income
INDFMIN2 - Annual family income'''

"'SEQN - Respondent sequence number \nRIAGENDR - Gender\nRIDAGEYR - Age in years at screening\nRIDRETH1 - Race/Hispanic origin\nRIDRETH3 - Race/Hispanic origin w/ NH Asian\nDMQMILIZ - Served active duty in US Armed Forces\nDMDCITZN - Citizenship status\nDMDEDUC2 - Education level - Adults 20+\nDMDMARTL - Marital status\nINDHHIN2 - Annual household income\nINDFMIN2 - Annual family income"

In [6]:
df1_demo_2017 = pd.DataFrame(data_set_demo_2017, columns=["SEQN", "SDDSRVYR", "RIDSTATR", "RIAGENDR", "RIDAGEYR", "RIDAGEMN", "RIDRETH1", "RIDRETH3", "RIDEXMON", "RIDEXAGM" , "DMQMILIZ", "DMQADFC", 
      "DMDBORN4", "DMDCITZN", "DMDYRSUS", "DMDEDUC3", "DMDEDUC2", "DMDMARTL", "RIDEXPRG", "SIALANG" , "SIAPROXY", "SIAINTRP", "FIALANG", "FIAPROXY",
      "FIAINTRP", "MIALANG", "MIAPROXY", "MIAINTRP", "AIALANGA", "DMDHHSIZ", "DMDFMSIZ", "DMDHHSZA", "DMDHHSZB", "DMDHHSZE", "DMDHRGND", "DMDHRAGE", "DMDHRBR4",
      "DMDHREDU", "DMDHRMAR", "DMDHSEDU", "WTINT2YR", "WTMEC2YR", "SDMVPSU", "SDMVSTRA", "INDHHIN2", "INDFMIN2", "INDFMPIR"])
df1_demo_2017.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,83732.0,9.0,2.0,1.0,62.0,,3.0,3.0,1.0,,...,5.0,1.0,3.0,134671.370419,135629.507405,1.0,125.0,10.0,10.0,4.39
1,83733.0,9.0,2.0,1.0,53.0,,3.0,3.0,1.0,,...,3.0,3.0,,24328.560239,25282.425927,1.0,125.0,4.0,4.0,1.32
2,83734.0,9.0,2.0,1.0,78.0,,3.0,3.0,2.0,,...,3.0,1.0,3.0,12400.008522,12575.838818,1.0,131.0,5.0,5.0,1.51
3,83735.0,9.0,2.0,2.0,56.0,,3.0,3.0,2.0,,...,5.0,6.0,,102717.995647,102078.634508,1.0,131.0,10.0,10.0,5.0
4,83736.0,9.0,2.0,2.0,42.0,,4.0,4.0,2.0,,...,4.0,3.0,,17627.674984,18234.736219,2.0,126.0,7.0,7.0,1.23


In [7]:
df1_demo_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9971 entries, 0 to 9970
Data columns (total 47 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      9971 non-null   float64
 1   SDDSRVYR  9971 non-null   float64
 2   RIDSTATR  9971 non-null   float64
 3   RIAGENDR  9971 non-null   float64
 4   RIDAGEYR  9971 non-null   float64
 5   RIDAGEMN  695 non-null    float64
 6   RIDRETH1  9971 non-null   float64
 7   RIDRETH3  9971 non-null   float64
 8   RIDEXMON  9544 non-null   float64
 9   RIDEXAGM  4060 non-null   float64
 10  DMQMILIZ  6149 non-null   float64
 11  DMQADFC   527 non-null    float64
 12  DMDBORN4  9971 non-null   float64
 13  DMDCITZN  9969 non-null   float64
 14  DMDYRSUS  2236 non-null   float64
 15  DMDEDUC3  2647 non-null   float64
 16  DMDEDUC2  5719 non-null   float64
 17  DMDMARTL  5719 non-null   float64
 18  RIDEXPRG  1288 non-null   float64
 19  SIALANG   9971 non-null   float64
 20  SIAPROXY  9970 non-null   floa

In [8]:
# keep only columns necessary for the Analysis
#keep_cols = df[["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "RIDRETH3", "DMQMILIZ", "DMDEDUC2", "DMDMARTL", "INDHHIN2", "INDFMIN2"]]


# drop all the other columns
df1_demo_2017 = df1_demo_2017.drop(columns=['SDDSRVYR', 'RIDSTATR', 'RIDAGEMN', 'RIDEXMON', 'RIDEXAGM', "DMQADFC", 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS',
                      'DMDEDUC3', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY', 'FIAINTRP', 'MIALANG',
                      'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND',
                      'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDFMPIR'])

In [9]:
df1_demo_2017.head()

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,RIDRETH3,DMQMILIZ,DMDEDUC2,DMDMARTL,INDHHIN2,INDFMIN2
0,83732.0,1.0,62.0,3.0,3.0,2.0,5.0,1.0,10.0,10.0
1,83733.0,1.0,53.0,3.0,3.0,2.0,3.0,3.0,4.0,4.0
2,83734.0,1.0,78.0,3.0,3.0,1.0,3.0,1.0,5.0,5.0
3,83735.0,2.0,56.0,3.0,3.0,2.0,5.0,6.0,10.0,10.0
4,83736.0,2.0,42.0,4.0,4.0,2.0,4.0,3.0,7.0,7.0


In [10]:
df1_demo_2017.isnull().sum()

SEQN           0
RIAGENDR       0
RIDAGEYR       0
RIDRETH1       0
RIDRETH3       0
DMQMILIZ    3822
DMDEDUC2    4252
DMDMARTL    4252
INDHHIN2     345
INDFMIN2     329
dtype: int64

# Demographic variables 2015

In [11]:
data_set_demo_2015 = pd.read_sas("C:/Users/user/Documents/veteran task/Demographic Variables_2015.XPT")
data_set_demo_2015.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,73557.0,8.0,2.0,1.0,69.0,,4.0,4.0,1.0,,...,3.0,4.0,,13281.237386,13481.042095,1.0,112.0,4.0,4.0,0.84
1,73558.0,8.0,2.0,1.0,54.0,,3.0,3.0,1.0,,...,3.0,1.0,1.0,23682.057386,24471.769625,1.0,108.0,7.0,7.0,1.78
2,73559.0,8.0,2.0,1.0,72.0,,3.0,3.0,2.0,,...,4.0,1.0,3.0,57214.803319,57193.285376,1.0,109.0,10.0,10.0,4.51
3,73560.0,8.0,2.0,1.0,9.0,,3.0,3.0,1.0,119.0,...,3.0,1.0,4.0,55201.178592,55766.512438,2.0,109.0,9.0,9.0,2.52
4,73561.0,8.0,2.0,2.0,73.0,,3.0,3.0,1.0,,...,5.0,1.0,5.0,63709.667069,65541.871229,2.0,116.0,15.0,15.0,5.0


In [12]:
print(data_set_demo_2015.columns)

Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMQADFC',
       'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL',
       'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY',
       'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ',
       'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND', 'DMDHRAGE',
       'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR',
       'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR'],
      dtype='object')


In [13]:
data_set_demo_2015 = pd.DataFrame(data_set_demo_2015, columns=['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMQADFC',
       'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL',
       'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY',
       'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ',
       'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND', 'DMDHRAGE',
       'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR',
       'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR'])

In [14]:
df1_demo_2015 = data_set_demo_2015.drop(columns=['SDDSRVYR', 'RIDSTATR', 'RIDAGEMN', 'RIDEXMON', 'RIDEXAGM', "DMQADFC", 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS',
                      'DMDEDUC3', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY', 'FIAINTRP', 'MIALANG',
                      'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND',
                      'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDFMPIR'])

In [15]:
df1_demo_2015.head()

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,RIDRETH3,DMQMILIZ,DMDEDUC2,DMDMARTL,INDHHIN2,INDFMIN2
0,73557.0,1.0,69.0,4.0,4.0,1.0,3.0,4.0,4.0,4.0
1,73558.0,1.0,54.0,3.0,3.0,2.0,3.0,1.0,7.0,7.0
2,73559.0,1.0,72.0,3.0,3.0,1.0,4.0,1.0,10.0,10.0
3,73560.0,1.0,9.0,3.0,3.0,,,,9.0,9.0
4,73561.0,2.0,73.0,3.0,3.0,2.0,5.0,1.0,15.0,15.0


# Demographic variables 2014

In [16]:
data_set_demo_2014 = pd.read_sas("C:/Users/user/Documents/veteran task/Demographic Variables_2014.XPT")
data_set_demo_2014.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGY,...,DMDFMSIZ,DMDHHSZA,DMDHHSZB,DMDHHSZE,DMDHRGND,DMDHRAGE,DMDHRBR4,DMDHREDU,DMDHRMAR,DMDHSEDU
0,62161.0,7.0,2.0,1.0,22.0,,3.0,3.0,2.0,,...,5.0,5.397605e-79,1.0,5.397605e-79,2.0,50.0,1.0,5.0,1.0,5.0
1,62162.0,7.0,2.0,2.0,3.0,,1.0,1.0,1.0,3.0,...,6.0,2.0,2.0,5.397605e-79,2.0,24.0,1.0,3.0,6.0,
2,62163.0,7.0,2.0,1.0,14.0,,5.0,6.0,2.0,14.0,...,5.0,5.397605e-79,2.0,1.0,1.0,42.0,1.0,5.0,1.0,4.0
3,62164.0,7.0,2.0,2.0,44.0,,3.0,3.0,1.0,,...,5.0,1.0,2.0,5.397605e-79,1.0,52.0,1.0,4.0,1.0,4.0
4,62165.0,7.0,2.0,2.0,14.0,,4.0,4.0,2.0,14.0,...,5.0,1.0,2.0,5.397605e-79,2.0,33.0,2.0,2.0,77.0,


In [17]:
print(data_set_demo_2014.columns)

Index(['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGY', 'RIDEXAGM', 'DMQMILIZ',
       'DMQADFC', 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2',
       'DMDMARTL', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG',
       'FIAPROXY', 'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA',
       'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2',
       'INDFMPIR', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE',
       'DMDHRGND', 'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU'],
      dtype='object')


In [18]:
data_set_demo_2014 = pd.DataFrame(data_set_demo_2014, columns=['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN',
       'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMQADFC',
       'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL',
       'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY',
       'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ',
       'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND', 'DMDHRAGE',
       'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR',
       'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR'])

In [19]:
df1_demo_2014 = data_set_demo_2014.drop(columns=['SDDSRVYR', 'RIDSTATR', 'RIDAGEMN', 'RIDEXMON', 'RIDEXAGM', "DMQADFC", 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS',
                      'DMDEDUC3', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY', 'FIAINTRP', 'MIALANG',
                      'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND',
                      'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDFMPIR'])

In [20]:
df1_demo_2014.head()

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,RIDRETH3,DMQMILIZ,DMDEDUC2,DMDMARTL,INDHHIN2,INDFMIN2
0,62161.0,1.0,22.0,3.0,3.0,2.0,3.0,5.0,14.0,14.0
1,62162.0,2.0,3.0,1.0,1.0,,,,4.0,4.0
2,62163.0,1.0,14.0,5.0,6.0,,,,15.0,15.0
3,62164.0,2.0,44.0,3.0,3.0,1.0,4.0,1.0,8.0,8.0
4,62165.0,2.0,14.0,4.0,4.0,,,,4.0,4.0


In [21]:
demography_dataset = pd.concat([df1_demo_2017, df1_demo_2015, df1_demo_2014])
demography_dataset.head()

Unnamed: 0,SEQN,RIAGENDR,RIDAGEYR,RIDRETH1,RIDRETH3,DMQMILIZ,DMDEDUC2,DMDMARTL,INDHHIN2,INDFMIN2
0,83732.0,1.0,62.0,3.0,3.0,2.0,5.0,1.0,10.0,10.0
1,83733.0,1.0,53.0,3.0,3.0,2.0,3.0,3.0,4.0,4.0
2,83734.0,1.0,78.0,3.0,3.0,1.0,3.0,1.0,5.0,5.0
3,83735.0,2.0,56.0,3.0,3.0,2.0,5.0,6.0,10.0,10.0
4,83736.0,2.0,42.0,4.0,4.0,2.0,4.0,3.0,7.0,7.0


In [22]:
demography_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29902 entries, 0 to 9755
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      29902 non-null  float64
 1   RIAGENDR  29902 non-null  float64
 2   RIDAGEYR  29902 non-null  float64
 3   RIDRETH1  29902 non-null  float64
 4   RIDRETH3  29902 non-null  float64
 5   DMQMILIZ  18417 non-null  float64
 6   DMDEDUC2  17048 non-null  float64
 7   DMDMARTL  17048 non-null  float64
 8   INDHHIN2  29343 non-null  float64
 9   INDFMIN2  29399 non-null  float64
dtypes: float64(10)
memory usage: 2.5 MB


In [23]:
demography_dataset.isnull().sum()

SEQN            0
RIAGENDR        0
RIDAGEYR        0
RIDRETH1        0
RIDRETH3        0
DMQMILIZ    11485
DMDEDUC2    12854
DMDMARTL    12854
INDHHIN2      559
INDFMIN2      503
dtype: int64

In [24]:
print(demography_dataset.columns)

Index(['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'RIDRETH3', 'DMQMILIZ',
       'DMDEDUC2', 'DMDMARTL', 'INDHHIN2', 'INDFMIN2'],
      dtype='object')


# Save the data to a CSV File

In [25]:
demography_dataset.to_csv('demography.csv', index=False)

In [26]:
import os

In [27]:
print(os.getcwd())

C:\Users\user
