## Import and conversion

In [10]:
import xport
import pandas as pd
import csv

In [11]:
# Reading the BRFSS 2018 data: https://www.cdc.gov/brfss/annual_data/annual_2018.html
# Converting it from a "*.xpt" (XPORT) file to a dataframe
# With the help of documentation available at: https://pypi.org/project/xport/
# NOTE(review): the file name below ends with a space. This looks deliberate
# (presumably it matches the name of the extracted CDC file) - confirm before changing it.
with open('LLCP2018.XPT ', 'rb') as file:
    df = xport.to_dataframe(file)

# Previewing only the first rows, once the file has been closed, instead of
# printing the whole dataframe as plain text
df.head()

KeyboardInterrupt: 

In [13]:
# Selecting variables (columns) from the original dataframe.
# The labels are passed to .loc as a list (the documented form for selecting several
# columns), and .copy() makes the selection an independent dataframe, so later
# modifications never act on a slice of df (avoids SettingWithCopyWarning).
selectedColsDF = df.loc[:, [
    '_RFBMI5',
    '_EDUCAG',
    '_INCOMG',
    'EXERANY2',
    '_AGEG5YR',
    'SEX1',
    'DRNKANY5',
    '_DRDXAR1',
]].copy()
selectedColsDF

Unnamed: 0,_RFBMI5,_EDUCAG,_INCOMG,EXERANY2,_AGEG5YR,SEX1,DRNKANY5,_DRDXAR1
0,1.0,4.0,4.0,2.0,13.0,2.0,2.0,1.0
1,2.0,4.0,2.0,1.0,3.0,2.0,1.0,2.0
2,2.0,2.0,2.0,1.0,12.0,2.0,2.0,2.0
3,2.0,2.0,2.0,1.0,10.0,1.0,2.0,2.0
4,9.0,4.0,9.0,2.0,5.0,2.0,2.0,2.0
5,2.0,3.0,9.0,2.0,13.0,2.0,2.0,2.0
6,1.0,4.0,5.0,1.0,12.0,2.0,2.0,2.0
7,1.0,4.0,9.0,1.0,6.0,2.0,1.0,2.0
8,2.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0
9,2.0,4.0,5.0,1.0,6.0,2.0,2.0,2.0


In [14]:
# Checking the datatypes for each column before the conversion (float64)
print(selectedColsDF.dtypes)

# Converting every column into a categorical variable in a single call.
# astype returns a new dataframe (each column gets its own categories), so nothing
# is assigned column-by-column into a frame that was derived from df.
selectedColsDF = selectedColsDF.astype('category')

# Checking the datatypes again after the conversion (category)
print(selectedColsDF.dtypes)

_RFBMI5     float64
_EDUCAG     float64
_INCOMG     float64
EXERANY2    float64
_AGEG5YR    float64
SEX1        float64
DRNKANY5    float64
_DRDXAR1    float64
dtype: object
_RFBMI5     category
_EDUCAG     category
_INCOMG     category
EXERANY2    category
_AGEG5YR    category
SEX1        category
DRNKANY5    category
_DRDXAR1    category
dtype: object


In [10]:
# Displaying the selected-columns dataframe.
# NOTE(review): this cell's execution count is out of order relative to the cells
# above - re-run the notebook top to bottom to confirm which state of the frame is shown.
selectedColsDF

Unnamed: 0,_RFBMI5,_EDUCAG,_INCOMG,EXERANY2,_AGEG5YR,SEX1,DRNKANY5,_DRDXAR1
0,1.0,4.0,4.0,2.0,13.0,2.0,2.0,1.0
1,2.0,4.0,2.0,1.0,3.0,2.0,1.0,2.0
2,2.0,2.0,2.0,1.0,12.0,2.0,2.0,2.0
3,2.0,2.0,2.0,1.0,10.0,1.0,2.0,2.0
4,9.0,4.0,9.0,2.0,5.0,2.0,2.0,2.0
5,2.0,3.0,9.0,2.0,13.0,2.0,2.0,2.0
6,1.0,4.0,5.0,1.0,12.0,2.0,2.0,2.0
7,1.0,4.0,9.0,1.0,6.0,2.0,1.0,2.0
8,2.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0
9,2.0,4.0,5.0,1.0,6.0,2.0,2.0,2.0


In [17]:
# Writing the selected columns out to "BRFSS2018.csv", with a header row.
# index=None is falsy, so presumably no index column is written - confirm against the file.
# NOTE(review): the destination is an absolute, machine-specific path; per the pandas docs
# to_csv returns None when a path is supplied, so exportDF_csv is not expected to hold data.
exportDF_csv = selectedColsDF.to_csv(
    path_or_buf=r'/Users/riddaali/Documents/GitHub/GEOG5995M-Assignment2/BRFSS2018.csv',
    header=True,
    index=None,
)



## Cleaning 