# Data Cleaning Practice  

Goals
- Practice data cleaning with Python
- Seek concise code
- Clearly document decisions
- Be thorough 

Dataset:
- Alzheimers disease and healthy aging data
- Source: https://catalog.data.gov/dataset/alzheimers-disease-and-healthy-aging-data

In [20]:
### Packages
import numpy as np
# import pandas as pd
import modin.pandas as pd # faster
import os
from janitor import clean_names
os.chdir('C:/Users/WulfN/')

### Read in Data
alz_messy = (clean_names(
    pd.read_csv('./datasets/unclean_data_practice/Alzheimer_s_Disease_and_Healthy_Aging_Data.csv'))
    # .rename(columns={'adjusted_gross_in_2022_dollars_': 'adjusted_gross_in_2022_dollars', 
    #                  'year_s_': 'year_range'})
                     )

# remove scientific notation
pd.options.display.float_format = '{:20,.2f}'.format

# multitple outputs per cell
%config InteractiveShell.ast_node_interactivity = "all"

alz_messy

Unnamed: 0,rowid,yearstart,yearend,locationabbr,locationdesc,datasource,class,topic,question,data_value_unit,...,stratification2,geolocation,classid,topicid,questionid,locationid,stratificationcategoryid1,stratificationid1,stratificationcategoryid2,stratificationid2
0,BRFSS~2022~2022~42~Q03~TMC01~AGE~RACE,2022,2022,PA,Pennsylvania,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Native Am/Alaskan Native,POINT (-77.86070029 40.79373015),C05,TMC01,Q03,42,AGE,5064,RACE,NAA
1,BRFSS~2022~2022~46~Q03~TMC01~AGE~RACE,2022,2022,SD,South Dakota,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Asian/Pacific Islander,POINT (-100.3735306 44.35313005),C05,TMC01,Q03,46,AGE,65PLUS,RACE,ASN
2,BRFSS~2022~2022~16~Q03~TMC01~AGE~RACE,2022,2022,ID,Idaho,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,"Black, non-Hispanic",POINT (-114.36373 43.68263001),C05,TMC01,Q03,16,AGE,65PLUS,RACE,BLK
3,BRFSS~2022~2022~24~Q03~TMC01~AGE~RACE,2022,2022,MD,Maryland,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,"Black, non-Hispanic",POINT (-76.60926011 39.29058096),C05,TMC01,Q03,24,AGE,65PLUS,RACE,BLK
4,BRFSS~2022~2022~55~Q03~TMC01~AGE~GENDER,2022,2022,WI,Wisconsin,BRFSS,Mental Health,Frequent mental distress,Percentage of older adults who are experiencin...,%,...,Male,POINT (-89.81637074 44.39319117),C05,TMC01,Q03,55,AGE,65PLUS,GENDER,MALE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284137,BRFSS~2016~2016~55~Q15~TSC02~AGE~RACE,2016,2016,WI,Wisconsin,BRFSS,Screenings and Vaccines,Colorectal cancer screening,Percentage of older adults who had either a ho...,%,...,"Black, non-Hispanic",POINT (-89.81637074 44.39319117),C03,TSC02,Q15,55,AGE,AGE_OVERALL,RACE,BLK
284138,BRFSS~2017~2017~56~Q45~TOC13~AGE~RACE,2017,2017,WY,Wyoming,BRFSS,Overall Health,Fair or poor health among older adults with ar...,Fair or poor health among older adults with do...,%,...,Hispanic,POINT (-108.1098304 43.23554134),C01,TOC13,Q45,56,AGE,5064,RACE,HIS
284139,BRFSS~2015~2015~56~Q42~TCC04~AGE~RACE,2015,2015,WY,Wyoming,BRFSS,Cognitive Decline,Talked with health care professional about sub...,Percentage of older adults with subjective cog...,%,...,Asian/Pacific Islander,POINT (-108.1098304 43.23554134),C06,TCC04,Q42,56,AGE,AGE_OVERALL,RACE,ASN
284140,BRFSS~2019~2019~54~Q46~TOC10~AGE~RACE,2019,2019,WV,West Virginia,BRFSS,Overall Health,"Disability status, including sensory or mobili...",Percentage of older adults who report having a...,%,...,Hispanic,POINT (-80.71264013 38.6655102),C01,TOC10,Q46,54,AGE,65PLUS,RACE,HIS


In [2]:
alz_messy.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 284142 entries, 0 to 284141
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   rowid                       284142 non-null  object 
 1   yearstart                   284142 non-null  int64  
 2   yearend                     284142 non-null  int64  
 3   locationabbr                284142 non-null  object 
 4   locationdesc                284142 non-null  object 
 5   datasource                  284142 non-null  object 
 6   class                       284142 non-null  object 
 7   topic                       284142 non-null  object 
 8   question                    284142 non-null  object 
 9   data_value_unit             284142 non-null  object 
 10  datavaluetypeid             284142 non-null  object 
 11  data_value_type             284142 non-null  object 
 12  data_value                  192808 non-null  float64
 13  data_valu

### Currently unable to view data in data viewer. Will be using other methods to view the data

After viewing the data (in RStudio, for now), here are some proposed changes.
- Does a data dictionary exist for this data?
- View categorys of the stratified columns
- State or region indicator for the locationdesc column
- Change data value unit '%' to 'percentage'
- What is the data value alt?
- distinct data footnotes, can many of these be removed?
- how are the low and high confidence variables derived? 
- clean geolocation column, seperate latitude and longitude. 
- class id, question id, location id - what are these? 
- range of year start and year end?
- What do to with NAs for the data values?

In [59]:
# create lists by variable of the distinct values in the stratificiation columns

stratif_cols = [x for x in list(alz_messy.columns)  if x.startswith('strati')]
other_cat_cols = ['question', 'topic', 'class']
cat_cols = stratif_cols + other_cat_cols

cat_vars_df = alz_messy[cat_cols]
# Get qty of distinct values for each column in the lists
all_cats = dict({c: cat_vars_df[c].unique() for c in cat_vars})

In [72]:
# dictionary of all_cats 

{'stratificationcategory1': array(['Age Group'], dtype=object),
 'stratification1': array(['50-64 years', '65 years or older', 'Overall'], dtype=object),
 'stratificationcategory2': array(['Race/Ethnicity', 'Gender', nan], dtype=object),
 'stratification2': array(['Native Am/Alaskan Native', 'Asian/Pacific Islander',
        'Black, non-Hispanic', 'Male', 'White, non-Hispanic', 'Hispanic',
        nan, 'Female'], dtype=object),
 'stratificationcategoryid1': array(['AGE'], dtype=object),
 'stratificationid1': array(['5064', '65PLUS', 'AGE_OVERALL'], dtype=object),
 'stratificationcategoryid2': array(['RACE', 'GENDER', 'OVERALL'], dtype=object),
 'stratificationid2': array(['NAA', 'ASN', 'BLK', 'MALE', 'WHT', 'HIS', 'OVERALL', 'FEMALE'],
       dtype=object),
 'question': array(['Percentage of older adults who are experiencing frequent mental distress',
        'Mean number of days with activity limitations in the past month',
        'Percentage of older adults currently not providing c

- Practice with class and methods 
    - Write general code
- Creating functions to retrieve categories in a dataframe

In [175]:
# May add documentation later

class unique_categories: 

    def __init__(self, dataframe, category_list): 
        self.dataframe = dataframe
        self.category_list = category_list

    def category_dict(self): 
        # Subset dataframe to to columns with categories 
        category_df = self.dataframe[self.category_list]
        # create dictionary of column names and unique categories
        messy_category_dict = dict({c: category_df[c].unique() for c in category_df})
        
        get_categories = {}

        for key, values in messy_category_dict.items():
            get_categories[key] = values.tolist()

        return get_categories

    def get_num_categories(self): 
        category_dict = self.category_dict()

        # get number of categories
        num_categories = []

        for key in category_dict.keys():
            num_categories = num_categories + [len(category_dict[key])]

        return num_categories
    
    def category_lists(self): # categories_for_df?
        num_categories = self.get_num_categories()
        category_dict = self.category_dict()

        add_na = []

        for item in num_categories:
            sum_na = [max(num_categories) - item]
            add_na = add_na + sum_na

        # 
        # Add NAs to the categories so each list is the same size

        return add_na

    def category_df(self):
        category_df = pd.DataFrame({
            'var_name': self.category_dict().keys(),
            'num_categories': self.get_num_categories()
            #'categories': get_categories.values()
        })

        return category_df 
    
    # Replace NANs with NAs in the list? must think of case where there are not nans for the list

In [189]:
object = unique_categories(alz_messy, cat_cols)

category_dict = object.category_dict()
num_categories = object.get_num_categories()

for i, key in enumerate(category_dict): 
    # enumerate gives a count, in this case, for the key value, so that key1 has count 1, key2 has count 2, etc. 
    # how does this kelp?
    # Append 'NA' to each list based on the corresponding count in na_count_list
    category_dict[key].extend(['NA'] * num_categories[i]) 

category_dict


{'stratificationcategory1': ['Age Group', 'NA'],
 'stratification1': ['50-64 years',
  '65 years or older',
  'Overall',
  'NA',
  'NA',
  'NA'],
 'stratificationcategory2': ['Race/Ethnicity',
  'Gender',
  nan,
  'NA',
  'NA',
  'NA'],
 'stratification2': ['Native Am/Alaskan Native',
  'Asian/Pacific Islander',
  'Black, non-Hispanic',
  'Male',
  'White, non-Hispanic',
  'Hispanic',
  nan,
  'Female',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA'],
 'stratificationcategoryid1': ['AGE', 'NA'],
 'stratificationid1': ['5064', '65PLUS', 'AGE_OVERALL', 'NA', 'NA', 'NA'],
 'stratificationcategoryid2': ['RACE', 'GENDER', 'OVERALL', 'NA', 'NA', 'NA'],
 'stratificationid2': ['NAA',
  'ASN',
  'BLK',
  'MALE',
  'WHT',
  'HIS',
  'OVERALL',
  'FEMALE',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA',
  'NA'],
 'question': ['Percentage of older adults who are experiencing frequent mental distress',
  'Mean number of days with activity limitations in the past month',
  'P

In [None]:
# Create dataframe with categories as one column, and number of cats in another

# OR, have the category as the column of a dataframe 
# where the length is the length of the number of categories
# each column has all of the categories for the column. 

test = max(len(all_cats[keys]))

category_df = pd.DataFrame({})

for keys, values in all_cats.items():
    var_names_and_cats = pd.DataFrame({
        'col_names': keys, 
        'categories': values,
        'num_categories': len(all_cats[keys])
    })
    
    
    category_df = pd.concat(var_names_and_cats, axis=0) # pretty sure this is rowwise

category_df   

    
# try a dictionary
#print('key_name: ', all_cats[keys], '# categories: ', len(all_cats[keys]))

# Can you combine a bunch of series? The keys are the column names, and the values are the 
# distinct categories. Need function to fill the rest with NAs
# fn is (length of max list) - (length of list) = [num_na] to add to [list]


In [None]:
# Replace NANs with NAs