In [1]:
# Import required modules

import pandas as pd
import numpy as np
import yaml

In [2]:
# Read the raw data CSV into a DataFrame and show its (rows, columns) shape

raw_csv_path = 'nsmo_v50_1321_puf.csv'
raw_df = pd.read_csv(raw_csv_path)
raw_df.shape

(50542, 543)

In [3]:
# Load YAML files containing metadata into Python as dictionaries

def _load_yaml(path):
    """Parse the YAML file at `path` with the safe loader and return the result."""
    # Explicit UTF-8 so the decoded text does not depend on the platform's
    # default locale encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Load the variable labels
variable_labels_dict = _load_yaml('variable_labels.yaml')

# Load the format of each variable into a dictionary
variable_formats_dict = _load_yaml('variable_formats.yaml')

# Load the categories for every categorical variable (exclude null categories)
categorical_variables_categories_dict = _load_yaml('categorical_variables_categories.yaml')

In [4]:
# Clean data by converting "." values and negative values (both representing missing values) into null values

for col in raw_df.columns:
    # Exclude the Mortgage Performance Status variables because they have letters representing specific categories
    if variable_formats_dict[col] == 'PSTATFM':
        continue

    values = raw_df[col]
    if not pd.api.types.is_numeric_dtype(values):
        # The column was read as text. Null out the "." placeholder FIRST and
        # convert to numbers: comparing text with 0 raises TypeError, so the
        # "." check can only take effect when it runs before the `< 0` test.
        # Any other non-numeric token makes to_numeric raise, so unexpected
        # codes surface instead of being silently discarded.
        values = pd.to_numeric(values.mask(values == "."))

    # Replace negative codes with NaN (NaN < 0 is False, so existing nulls are kept).
    # Assigning the whole column lets pandas choose a NaN-capable dtype instead of
    # writing NaN into an integer column through .loc.
    raw_df[col] = values.mask(values < 0)

In [5]:
# Show the last five observations after data cleaning

raw_df.tail(5)

Unnamed: 0,nsmoid,survey_wave,analysis_weight,x05a,x05b,x05c,x05d,x05e,x05f,x05g,...,mtmltv0621,mtmltv0921,mtmltv1221,mtmltv0322,mtmltv0622,mtmltv0922,mtmltv1222,mtmltv0323,mtmltv0623,mtmltv0923
50537,531289.0,34.0,2117.79,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,64.0,61.0,59.0,59.0,59.0,59.0,59.0,60.0
50538,546643.0,34.0,1738.92,3.0,3.0,2.0,2.0,2.0,1.0,3.0,...,,,79.0,77.0,74.0,72.0,72.0,71.0,71.0,71.0
50539,512993.0,34.0,2353.26,1.0,2.0,2.0,2.0,2.0,1.0,2.0,...,,,95.0,91.0,88.0,85.0,84.0,84.0,83.0,82.0
50540,518631.0,34.0,5283.75,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,,,56.0,53.0,50.0,49.0,49.0,49.0,48.0,48.0
50541,544740.0,34.0,1738.92,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,,,80.0,74.0,69.0,66.0,65.0,64.0,63.0,63.0


In [6]:
# Collect the distinct format codes used by the columns of raw_df

variable_formats_set = {variable_formats_dict[col] for col in raw_df.columns}
print(variable_formats_set)

{'PMARRYFM', 'PIDEAFM', 'PCOUNFM', 'PVERYFM', 'PDTIFM', 'PNORMFM', 'PBLANKFM', 'PMOFM', 'PLTVFM', 'PCSTATFM', 'PPURPFM', 'PIDFM', 'PACQFM', 'PINCFM', 'PTERMFM', 'PPTYPEFM', 'PPICKFM', 'PLENDFM', 'PYRFM', 'PCHANGEFM', 'PLAMTFM', 'PRESERVE', 'PEMPLOYFM', 'PSTATFM', 'PINTFM', 'PSAMEFM', 'PIMPFM', 'PPRICEFM', 'PSPRFM', 'POCCFM', 'PVETFM', 'PCONFM', 'PCASHOUT', 'PLTYPEFM', 'PWILLING', 'PPTIFM', 'PEDUCFM', 'PDESIREFM', 'PRACEFM', 'PMETLMIFM', 'PFORB', 'PWGTFM', 'PRISKFM', 'PLOTFM', 'PTIMEFM', 'PPROPFM', 'PWVFM', 'PAPPFM', 'PAGREEFM', 'PSEXFM', 'PBORRFM', 'PAGEFM', 'PVETXFM', 'PYNFM', 'PTIFM', 'PPMMSFM', 'PUSEDFM', 'PSCOREFM'}


In [7]:
# Split the columns into categorical variables and numeric variables

# A format is categorical when it has an entry in the categories metadata;
# every other format seen in the data is treated as numeric.
categorical_variable_formats = set(categorical_variables_categories_dict)
numeric_variable_formats = variable_formats_set - categorical_variable_formats

categorical_variables, numeric_variables = [], []

for col in raw_df.columns:
    col_format = variable_formats_dict[col]
    if col_format in categorical_variable_formats:
        categorical_variables.append(col)
        continue
    if col_format in numeric_variable_formats:
        numeric_variables.append(col)
        continue
    # Reached only when the column's format is in neither set
    print("Error in bifurcation")

In [8]:
# Show the last five observations for just the categorical variables

raw_df.loc[:, categorical_variables].tail(5)

Unnamed: 0,survey_wave,x05a,x05b,x05c,x05d,x05e,x05f,x05g,x06,x07,...,forb0621,forb0921,forb1221,forb0322,forb0622,forb0922,forb1222,forb0323,forb0623,forb0923
50537,34.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,...,,,,,2.0,2.0,2.0,2.0,2.0,2.0
50538,34.0,3.0,3.0,2.0,2.0,2.0,1.0,3.0,3.0,3.0,...,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0
50539,34.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,3.0,2.0,...,,,,,2.0,2.0,2.0,2.0,2.0,2.0
50540,34.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,1.0,...,,,,,2.0,2.0,2.0,2.0,2.0,2.0
50541,34.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,...,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [9]:
# Show the last five observations for just the numeric variables

raw_df.loc[:, numeric_variables].tail(5)

Unnamed: 0,nsmoid,analysis_weight,x74r,rate_spread,pmms,term,ltv,cltv,dti,pti,...,mtmltv0621,mtmltv0921,mtmltv1221,mtmltv0322,mtmltv0622,mtmltv0922,mtmltv1222,mtmltv0323,mtmltv0623,mtmltv0923
50537,531289.0,2117.79,57.0,0.64,3.11,40.0,64.0,64.0,42.0,13.0,...,,,64.0,61.0,59.0,59.0,59.0,59.0,59.0,60.0
50538,546643.0,1738.92,37.0,0.03,3.1,30.0,79.0,79.0,33.0,15.0,...,,,79.0,77.0,74.0,72.0,72.0,71.0,71.0,71.0
50539,512993.0,2353.26,26.0,,3.1,30.0,95.0,95.0,35.0,15.0,...,,,95.0,91.0,88.0,85.0,84.0,84.0,83.0,82.0
50540,518631.0,5283.75,36.0,,3.1,20.0,56.0,56.0,46.0,23.0,...,,,56.0,53.0,50.0,49.0,49.0,49.0,48.0,48.0
50541,544740.0,1738.92,42.0,0.08,3.05,30.0,80.0,80.0,20.0,19.0,...,,,80.0,74.0,69.0,66.0,65.0,64.0,63.0,63.0


In [10]:
# Print one observation in a human readable format: each variable's label from the
# YAML metadata, followed by its category text (categorical) or its raw value

one_obs = raw_df.iloc[50541]
for col, value in one_obs.items():
    # Default: show the value as stored (numeric variables and nulls)
    shown = value
    if not pd.isna(value):
        col_format = variable_formats_dict[col]
        if col_format in categorical_variables_categories_dict:
            # Categorical variable: translate the stored code into its category text
            shown = categorical_variables_categories_dict[col_format][value]
    print(variable_labels_dict[col], ":", shown)

NSMO Identification Number : 544740.0
NSMO Survey Wave (Quarterly) : 2022 Q2
NSMO Analysis Weight (Sampling Weight x Non-response Adjustment) : 1738.92
When you began the process of getting this mortgage, how familiar were you (and any co-signers) with each ofthe following? | The mortgage interest rates available at that time : Very
When you began the process of getting this mortgage, how familiar were you (and any co-signers) with each ofthe following? | The different types of mortgages available : Very
When you began the process of getting this mortgage, how familiar were you (and any co-signers) with each ofthe following? | The mortgage process : Very
When you began the process of getting this mortgage, how familiar were you (and any co-signers) with each ofthe following? | The down payment needed to qualify for amortgage : Very
When you began the process of getting this mortgage, how familiar were you (and any co-signers) with each ofthe following? | The income needed to qualify fo

In [None]:
# One-hot encode the 'x05a' column; the generated indicator columns get the prefix 'is'
lookat = pd.get_dummies(
    raw_df,
    columns=['x05a'],
    prefix='is',
)

In [None]:
# Preview the first five rows of the one-hot encoded frame
lookat.head(5)