In [1]:
import os 
import glob as glob 
import numpy as np
import pandas as pd

In [2]:
dir_path = r"C:\Users\jinyo\Downloads\OneDrive_1_11-7-2024"

files = sorted(glob.glob(os.path.join(dir_path, "*.csv")))
files

['C:\\Users\\jinyo\\Downloads\\OneDrive_1_11-7-2024\\feature_description.csv',
 'C:\\Users\\jinyo\\Downloads\\OneDrive_1_11-7-2024\\train_features.csv',
 'C:\\Users\\jinyo\\Downloads\\OneDrive_1_11-7-2024\\train_labels.csv']

In [3]:
X_df = pd.read_csv(files[1])
y_df = pd.read_csv(files[2])
description_df = pd.read_csv(files[0], encoding='unicode_escape', header=None)

In [4]:
description_df = description_df.drop([0])

In [5]:
description_df

Unnamed: 0,0,1
1,age_03 / age_12,Binned age group
2,urban_03 / urban_12,Locality size
3,married_03 / married_12,Marital status
4,n_mar_03 / n_mar_12,Number of marriages
5,edu_gru_03 / edu_gru_12,Binned education level
...,...,...
104,a21_12,Total years lived or worked in the U.S.
105,a22_12,Main job type during longest stay in the U.S.
106,a33b_12,U.S. residency status
107,a34_12,Speaks English


In [6]:
print(len(X_df))
print(len(y_df))

3276
4343


In [7]:
X_df = X_df.sort_values(by="uid")
y_df = y_df.sort_values(by="uid")

In [8]:
testing_df = X_df.drop(columns=["uid"])

In [9]:
for column in testing_df.columns:
    if testing_df[column].dtype == "object":
        print(column)
        print(set(testing_df[column].values))
        print()

age_03
{nan, '2. 60–69', '3. 70–79', '1. 50–59', '4. 80+', '0. 49 or younger'}

urban_03
{'0. <100,000', '1. 100,000+', nan}

married_03
{'2. Separated or divorced', nan, '3. Widowed', '4. Single', '1. Married or in civil union'}

edu_gru_03
{'0. No education', '1. 1–5 years', nan, '2. 6 years', '3. 7–9 years', '4. 10+ years'}

n_living_child_03
{'1. 1 or 2', nan, '4. 7+', '3. 5 or 6', '0. No children', '2. 3 or 4'}

glob_hlth_03
{'2. Very good', '3. Good', '1. Excellent', nan, '5. Poor', '4. Fair'}

bmi_03
{nan, '2. Normal weight', '1. Underweight', '3. Overweight', '4. Obese', '5. Morbidly obese'}

decis_famil_03
{'2. Approximately equal weight', '1. Respondent', nan, '3. Spouse'}

employment_03
{'2. Currently looking for work', nan, '3. Dedicated to household chores', '4. Retired, incapacitated, or does not work', '1. Currently Working'}

age_12
{nan, '2. 60–69', '3. 70–79', '1. 50–59', '4. 80+', '0. 49 or younger'}

urban_12
{'1. 100,000+', '0. <100,000', nan}

married_12
{'2. Sepa

In [10]:
mapping = {}

for idx in range(len(description_df)):
    description = description_df.iloc[idx,0]
    data = description_df.iloc[idx,1]
    
    description = description.strip()
    data = data.strip()
    
    if "/" in description:
        descriptions = description.split("/")
        for desc in descriptions:
            desc = desc.strip() 
            mapping[desc] = data 
    else:
        mapping[description] = data

In [11]:
mapping

{'age_03': 'Binned age group',
 'age_12': 'Binned age group',
 'urban_03': 'Locality size',
 'urban_12': 'Locality size',
 'married_03': 'Marital status',
 'married_12': 'Marital status',
 'n_mar_03': 'Number of marriages',
 'n_mar_12': 'Number of marriages',
 'edu_gru_03': 'Binned education level',
 'edu_gru_12': 'Binned education level',
 'n_living_child_03': 'Binned number of living children',
 'n_living_child_12': 'Binned number of living children',
 'migration_03': 'Has lived or worked in the U.S.',
 'migration_12': 'Has lived or worked in the U.S.',
 'glob_hlth_03': 'Self-reported global health',
 'glob_hlth_12': 'Self-reported global health',
 'adl_dress_03': 'Has difficulty getting dressed',
 'adl_dress_12': 'Has difficulty getting dressed',
 'adl_walk_03': 'Has difficulty walking from one side of the room to the other',
 'adl_walk_12': 'Has difficulty walking from one side of the room to the other',
 'adl_bath_03': 'Has difficulty bathing themselves in a tub or shower',
 'adl_

In [12]:
for idx in range(len(testing_df)):
    sentences = []
    for column in testing_df.columns:
        if testing_df[column].dtype == "object":
            description = mapping[column]
            info = testing_df.loc[idx,column]
            sentences.append("{} is {}.".format(description, info))
    break

In [13]:
sentences

['Binned age group is nan.',
 'Locality size is nan.',
 'Marital status is nan.',
 'Binned education level is nan.',
 'Binned number of living children is nan.',
 'Self-reported global health is nan.',
 'Binned body mass index is nan.',
 'Weight in family decisions is nan.',
 'Employment status is nan.',
 'Binned age group is 2. 60–69.',
 'Locality size is 0. <100,000.',
 'Marital status is 1. Married or in civil union.',
 'Binned education level is 0. No education.',
 'Binned number of living children is 1. 1 or 2.',
 'Self-reported global health is 4. Fair.',
 'Binned body mass index is 3. Overweight.',
 'Weight in family decisions is 2. Approximately equal weight.',
 'Weight over personal decisions is 1. A lot.',
 'Employment status is 1. Currently Working.',
 'How much they agree with the statement that their life is close to ideal is 3. Disagrees.',
 'How much they agree with the statement that life is excellent is 3. Disagrees.',
 'How much they agree with the statement that they

In [14]:
def processing(data):
    if pd.isna(data):
        return "Not Available"
    else:
        data_str = str(data)
        if "." in data_str:
            return str(data_str.split(".")[1])
        else:
            return str(''.join([i for i in data_str if not i.isdigit()]))

In [15]:
for idx in range(len(testing_df)):
    sentences = []
    for column in testing_df.columns:
        if testing_df[column].dtype == "object":
            description = mapping[column]
            info = testing_df.loc[idx,column]
            processed_info = processing(info)
            sentences.append("{} is {}.".format(description, processed_info))
    break

In [16]:
sentences

['Binned age group is Not Available.',
 'Locality size is Not Available.',
 'Marital status is Not Available.',
 'Binned education level is Not Available.',
 'Binned number of living children is Not Available.',
 'Self-reported global health is Not Available.',
 'Binned body mass index is Not Available.',
 'Weight in family decisions is Not Available.',
 'Employment status is Not Available.',
 'Binned age group is  60–69.',
 'Locality size is  <100,000.',
 'Marital status is  Married or in civil union.',
 'Binned education level is  No education.',
 'Binned number of living children is  1 or 2.',
 'Self-reported global health is  Fair.',
 'Binned body mass index is  Overweight.',
 'Weight in family decisions is  Approximately equal weight.',
 'Weight over personal decisions is  A lot.',
 'Employment status is  Currently Working.',
 'How much they agree with the statement that their life is close to ideal is  Disagrees.',
 'How much they agree with the statement that life is excellent i