# Importar Librarias y datos

In [175]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np   
import itertools

from scipy.stats import chi2_contingency, ttest_ind
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.formula.api as smf 

In [176]:
df = pd.read_parquet("output_02.parquet") # no es 03 porque visualisar no ha afectado

In [177]:
df.head()

Unnamed: 0,age,gender,education,marital_status,race,is_hispanic,employment_commitment,employment_stat,wage_per_hour,working_week_per_year,...,tax_status,gains,losses,stocks_status,citizenship,migration_year,country_of_birth_own,migration_code_change_in_msa,migration_code_change_in_reg,income_above_limit
0,79,Female,High school graduate,Widowed,White,All other,Not in labor force,0,0,52,...,Head of household,0,0,292,Native,95,US,?,?,Below limit
1,65,Female,High school graduate,Widowed,White,All other,Children or Armed Forces,0,0,0,...,Single,0,0,0,Native,94,US,unchanged,unchanged,Below limit
2,21,Male,12th grade no diploma,Never married,Black,All other,Children or Armed Forces,0,500,15,...,Single,0,0,0,Native,94,US,unchanged,unchanged,Below limit
3,2,Female,Children,Never married,Asian or Pacific Islander,All other,Children or Armed Forces,0,0,0,...,Nonfiler,0,0,0,Native,94,US,unchanged,unchanged,Below limit
4,70,Male,High school graduate,Married-civilian spouse present,White,All other,Not in labor force,0,0,0,...,Joint both 65+,0,0,0,Native,95,US,?,?,Below limit


# Modificar datos a binario

In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209499 entries, 0 to 209498
Data columns (total 24 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   age                           209499 non-null  int64 
 1   gender                        209499 non-null  object
 2   education                     209499 non-null  object
 3   marital_status                209499 non-null  object
 4   race                          209499 non-null  object
 5   is_hispanic                   209499 non-null  object
 6   employment_commitment         209499 non-null  object
 7   employment_stat               209499 non-null  int64 
 8   wage_per_hour                 209499 non-null  int64 
 9   working_week_per_year         209499 non-null  int64 
 10  industry_code_main            209499 non-null  object
 11  occupation_code               209499 non-null  int64 
 12  total_employed                209499 non-null  int64 
 13 

In [180]:
df['country_of_birth_own'].unique()

array(['US', '?', 'El-Salvador', 'Mexico', 'Philippines', 'Cambodia',
       'China', 'Hungary', 'Puerto-Rico', 'England', 'Dominican-Republic',
       'Japan', 'Canada', 'Ecuador', 'Italy', 'Cuba', 'Peru', 'Taiwan',
       'South Korea', 'Poland', 'Nicaragua', 'Germany', 'Guatemala',
       'India', 'Ireland', 'Honduras', 'France', 'Trinadad&Tobago',
       'Thailand', 'Iran', 'Vietnam', 'Portugal', 'Laos', 'Panama',
       'Scotland', 'Columbia', 'Jamaica', 'Greece', 'Haiti', 'Yugoslavia',
       'Outlying-U S (Guam USVI etc)', 'Holand-Netherlands', 'Hong Kong'],
      dtype=object)

In [181]:
# change US to US & Other
df.loc[df['country_of_birth_own'] != 'US', 'country_of_birth_own'] = 'Other'
df['country_of_birth_own'].unique()

array(['US', 'Other'], dtype=object)

In [182]:
df['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [183]:
df['gender'] = df['gender'].replace({'Male':0, 'Female':1})
df['gender'].unique()

  df['gender'] = df['gender'].replace({'Male':0, 'Female':1})


array([1, 0], dtype=int64)

In [184]:
df['income_above_limit'] = df['income_above_limit'].replace({'Above limit':1, 'Below limit':0})
df['income_above_limit'].unique()

  df['income_above_limit'] = df['income_above_limit'].replace({'Above limit':1, 'Below limit':0})


array([0, 1], dtype=int64)

# limpieza de valores categoricas 

Reducir los valores en los datos categóricos para eliminar la repetición y facilitar la manipulación de datos.

In [186]:
df['household_stat'].unique()

array(['Householder', 'Nonfamily householder',
       'Child 18+ never marr Not in a subfamily',
       'Child <18 never marr not in subfamily', 'Spouse of householder',
       'Child 18+ spouse of subfamily RP', 'Secondary individual',
       'Child 18+ never marr RP of subfamily',
       'Other Rel 18+ spouse of subfamily RP',
       'Grandchild <18 never marr not in subfamily',
       'Other Rel <18 never marr child of subfamily RP',
       'Other Rel 18+ ever marr RP of subfamily',
       'Other Rel 18+ ever marr not in subfamily',
       'Child 18+ ever marr Not in a subfamily',
       'RP of unrelated subfamily', 'Child 18+ ever marr RP of subfamily',
       'Other Rel 18+ never marr not in subfamily',
       'Child under 18 of RP of unrel subfamily',
       'Grandchild <18 never marr child of subfamily RP',
       'Grandchild 18+ never marr not in subfamily',
       'Other Rel <18 never marr not in subfamily', 'In group quarters',
       'Grandchild 18+ ever marr not in subfamil

In [187]:
df['education'].unique()

array(['High school graduate', '12th grade no diploma', 'Children',
       'Bachelors degree(BA AB BS)', '7th and 8th grade', '11th grade',
       '9th grade', 'Masters degree(MA MS MEng MEd MSW MBA)',
       '10th grade', 'Associates degree-academic program',
       '1st 2nd 3rd or 4th grade', 'Some college but no degree',
       'Less than 1st grade', 'Associates degree-occup /vocational',
       'Prof school degree (MD DDS DVM LLB JD)', '5th or 6th grade',
       'Doctorate degree(PhD EdD)'], dtype=object)

In [188]:
df_copia = df.copy()

mapping = {
    'High school graduate': 'secondary_complete', 
    '12th grade no diploma': 'secondary_incomplete', 
    'Children': 'child',
    'Bachelors degree(BA AB BS)': 'university_graduate_or_higher', 
    '7th and 8th grade': 'secondary_incomplete', 
    '11th grade': 'secondary_incomplete',
    '9th grade':'secondary_incomplete', 
    'Masters degree(MA MS MEng MEd MSW MBA)': 'university_graduate_or_higher',
    '10th grade': 'secondary_incomplete', 
    'Associates degree-academic program': 'technical_education/university_incomplete',
    '1st 2nd 3rd or 4th grade': 'primary', 
    'Some college but no degree': 'technical_education/university_incomplete',
    'Less than 1st grade': 'primary', 
    'Associates degree-occup /vocational': 'technical_education/university_incomplete',
    'Prof school degree (MD DDS DVM LLB JD)':'university_graduate_or_higher', 
    '5th or 6th grade': 'primary',
}

df_copia['education'] = df_copia['education'].replace(mapping)

In [189]:
df_copia['education'].unique()

array(['secondary_complete', 'secondary_incomplete', 'child',
       'university_graduate_or_higher',
       'technical_education/university_incomplete', 'primary',
       'Doctorate degree(PhD EdD)'], dtype=object)

In [190]:
mapping = {
    'Householder': 'Householder',
    'Nonfamily householder': 'Nonfamily Householder',
    'Spouse of householder': 'Spouse',
    'Child 18+ never marr Not in a subfamily': 'Child',
    'Child <18 never marr not in subfamily': 'Child',
    'Other Rel 18+ spouse of subfamily RP': 'Other Relative',
    'In group quarters': 'In Group Quarters',
    'Child 18+ spouse of subfamily RP': 'Child', 
    'Secondary individual': 'Other',
    'Child 18+ never marr RP of subfamily': 'Child',
    'Other Rel 18+ spouse of subfamily RP': 'Other',
    'Grandchild <18 never marr not in subfamily': 'Grandchild',
    'Other Rel <18 never marr child of subfamily RP': 'Other',
    'Other Rel 18+ ever marr RP of subfamily': 'Other',
    'Other Rel 18+ ever marr not in subfamily': 'Other',
    'Child 18+ ever marr Not in a subfamily': 'Child',
    'RP of unrelated subfamily': 'Other',
    'Child 18+ ever marr RP of subfamily': 'Child',
    'Other Rel 18+ never marr not in subfamily': 'Other',
    'Child under 18 of RP of unrel subfamily': 'Child',
    'Grandchild <18 never marr child of subfamily RP': 'Grandchild',
    'Grandchild 18+ never marr not in subfamily': 'Grandchild',
    'Other Rel <18 never marr not in subfamily': 'Other', 
    'In group quarters': 'Group Quarters',
    'Grandchild 18+ ever marr not in subfamily': 'Grandchild',
    'Other Rel 18+ never marr RP of subfamily': 'Other',
    'Child <18 never marr RP of subfamily': 'Child',
    'Grandchild 18+ never marr RP of subfamily': 'Grandchild',
    'Spouse of RP of unrelated subfamily': 'Spouse',
    'Grandchild 18+ ever marr RP of subfamily': 'Grandchild',
    'Child <18 ever marr not in subfamily': 'Child',
    'Child <18 ever marr RP of subfamily': 'Child',
    'Other Rel <18 ever marr RP of subfamily': 'Other',
    'Grandchild 18+ spouse of subfamily RP': 'Grandchild',
    'Child <18 spouse of subfamily RP': 'Child',
    'Other Rel <18 ever marr not in subfamily': 'Other',
    'Other Rel <18 never married RP of subfamily': 'Other',
    'Other Rel <18 spouse of subfamily RP': 'Other',
    'Grandchild <18 ever marr not in subfamily': 'Grandchild',
    'Grandchild <18 never marr RP of subfamily': 'Grandchild'
}
df_copia['household_stat'] = df_copia['household_stat'].map(mapping).fillna('Other')

In [191]:
df_copia['household_stat'].unique()

array(['Householder', 'Nonfamily Householder', 'Child', 'Spouse', 'Other',
       'Grandchild', 'Group Quarters'], dtype=object)

# Guardar output

In [193]:
df_copia.to_parquet("output_04.parquet")