In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv('tratamiento_datos.csv', index_col=0)
df

Unnamed: 0,Edad,Género,Ingresos,Altura,Ciudad,Nivel_Educación,Hijos
0,82,F,62297,1.96,Phoenix,,0
1,15,F,38674,1.83,New York,PhD,4
2,166,,-1886,1.87,,Bachelors,-5
3,95,M,29759,1.77,Chicago,PhD,4
4,36,M,99938,1.78,Phoenix,PhD,5
...,...,...,...,...,...,...,...
99995,65,M,62403,1.95,Houston,,1
99996,74,F,29457,1.81,New York,Master,3
99997,27,M,48147,1.88,Houston,Master,5
99998,39,M,92826,1.82,New York,,2


In [6]:
#creacion de funciones
def remove_negative_values(df,column):
    df[column] = df[column].apply(lambda x:np.nan if x < 0 else x)
    return df

def remove_outlier_values_with_zscore(df,column, threshold = 2):
    column_mean = df[column].mean()
    column_std = df[column].std()
    df[column] = df[column].mask(((df[column]-column_mean)/column_std).abs() > threshold, column_mean)
    return df

def map_column_values(df, column, mapping_dict):
    df[column] = df[column].apply(lambda value: mapping_dict.get(value, value))
    return df

def fill_nan_column(df, column, fill_value):
    df[column].fillna(fill_value, inplace=True)
    return df

def preprocess_data(df):
    education_mapping= {
        'Bachelors':'Bachelor',
        'mastre': 'Master',
        'pHd':'PhD',
        'no education': 'None'
    }

    gender_mapping = {
        'm': 'M',
        'f': 'F'
    }
    return(
        df.pipe(remove_negative_values, 'Edad')
        .pipe(remove_negative_values, 'Ingresos')
        .pipe(remove_negative_values, 'Hijos')
        .pipe(remove_outlier_values_with_zscore, 'Edad')
        .pipe(remove_outlier_values_with_zscore, 'Ingresos')
        .pipe(remove_outlier_values_with_zscore, 'Hijos')
        .pipe(remove_outlier_values_with_zscore, 'Altura')
        .pipe(map_column_values, 'Nivel_Educación', education_mapping)
        .pipe(map_column_values, 'Género', gender_mapping)
        .pipe(fill_nan_column, 'Ciudad', 'Desconocido')
        .pipe(fill_nan_column, 'Nivel_Educación', 'Desconocido')
        .pipe(fill_nan_column, 'Género', 'Desconocido')
        .pipe(fill_nan_column, 'Edad', df['Edad'].median())
        .pipe(fill_nan_column, 'Hijos', df['Hijos'].median())
        .pipe(fill_nan_column, 'Ingresos', df['Ingresos'].mean())
        .pipe(fill_nan_column, 'Altura', df['Altura'].mean())
    )

In [7]:
#Ejecucion de las funciones para tratar la base de datos
pd.options.mode.chained_assignment = None
df = pd.read_csv('tratamiento_datos.csv', index_col=0)
df= preprocess_data(df)
print(df)
df['Nivel_Educación'].value_counts()


            Edad       Género      Ingresos  Altura       Ciudad  \
0      82.000000            F  62297.000000    1.96      Phoenix   
1      15.000000            F  38674.000000    1.83     New York   
2      57.088505  Desconocido  60239.987315    1.87  Desconocido   
3      95.000000            M  29759.000000    1.77      Chicago   
4      36.000000            M  99938.000000    1.78      Phoenix   
...          ...          ...           ...     ...          ...   
99995  65.000000            M  62403.000000    1.95      Houston   
99996  74.000000            F  29457.000000    1.81     New York   
99997  27.000000            M  48147.000000    1.88      Houston   
99998  39.000000            M  92826.000000    1.82     New York   
99999  62.000000            M  74762.000000    1.90      Houston   

      Nivel_Educación  Hijos  
0         Desconocido    0.0  
1                 PhD    4.0  
2            Bachelor    2.0  
3                 PhD    4.0  
4                 PhD    5.0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(fill_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

Nivel_Educación
Master         25041
PhD            25016
Bachelor       24961
Desconocido    22437
None            2545
Name: count, dtype: int64

In [9]:
#exportar base depurada
df.to_csv('Base de datos depurada.csv')