In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import HuberRegressor

In [29]:
# Sample employee table with 20 rows. It intentionally contains missing
# values (None) in 'Nombre', 'Ciudad' and 'Fecha_Ingreso', plus a couple of
# salaries far away from the rest, so the later cells have something to clean.
data = {
    'ID': range(1, 21),
    'Nombre': [
        None, 'Maria', 'Pedro', 'Ana', 'Luis',
        'Carlos', 'Maria', 'Pedro', 'Ana', 'Luis',
        'Carlos', 'Maria', 'Pedro', 'Ana', 'Luis',
        'Carlos', 'Maria', 'Pedro', 'Ana', 'Luis',
    ],
    'Edad': [
        20, 21, 22, 23, 25,
        40, 26, 27, 28, 29,
        45, 31, 47, 33, 34,
        48, 36, 37, 38, 39,
    ],
    'Salario': [
        50000, 1000, 500, 4000, 5000,
        6000, 7000, 8000, 9000, 10000,
        11000, 12000, 13000, 14000, 15000,
        15000, 17000, 18000, 28000, 60000,
    ],
    'Ciudad': [
        'Madrid', 'Barcelona', None, 'Sevilla', 'Malaga',
        'Cordoba', 'Madrid', 'Barcelona', 'Valencia', 'Sevilla',
        'Malaga', 'Cordoba', 'Madrid', 'Barcelona', 'Valencia',
        'Sevilla', 'Malaga', 'Cordoba', 'Madrid', 'Barcelona',
    ],
    # Dates are kept as plain 'YYYY-MM-DD' strings (not parsed to datetime).
    'Fecha_Ingreso': [
        '2020-01-01', None, '2020-01-03', '2020-01-04', '2020-01-05',
        '2020-01-06', '2020-01-07', '2020-01-08', '2020-01-09', None,
        '2020-01-11', '2020-01-12', '2020-01-13', '2020-01-14', '2020-01-15',
        '2020-01-16', '2020-01-17', '2020-01-18', '2020-01-19', '2020-01-20',
    ],
}
df = pd.DataFrame(data)
display(df)


Unnamed: 0,ID,Nombre,Edad,Salario,Ciudad,Fecha_Ingreso
0,1,,20,50000,Madrid,2020-01-01
1,2,Maria,21,1000,Barcelona,
2,3,Pedro,22,500,,2020-01-03
3,4,Ana,23,4000,Sevilla,2020-01-04
4,5,Luis,25,5000,Malaga,2020-01-05
5,6,Carlos,40,6000,Cordoba,2020-01-06
6,7,Maria,26,7000,Madrid,2020-01-07
7,8,Pedro,27,8000,Barcelona,2020-01-08
8,9,Ana,28,9000,Valencia,2020-01-09
9,10,Luis,29,10000,Sevilla,


In [30]:
# Outlier detection on 'Salario' with the interquartile-range (IQR) rule.
# First and third quartiles of the salary column.
Q1 = df['Salario'].quantile(q=0.25)
Q3 = df['Salario'].quantile(q=0.75)
# Interquartile range: spread of the middle half of the salaries.
IQR = Q3 - Q1
# Fences placed 1.5 * IQR below Q1 and above Q3.
lower_bound = Q1 - IQR * 1.5
upper_bound = Q3 + IQR * 1.5

# A row is an outlier when its salary falls strictly outside the fences.
outliers = df.loc[df['Salario'].lt(lower_bound) | df['Salario'].gt(upper_bound)]
display(outliers)


Unnamed: 0,ID,Nombre,Edad,Salario,Ciudad,Fecha_Ingreso
0,1,,20,50000,Madrid,2020-01-01
19,20,Luis,39,60000,Barcelona,2020-01-20


In [31]:
# Strategy 1: remove the outliers. Only rows whose salary lies inside the
# IQR fences (both bounds inclusive) are kept; `df` itself is not modified.
df_cleaned = df.loc[df['Salario'].between(lower_bound, upper_bound)]
display(df_cleaned)

Unnamed: 0,ID,Nombre,Edad,Salario,Ciudad,Fecha_Ingreso
1,2,Maria,21,1000,Barcelona,
2,3,Pedro,22,500,,2020-01-03
3,4,Ana,23,4000,Sevilla,2020-01-04
4,5,Luis,25,5000,Malaga,2020-01-05
5,6,Carlos,40,6000,Cordoba,2020-01-06
6,7,Maria,26,7000,Madrid,2020-01-07
7,8,Pedro,27,8000,Barcelona,2020-01-08
8,9,Ana,28,9000,Valencia,2020-01-09
9,10,Luis,29,10000,Sevilla,
10,11,Carlos,45,11000,Malaga,2020-01-11


In [32]:
# Data transformation: add a column holding the natural logarithm of each
# salary, then show it next to the original values.
df['Salario_Log'] = df['Salario'].pipe(np.log)
display(df.loc[:, ['Salario', 'Salario_Log']])

Unnamed: 0,Salario,Salario_Log
0,50000,10.819778
1,1000,6.907755
2,500,6.214608
3,4000,8.29405
4,5000,8.517193
5,6000,8.699515
6,7000,8.853665
7,8000,8.987197
8,9000,9.10498
9,10000,9.21034


In [33]:
# Outlier imputation: any salary strictly outside the IQR fences is replaced
# by the median of the full salary column (outliers included in that median);
# every other salary is carried over unchanged.
df['Salario_Capped'] = np.where(
    df['Salario'].lt(lower_bound) | df['Salario'].gt(upper_bound),
    df['Salario'].median(),
    df['Salario'],
)
display(df.loc[:, ['Salario', 'Salario_Capped']])

Unnamed: 0,Salario,Salario_Capped
0,50000,11500.0
1,1000,1000.0
2,500,500.0
3,4000,4000.0
4,5000,5000.0
5,6000,6000.0
6,7000,7000.0
7,8000,8000.0
8,9000,9000.0
9,10000,10000.0


In [34]:
# Binning to smooth out the outliers: each salary is replaced by the interval
# it falls into, using 5 equal-width bins over the observed salary range.
df['Salario_Binned'] = pd.cut(df['Salario'], 5)
display(df.loc[:, ['Salario', 'Salario_Binned']])

Unnamed: 0,Salario,Salario_Binned
0,50000,"(48100.0, 60000.0]"
1,1000,"(440.5, 12400.0]"
2,500,"(440.5, 12400.0]"
3,4000,"(440.5, 12400.0]"
4,5000,"(440.5, 12400.0]"
5,6000,"(440.5, 12400.0]"
6,7000,"(440.5, 12400.0]"
7,8000,"(440.5, 12400.0]"
8,9000,"(440.5, 12400.0]"
9,10000,"(440.5, 12400.0]"


In [36]:
# Robust regression: model salary as a function of age with a Huber
# regressor, using the raw (outlier-containing) 'Salario' column.
# Feature matrix (kept 2-D via the list selector) and target vector.
X = df.loc[:, ['Edad']]
Y = df.loc[:, 'Salario']

# fit() returns the fitted estimator, so construction and training are chained.
huber = HuberRegressor().fit(X, Y)
# In-sample predictions for the same rows used for fitting.
y_pred = huber.predict(X)
display(y_pred)


array([ 5468.2262161 ,  5901.22507079,  6334.22392547,  6767.22278016,
        7633.22048953, 14128.20330982,  8066.21934422,  8499.2181989 ,
        8932.21705359,  9365.21590827, 16293.19758325, 10231.21361765,
       17159.19529262, 11097.21132702, 11530.2101817 , 17592.1941473 ,
       12396.20789107, 12829.20674576, 13262.20560045, 13695.20445513])