In [1]:
import pandas as pd
from io import StringIO
import sys

In [2]:
csv_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [4]:
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [5]:
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [7]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
df.dropna(how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [9]:
df.dropna(how="any")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [16]:
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [14]:
df.dropna(subset=["A"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [18]:
from sklearn.impute import SimpleImputer
import numpy as np

In [30]:
imp_mean=SimpleImputer(missing_values= np.nan, strategy="mean")

#si imputamos por la media la estrategia sería "mean"
#si imputamos por la mediana la estrategia sería "median"
#si imputamos por la moda la estrategia sería "most_frequent"

In [20]:
imp = imp_mean.fit(df.values)

In [21]:
data_imputada = imp_mean.transform(df.values)

In [22]:
data_imputada

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [23]:
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

#### Caso: Census

In [31]:
FileCSV="D:/Python/1. Nivel I/6/datos/censusn.csv"
censusn=pd.read_csv(FileCSV,  sep=';')
censusn.head()

Unnamed: 0.1,Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14
0,1,39,6.0,77516,13,3,9.0,4,1,1,2174,0,40,1.0,1
1,2,50,2.0,83311,13,1,5.0,3,1,1,0,0,13,1.0,1
2,3,38,1.0,215646,9,2,7.0,4,1,1,0,0,40,1.0,1
3,4,53,1.0,234721,7,1,7.0,3,5,1,0,0,40,1.0,1
4,5,28,1.0,338409,13,1,6.0,1,5,0,0,0,40,13.0,1


In [35]:
round(censusn.isnull().sum()/len(censusn)*100,1)

Unnamed: 0    0.0
V1            0.0
V2            5.6
V3            0.0
V4            0.0
V5            0.0
V6            5.7
V7            0.0
V8            0.0
V9            0.0
V10           0.0
V11           0.0
V12           0.0
V13           1.8
V14           0.0
dtype: float64

In [36]:
imp_moda = SimpleImputer(missing_values=np.nan,strategy="most_frequent")

In [37]:
V2_imp = imp_moda.fit_transform(censusn[["V2"]])

In [40]:
V2_imp_moda = pd.DataFrame(V2_imp, columns=["V2_moda"])
V2_imp_moda.head()

Unnamed: 0,V2_moda
0,6.0
1,2.0
2,1.0
3,1.0
4,1.0


In [41]:
df = pd.concat([censusn,V2_imp_moda],axis=1)

In [44]:
round(df.isnull().sum()/len(df)*100,1)

Unnamed: 0    0.0
V1            0.0
V2            5.6
V3            0.0
V4            0.0
V5            0.0
V6            5.7
V7            0.0
V8            0.0
V9            0.0
V10           0.0
V11           0.0
V12           0.0
V13           1.8
V14           0.0
V2_moda       0.0
dtype: float64

In [45]:
imp_median = SimpleImputer(missing_values = np.nan, strategy="median")

In [46]:
df[["V6","V13"]]=imp_median.fit_transform(censusn[["V6","V13"]])

In [47]:
round(df.isnull().sum()/len(df)*100,1)

Unnamed: 0    0.0
V1            0.0
V2            5.6
V3            0.0
V4            0.0
V5            0.0
V6            0.0
V7            0.0
V8            0.0
V9            0.0
V10           0.0
V11           0.0
V12           0.0
V13           0.0
V14           0.0
V2_moda       0.0
dtype: float64

**Imputación por modelos**

In [48]:
import pandas as pd
import seaborn as sns
import numpy as np

In [49]:
df_Marketing = pd.read_csv('D:/Python/1. Nivel I/6/datos/MarketingDirecto_2.csv',  sep=';')
df_Marketing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Edad       1000 non-null   object 
 1   Genero     1000 non-null   object 
 2   Vivienda   1000 non-null   object 
 3   Ecivil     1000 non-null   object 
 4   Ubicacion  1000 non-null   object 
 5   Salario    1000 non-null   int64  
 6   Hijos      1000 non-null   int64  
 7   Historial  697 non-null    object 
 8   Catalogos  1000 non-null   int64  
 9   Monto      987 non-null    float64
dtypes: float64(1), int64(3), object(6)
memory usage: 78.2+ KB


In [50]:
round(df_Marketing.isnull().sum()/len(df_Marketing)*100,1)

Edad          0.0
Genero        0.0
Vivienda      0.0
Ecivil        0.0
Ubicacion     0.0
Salario       0.0
Hijos         0.0
Historial    30.3
Catalogos     0.0
Monto         1.3
dtype: float64

In [51]:
is_null = pd.isna(df_Marketing.loc[:,"Monto"])

In [53]:
df_nulos = df_Marketing.loc[is_null]

In [54]:
len(df_nulos)

13

In [57]:
is_null2=is_null.replace({True:False, False:True})

In [58]:
df_data = df_Marketing.loc[is_null2]
len(df_data)

987

In [None]:
#df_nulos: filas que no tienen información de la variable "Monto"
#df_data: filas que tienen información de la variable "Monto"

In [59]:
x_entre=df_data[["Salario"]]
y_entre=df_data[["Monto"]]

In [60]:
x_test=df_nulos[["Salario"]]
y_test=df_nulos[["Monto"]]

In [61]:
y_test

Unnamed: 0,Monto
4,
15,
132,
137,
142,
148,
796,
797,
798,
803,


In [62]:
from sklearn.linear_model import LinearRegression

In [63]:
regresion = LinearRegression()

In [64]:
regresion.fit(x_entre, y_entre)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [67]:
y_test_predict = np.round(regresion.predict(x_test),1)

In [69]:
y_test_predict

array([[1490.7],
       [1814.5],
       [ 404.8],
       [1166.9],
       [2268.2],
       [1166.9],
       [2365.1],
       [2347.5],
       [1735.2],
       [ 378.4],
       [ 924.6],
       [1008.3],
       [ 257.2]])

In [70]:
df_monto_imp = pd.DataFrame(data=y_test_predict, columns=["Monto"])

In [71]:
df_monto_imp

Unnamed: 0,Monto
0,1490.7
1,1814.5
2,404.8
3,1166.9
4,2268.2
5,1166.9
6,2365.1
7,2347.5
8,1735.2
9,378.4


In [72]:
df_nulos = df_nulos.drop(["Monto"], axis=1)

In [73]:
df_nulos.head()

Unnamed: 0,Edad,Genero,Vivienda,Ecivil,Ubicacion,Salario,Hijos,Historial,Catalogos
4,Media,Femenino,Propia,Soltero,Cerca,68400,0,Alto,12
15,Media,Femenino,Propia,Casado,Lejos,83100,1,,12
132,Joven,Masculino,Alquilada,Soltero,Cerca,19100,0,,6
137,Media,Femenino,Propia,Soltero,Cerca,53700,3,Bajo,12
142,Media,Masculino,Propia,Casado,Lejos,103700,1,Alto,24


In [None]:
#reset de index

In [74]:
df_nulos = df_nulos.reset_index(drop=True)
df_monto_imp = df_monto_imp.reset_index(drop=True)

In [75]:
df_nulos_imputados=pd.concat([df_nulos,df_monto_imp],axis=1)

In [77]:
df_nulos_imputados.head(10)

Unnamed: 0,Edad,Genero,Vivienda,Ecivil,Ubicacion,Salario,Hijos,Historial,Catalogos,Monto
0,Media,Femenino,Propia,Soltero,Cerca,68400,0,Alto,12,1490.7
1,Media,Femenino,Propia,Casado,Lejos,83100,1,,12,1814.5
2,Joven,Masculino,Alquilada,Soltero,Cerca,19100,0,,6,404.8
3,Media,Femenino,Propia,Soltero,Cerca,53700,3,Bajo,12,1166.9
4,Media,Masculino,Propia,Casado,Lejos,103700,1,Alto,24,2268.2
5,Media,Masculino,Propia,Casado,Cerca,53700,1,,12,1166.9
6,Media,Femenino,Propia,Casado,Cerca,108100,1,Alto,12,2365.1
7,Media,Masculino,Propia,Casado,Cerca,107300,2,Medio,18,2347.5
8,Media,Femenino,Alquilada,Casado,Cerca,79500,0,Alto,12,1735.2
9,Joven,Masculino,Alquilada,Soltero,Cerca,17900,0,,18,378.4


In [78]:
df_nulos_imputados = df_nulos_imputados.reset_index(drop=True)
df_data = df_data.reset_index(drop=True)

In [79]:
data_marketing_final = pd.concat([df_data,df_nulos_imputados],axis=0)

In [80]:
len(data_marketing_final)

1000

In [81]:
data_marketing_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 12
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Edad       1000 non-null   object 
 1   Genero     1000 non-null   object 
 2   Vivienda   1000 non-null   object 
 3   Ecivil     1000 non-null   object 
 4   Ubicacion  1000 non-null   object 
 5   Salario    1000 non-null   int64  
 6   Hijos      1000 non-null   int64  
 7   Historial  697 non-null    object 
 8   Catalogos  1000 non-null   int64  
 9   Monto      1000 non-null   float64
dtypes: float64(1), int64(3), object(6)
memory usage: 85.9+ KB


In [82]:
df_Marketing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Edad       1000 non-null   object 
 1   Genero     1000 non-null   object 
 2   Vivienda   1000 non-null   object 
 3   Ecivil     1000 non-null   object 
 4   Ubicacion  1000 non-null   object 
 5   Salario    1000 non-null   int64  
 6   Hijos      1000 non-null   int64  
 7   Historial  697 non-null    object 
 8   Catalogos  1000 non-null   int64  
 9   Monto      987 non-null    float64
dtypes: float64(1), int64(3), object(6)
memory usage: 78.2+ KB


**Imputación usando árboles de decisión**

In [84]:
is_null = pd.isna(data_marketing_final.loc[:,"Historial"])
df_nulos = data_marketing_final.loc[is_null]

In [85]:
len(df_nulos)

303

In [87]:
is_null2=is_null.replace({True:False, False:True})

In [88]:
df_data = data_marketing_final.loc[is_null2]

In [89]:
len(df_data)

697

In [90]:
x_entre=df_data[["Edad","Genero","Ecivil"]]
y_entre=df_data[["Historial"]]

In [91]:
x_test=df_nulos[["Edad","Genero","Ecivil"]]
y_test=df_nulos[["Historial"]]

In [93]:
##Codificar
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [94]:
d=defaultdict(LabelEncoder)

In [95]:
fit = x_entre.apply(lambda x: d[x.name].fit_transform(x))

In [96]:
x_entre_2=x_entre.apply(lambda x: d[x.name].transform(x))

In [97]:
x_entre_2.head()

Unnamed: 0,Edad,Genero,Ecivil
0,0,0,1
1,2,1,1
2,1,0,1
3,2,1,0
4,1,1,0


In [113]:
from sklearn import tree

In [115]:
tree_dt = tree.DecisionTreeClassifier(criterion='entropy',
                                            min_samples_split=20,
                                            min_samples_leaf=5,
                                            max_depth = 4,
                                            random_state=2021
                                           )

In [116]:
tree_dt.fit(x_entre_2,y_entre)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=2021, splitter='best')

In [117]:
x_test_2 = x_test.apply(lambda x: d[x.name].transform(x))

In [119]:
y_predichos = tree_dt.predict(x_test_2)

In [129]:
y_predichos

array(['Alto', 'Medio', 'Bajo', 'Medio', 'Bajo', 'Alto', 'Alto', 'Alto',
       'Alto', 'Alto', 'Medio', 'Alto', 'Bajo', 'Bajo', 'Medio', 'Alto',
       'Bajo', 'Bajo', 'Medio', 'Bajo', 'Medio', 'Alto', 'Bajo', 'Alto',
       'Medio', 'Bajo', 'Bajo', 'Bajo', 'Alto', 'Alto', 'Bajo', 'Bajo',
       'Bajo', 'Alto', 'Bajo', 'Bajo', 'Bajo', 'Alto', 'Alto', 'Bajo',
       'Bajo', 'Bajo', 'Medio', 'Bajo', 'Medio', 'Bajo', 'Alto', 'Bajo',
       'Bajo', 'Alto', 'Bajo', 'Alto', 'Bajo', 'Bajo', 'Alto', 'Bajo',
       'Alto', 'Bajo', 'Medio', 'Alto', 'Bajo', 'Medio', 'Bajo', 'Alto',
       'Bajo', 'Alto', 'Bajo', 'Bajo', 'Bajo', 'Alto', 'Bajo', 'Medio',
       'Alto', 'Alto', 'Alto', 'Bajo', 'Bajo', 'Bajo', 'Bajo', 'Alto',
       'Bajo', 'Medio', 'Medio', 'Bajo', 'Medio', 'Bajo', 'Alto', 'Alto',
       'Bajo', 'Alto', 'Medio', 'Medio', 'Medio', 'Bajo', 'Medio', 'Bajo',
       'Alto', 'Medio', 'Bajo', 'Bajo', 'Bajo', 'Alto', 'Alto', 'Bajo',
       'Alto', 'Bajo', 'Alto', 'Medio', 'Bajo', 'Alto', '

In [121]:
df_y_predichos = pd.DataFrame(data=y_predichos,columns=["Historial"])

In [122]:
df_nulos = df_nulos.drop(["Historial"], axis=1)

In [123]:
df_y_predichos=df_y_predichos.reset_index(drop=True)
df_nulos=df_nulos.reset_index(drop=True)

In [124]:
df_nulos_imputados = pd.concat([df_nulos,df_y_predichos], axis=1)

In [125]:
df_nulos_imputados.isnull().sum()

Edad         0
Genero       0
Vivienda     0
Ecivil       0
Ubicacion    0
Salario      0
Hijos        0
Catalogos    0
Monto        0
Historial    0
dtype: int64

In [126]:
df_data = df_data.reset_index(drop=True)
df_nulos_imputados =df_nulos_imputados.reset_index(drop=True)

In [127]:
df_marketing_final_final = pd.concat([df_data,df_nulos_imputados],axis=0)

In [128]:
df_marketing_final_final.isnull().sum()/len(df_marketing_final_final)*100

Edad         0.0
Genero       0.0
Vivienda     0.0
Ecivil       0.0
Ubicacion    0.0
Salario      0.0
Hijos        0.0
Historial    0.0
Catalogos    0.0
Monto        0.0
dtype: float64

In [130]:
df_marketing_final_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 302
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Edad       1000 non-null   object 
 1   Genero     1000 non-null   object 
 2   Vivienda   1000 non-null   object 
 3   Ecivil     1000 non-null   object 
 4   Ubicacion  1000 non-null   object 
 5   Salario    1000 non-null   int64  
 6   Hijos      1000 non-null   int64  
 7   Historial  1000 non-null   object 
 8   Catalogos  1000 non-null   int64  
 9   Monto      1000 non-null   float64
dtypes: float64(1), int64(3), object(6)
memory usage: 85.9+ KB
