Identifying missing values in tabular data

In [1]:
import pandas as pd
from io import StringIO

In [7]:
# Inline CSV sample with two missing cells (row 2 / column C, row 3 / column D).
# The text must start at column 0: any indentation inside the triple-quoted
# literal is part of the data, and read_csv would keep it in the first header
# name ('  A' instead of 'A').
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
'''

In [8]:
# Wrap the CSV text in an in-memory buffer so read_csv can parse it like a file.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))

In [9]:
# Bare expression: the cell's output is the frame parsed from csv_data.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [10]:
# Count the missing cells in each column.
df.isna().sum()

  A    0
B      0
C      1
D      1
dtype: int64

In [12]:
# The frame's contents as a NumPy array; this is the form later passed to the
# imputer's fit/transform calls.
df.values

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,  nan,   8.],
       [ 10.,  11.,  12.,  nan]])

Eliminating samples or features with missing values

In [14]:
# Drop every row that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [16]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [17]:
# Drop only the rows in which every value is missing (none here, so all rows stay).
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [18]:
# Keep only the rows that have at least 4 non-missing values.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [19]:
# Drop a row only when its value in column 'C' is missing.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


Imputing missing values

In [21]:
from sklearn.preprocessing import Imputer

In [22]:
Imr=Imputer(missing_values='NaN',strategy='mean',axis=0)

In [23]:
# Learn the per-column fill values from the data; fit returns the estimator itself.
Imr = Imr.fit(X=df.values)

In [24]:
# Fill the missing cells using the statistics learned during fit.
imputed_data = Imr.transform(X=df.values)

In [26]:
# Show the transformed array: the former NaNs in columns C and D now hold the
# column means (7.5 and 6.0).
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [ 10. ,  11. ,  12. ,   6. ]])

Handling categorical data

In [28]:
import pandas as pd

In [29]:
# Three example rows; the column names are assigned in a separate cell.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ]
)

In [32]:
# Name the four columns of the example frame.
df.columns = [
    'color',
    'size',
    'price',
    'class label',
]

In [33]:
# Ordinal encoding for the size labels: a larger size gets a larger integer.
size_mapping = dict(XL=3, L=2, M=1)

In [35]:
# Overwrite the 'size' column with the integer code that size_mapping assigns
# to each label.
# NOTE(review): the cell reads and writes the same column, so it is presumably
# not safe to run twice in a row — confirm before re-executing.
df['size']=df['size'].map(size_mapping)

In [36]:
# Display the frame; 'size' now holds the integers from size_mapping.
df

Unnamed: 0,color,size,price,class label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [38]:
# Swap keys and values so integer codes can be translated back to size labels.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))

In [39]:
# Translate the integer codes back to their string labels. The result is only
# displayed; df itself is not modified by this cell.
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

Encoding class labels

In [41]:
import numpy as np

In [42]:
# Give each distinct class label a consecutive integer, following the order
# in which np.unique returns the labels.
class_mapping = dict(
    (label, idx) for idx, label in enumerate(np.unique(df['class label']))
)

In [43]:
# Display the label-to-integer mapping.
class_mapping

{'class1': 0, 'class2': 1}

In [44]:
# Replace the string class labels in place with their integer codes.
# NOTE(review): the cell reads and writes the same column, so it is presumably
# not safe to run twice in a row — confirm before re-executing.
df['class label']=df['class label'].map(class_mapping)

In [45]:
# Display the frame; 'class label' now holds the integer codes.
df

Unnamed: 0,color,size,price,class label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [46]:
# Swap keys and values so integer codes can be translated back to class labels.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))

In [47]:
# Restore the string class labels by mapping the integer codes through
# inv_class_mapping; the column is overwritten in place.
df['class label']=df['class label'].map(inv_class_mapping)

In [48]:
# Display the frame; 'class label' is back to its string form.
df

Unnamed: 0,color,size,price,class label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [49]:
from sklearn.preprocessing import LabelEncoder

In [50]:
# Create the LabelEncoder that the following cells use to encode and decode
# the class labels.
class_le=LabelEncoder()

In [51]:
# Fit the encoder on the class-label column and encode it as integers in one step.
y = class_le.fit_transform(y=df['class label'].values)

In [52]:
# Display the encoded labels.
y

array([0, 1, 0], dtype=int64)

In [53]:
# Map the integer codes in y back to the original class-label strings
# (displayed only, not stored).
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)