# Handling missing values

In [17]:
import pandas as pd

In [18]:
data = pd.read_csv("./Resources/blankValues.csv")

In [19]:
data.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [20]:
data.isnull().sum()

A     0
B     0
C     1
D     1
dtype: int64

In [21]:
data.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


### Eliminating features with missing values

In [22]:
data.head()

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,,8.0
2,10,11,12.0,


In [23]:
data1 = data.dropna(axis=0) # tmp delete the values... axis=0 delete the rows

In [24]:
data1 = data.dropna(axis=1) # tmp delete the values.. axis=1 delete the columns
data1

Unnamed: 0,A,B
0,1,2
1,5,6
2,10,11


In [25]:
data.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0


In [26]:
data.dropna(axis=1,thresh=3)

Unnamed: 0,A,B
0,1,2
1,5,6
2,10,11


In [27]:
data.dropna(subset=['C'])

KeyError: ['C']

### imputing missing values

In [28]:
from sklearn.impute import SimpleImputer

In [29]:
# pip install scikit-learn

In [30]:
import numpy as np

In [31]:
imr = SimpleImputer(missing_values=np.nan,strategy='mean')
imr = imr.fit(data.values)
imputed_data = imr.transform(data.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

### fill values

##### 2 methods-simple imputer, fillna

In [32]:
data.fillna(data.mean())

Unnamed: 0,A,B,C,D
0,1,2,3.0,4.0
1,5,6,7.5,8.0
2,10,11,12.0,6.0


In [33]:
dt = pd.read_csv("./Resources/ColorShape.csv")
dt

Unnamed: 0,Color,Size,Shape
0,red,S,Circle
1,Yellow,M,Square
2,Green,L,Triangle
3,Green,M,ok
4,,,Square


In [34]:
from sklearn.impute import SimpleImputer

In [35]:
impute=SimpleImputer(strategy='most_frequent')

In [36]:
imputed_data = impute.fit_transform(dt)
imputed_data

array([['red', 'S', 'Circle'],
       ['Yellow', 'M', 'Square'],
       ['Green', 'L', 'Triangle'],
       ['Green', 'M', 'ok'],
       ['Green', 'M', 'Square']], dtype=object)

In [37]:
imputed_df = pd.DataFrame(imputed_data,columns=dt.columns)
imputed_df

Unnamed: 0,Color,Size,Shape
0,red,S,Circle
1,Yellow,M,Square
2,Green,L,Triangle
3,Green,M,ok
4,Green,M,Square


In [38]:
df = pd.DataFrame([['green','M',10.1,'class 1'],['red','L', 13.5,'class 2'],['blue','XL',15.3,'class 1']])
df

Unnamed: 0,0,1,2,3
0,green,M,10.1,class 1
1,red,L,13.5,class 2
2,blue,XL,15.3,class 1


In [39]:
df.columns=['color','size','price', 'classLabel']
df

Unnamed: 0,color,size,price,classLabel
0,green,M,10.1,class 1
1,red,L,13.5,class 2
2,blue,XL,15.3,class 1


### features
#### ordinal feature
#### nominal feature
#### numerical feature
#### class labels feature

In [40]:
size_mapping={'XL':3,'L':2, 'M':1}

In [41]:
df['size']=df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classLabel
0,green,1,10.1,class 1
1,red,2,13.5,class 2
2,blue,3,15.3,class 1


### mapping of class label using labelEncoder

In [42]:
from sklearn.preprocessing import LabelEncoder

In [43]:
le = LabelEncoder()

In [44]:
y = le.fit_transform(df['classLabel'].values)
y

array([0, 1, 0])

In [45]:
actual_value = le.inverse_transform(y)
actual_value

array(['class 1', 'class 2', 'class 1'], dtype=object)

#### Mapping nominal feature

In [46]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [47]:
data = pd.read_csv("./Resources/ColorShape.csv")
data

Unnamed: 0,Color,Size,Shape
0,red,S,Circle
1,Yellow,M,Square
2,Green,L,Triangle
3,Green,M,ok
4,,,Square


In [48]:
o = OneHotEncoder()

In [49]:
x = data[['Color']]
x

Unnamed: 0,Color
0,red
1,Yellow
2,Green
3,Green
4,


In [50]:
o.fit_transform(x).toarray()

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [51]:
data.Color

0       red
1    Yellow
2     Green
3     Green
4       NaN
Name: Color, dtype: object

In [52]:
pd.get_dummies(data.Color)

Unnamed: 0,Green,Yellow,red
0,False,False,True
1,False,True,False
2,True,False,False
3,True,False,False
4,False,False,False


In [53]:
dummy = pd.get_dummies(data[['Size','Color']]).astype(int)

#### Features
Normalization

In [54]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [55]:
mm = MinMaxScaler()

In [56]:
data = pd.read_csv("./Resources/ColorShape.csv")
data

Unnamed: 0,Color,Size,Shape
0,red,S,Circle
1,Yellow,M,Square
2,Green,L,Triangle
3,Green,M,ok
4,,,Square


In [57]:
d2 = dummy
d2

Unnamed: 0,Size_L,Size_M,Size_S,Color_Green,Color_Yellow,Color_red
0,0,0,1,0,0,1
1,0,1,0,0,1,0
2,1,0,0,1,0,0
3,0,1,0,1,0,0
4,0,0,0,0,0,0


In [58]:
xScaler = mm.fit_transform(d2)
xScaler

array([[0., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
ss = StandardScaler()

In [61]:
XscaleValues = ss.fit_transform(d2)
XscaleValues

array([[-0.5       , -0.81649658,  2.        , -0.81649658, -0.5       ,
         2.        ],
       [-0.5       ,  1.22474487, -0.5       , -0.81649658,  2.        ,
        -0.5       ],
       [ 2.        , -0.81649658, -0.5       ,  1.22474487, -0.5       ,
        -0.5       ],
       [-0.5       ,  1.22474487, -0.5       ,  1.22474487, -0.5       ,
        -0.5       ],
       [-0.5       , -0.81649658, -0.5       , -0.81649658, -0.5       ,
        -0.5       ]])

In [62]:
from sklearn.preprocessing import Binarizer

In [64]:
b = Binarizer(threshold=0.5)
b_data = Binarizer.fit_transform(d2)
b_data

TypeError: TransformerMixin.fit_transform() missing 1 required positional argument: 'X'

## problem

In [65]:
import pandas as pd

In [14]:
data = pd.read_csv("./Resources/Dataset1.csv")
data

Unnamed: 0,AGE,HeathyEating,Active_lifestyle,Salary
0,36,5,5,2297.0
1,55,3,5,1134.0
2,61,8,1,4969.0
3,29,3,6,
4,34,6,2,3574.0
5,61,5,1,1134.0
6,61,5,1,1134.0


In [67]:
data.isnull() # check which are null

Unnamed: 0,AGE,HeathyEating,Active_lifestyle,Salary
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,True
4,False,False,False,False


In [12]:
from sklearn.impute import SimpleImputer
import numpy as np

In [17]:
imr = SimpleImputer(missing_values=np.nan, strategy="mean")
imr = imr.fit(data.values)
imr_data = imr.transform(data.values)
data.fillna(data.mean())

Unnamed: 0,AGE,HeathyEating,Active_lifestyle,Salary
0,36,5,5,2297.0
1,55,3,5,1134.0
2,61,8,1,4969.0
3,29,3,6,2373.666667
4,34,6,2,3574.0
5,61,5,1,1134.0
6,61,5,1,1134.0


In [None]:
# Write a difference b/w OneHotEncoder and LabelEncoder
# which featrue scaling method is best suitable for scaling
# -> Binarizer and Standarization

## Removing Duplicate values

In [18]:
import numpy as np
import pandas as pd

In [19]:
data = pd.read_csv("./Resources/Dataset1.csv")
data

Unnamed: 0,AGE,HeathyEating,Active_lifestyle,Salary
0,36,5,5,2297.0
1,55,3,5,1134.0
2,61,8,1,4969.0
3,29,3,6,
4,34,6,2,3574.0
5,61,5,1,1134.0
6,61,5,1,1134.0


In [20]:
dup = data[data.duplicated()]
dup

Unnamed: 0,AGE,HeathyEating,Active_lifestyle,Salary
6,61,5,1,1134.0


In [21]:
df = data.drop(dup.index, axis=0)
df

Unnamed: 0,AGE,HeathyEating,Active_lifestyle,Salary
0,36,5,5,2297.0
1,55,3,5,1134.0
2,61,8,1,4969.0
3,29,3,6,
4,34,6,2,3574.0
5,61,5,1,1134.0


In [None]:
data = pd.read_csv("./Resources/Iris-f.csv")
data