# Handling missing value example

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sns
%matplotlib inline 


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# 1&2. load the data set  Mark Missing Values

In [34]:
df = pd.read_csv("D:\Data Science\Dataset\pima-indians-diabetes.csv", header=None)
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [35]:
# getting the null value count in every column
(df[[1,2,3,4,5,6,7,8]]==0).sum()

1      5
2     35
3    227
4    374
5     11
6      0
7      0
8    500
dtype: int64

In [36]:
# mark 0 values as missing or NaN
df[[1,2,3,4,5,6,7,8]] = df[[1,2,3,4,5,6,7,8]].replace(0, np.NaN)

# count the number of NaN value in each column
df.isnull().sum()

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8    500
dtype: int64

In [37]:
df.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,,33.6,0.627,50,1.0
1,1,85.0,66.0,29.0,,26.6,0.351,31,
2,8,183.0,64.0,,,23.3,0.672,32,1.0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1.0
5,5,116.0,74.0,,,25.6,0.201,30,
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1.0
7,10,115.0,,,,35.3,0.134,29,
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1.0
9,8,125.0,96.0,,,,0.232,54,1.0


# 3. Missing Values Causes Problems

In [38]:
# marks the missing values in the dataset, as we did in the previous section, then attempts to evaluate
# LDA( Linear Discriminant Analysis) using 3-fold cross validation and print the mean accuracy.


# split the dataset into input and output
values = df.values
values

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,     nan],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,     nan],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,     nan]])

In [39]:
x = values[:,0:8] # every row as list of 8 columns
y= values[:,8] # entire column as list

In [40]:
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, x,y,cv=kfold,scoring='accuracy')
result.mean()



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

# 4. Remove Rows With Missing Values

In [41]:
# drop rows with missing values
df.dropna(inplace=True)
print(df.isnull().sum()) # cross verify 

# summarize the number of rows and columns in the dataset
df.shape

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64


(130, 9)

In [42]:
# evaluate an LDA model on the dataset using k-fold cross validation

values = df.values

x = values[:,0:8] # every row as list of 8 columns
y= values[:,8] # entire column as list

model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, x,y,cv=kfold,scoring='accuracy')
print("result : ",result)
print("result mean : ",result.mean())

result :  [1. 1. 1.]
result mean :  1.0


  S**2))[:self._max_components]
  S**2))[:self._max_components]
  S**2))[:self._max_components]


# 5. Impute Missing Values
       Imputing refers to using a model to replace missing values.

In [49]:
df1 = pd.read_csv("D:\Data Science\Dataset\pima-indians-diabetes.csv", header=None)
df1[[1,2,3,4,5,6,7,8]] = df1[[1,2,3,4,5,6,7,8]].replace(0,np.NaN)
df1.fillna(df.mean(), inplace =True)

df1.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64