# Handling Missing Value on Diabetes Dataset

In [13]:
# Import necessary libraries, loading the diabetes dataset and displaying info about dataset
import pandas as pd
import numpy as np
df = pd.read_csv("/content/diabetes.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [14]:
# Displaying the first five rows of the dataset
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
# Replacing 0 values with NaN in specific columns
df_with_nan = df.iloc[:,[0,1,2,3,4,5,6,7,8]].replace(0,np.nan)
df_with_nan.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,,33.6,0.627,50,1.0
1,1.0,85.0,66.0,29.0,,26.6,0.351,31,
2,8.0,183.0,64.0,,,23.3,0.672,32,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,
4,,137.0,40.0,35.0,168.0,43.1,2.288,33,1.0


In [16]:
# Counting the number of missing values in each column
df_with_nan.isnull().sum()

Unnamed: 0,0
Pregnancies,111
Glucose,5
BloodPressure,35
SkinThickness,227
Insulin,374
BMI,11
DiabetesPedigreeFunction,0
Age,0
Outcome,500


In [17]:
# Calculating the percentage of missing values in each column
df_with_nan.isnull().sum()*100/len(df)

Unnamed: 0,0
Pregnancies,14.453125
Glucose,0.651042
BloodPressure,4.557292
SkinThickness,29.557292
Insulin,48.697917
BMI,1.432292
DiabetesPedigreeFunction,0.0
Age,0.0
Outcome,65.104167


In [18]:
# Filling missing values with -999
df_with_nan.fillna(-999)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,-999.0,33.6,0.627,50,1.0
1,1.0,85.0,66.0,29.0,-999.0,26.6,0.351,31,-999.0
2,8.0,183.0,64.0,-999.0,-999.0,23.3,0.672,32,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,-999.0
4,-999.0,137.0,40.0,35.0,168.0,43.1,2.288,33,1.0
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63,-999.0
764,2.0,122.0,70.0,27.0,-999.0,36.8,0.340,27,-999.0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30,-999.0
766,1.0,126.0,60.0,-999.0,-999.0,30.1,0.349,47,1.0


In [19]:
# Filling missing values with NaN
df_with_nan.fillna(np.nan)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,,33.6,0.627,50,1.0
1,1.0,85.0,66.0,29.0,,26.6,0.351,31,
2,8.0,183.0,64.0,,,23.3,0.672,32,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,
4,,137.0,40.0,35.0,168.0,43.1,2.288,33,1.0
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63,
764,2.0,122.0,70.0,27.0,,36.8,0.340,27,
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30,
766,1.0,126.0,60.0,,,30.1,0.349,47,1.0


In [20]:
# Filling missing values with "Data Missing"
df_with_nan.fillna("Data Missing")

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,Data Missing,33.6,0.627,50,1.0
1,1.0,85.0,66.0,29.0,Data Missing,26.6,0.351,31,Data Missing
2,8.0,183.0,64.0,Data Missing,Data Missing,23.3,0.672,32,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,Data Missing
4,Data Missing,137.0,40.0,35.0,168.0,43.1,2.288,33,1.0
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63,Data Missing
764,2.0,122.0,70.0,27.0,Data Missing,36.8,0.340,27,Data Missing
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30,Data Missing
766,1.0,126.0,60.0,Data Missing,Data Missing,30.1,0.349,47,1.0


In [21]:
# Dropping rows with missing values
df_with_nan.dropna(axis=0)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26,1.0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53,1.0
13,1.0,189.0,60.0,23.0,846.0,30.1,0.398,59,1.0
14,5.0,166.0,72.0,19.0,175.0,25.8,0.587,51,1.0
19,1.0,115.0,70.0,30.0,96.0,34.6,0.529,32,1.0
...,...,...,...,...,...,...,...,...,...
730,3.0,130.0,78.0,23.0,79.0,28.4,0.323,34,1.0
732,2.0,174.0,88.0,37.0,120.0,44.5,0.646,24,1.0
740,11.0,120.0,80.0,37.0,150.0,42.3,0.785,48,1.0
748,3.0,187.0,70.0,22.0,200.0,36.4,0.408,36,1.0


In [22]:
# Dropping columns with missing values
df_with_nan.dropna(axis=1)

Unnamed: 0,DiabetesPedigreeFunction,Age
0,0.627,50
1,0.351,31
2,0.672,32
3,0.167,21
4,2.288,33
...,...,...
763,0.171,63
764,0.340,27
765,0.245,30
766,0.349,47


In [24]:
# Dropping columns with less than 300 non-missing values
df_with_nan.dropna(axis=1, thresh =300)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,,33.6,0.627,50
1,1.0,85.0,66.0,29.0,,26.6,0.351,31
2,8.0,183.0,64.0,,,23.3,0.672,32
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21
4,,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2.0,122.0,70.0,27.0,,36.8,0.340,27
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1.0,126.0,60.0,,,30.1,0.349,47


In [25]:
# Filling missing values with the mean of each column
df_with_nan.fillna(df_with_nan.mean())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.000000,148.0,72.0,35.00000,155.548223,33.6,0.627,50,1.0
1,1.000000,85.0,66.0,29.00000,155.548223,26.6,0.351,31,1.0
2,8.000000,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1.0
3,1.000000,89.0,66.0,23.00000,94.000000,28.1,0.167,21,1.0
4,4.494673,137.0,40.0,35.00000,168.000000,43.1,2.288,33,1.0
...,...,...,...,...,...,...,...,...,...
763,10.000000,101.0,76.0,48.00000,180.000000,32.9,0.171,63,1.0
764,2.000000,122.0,70.0,27.00000,155.548223,36.8,0.340,27,1.0
765,5.000000,121.0,72.0,23.00000,112.000000,26.2,0.245,30,1.0
766,1.000000,126.0,60.0,29.15342,155.548223,30.1,0.349,47,1.0
