In [227]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from pandas.api.types import is_numeric_dtype

In [314]:
# Load the raw PCOS table; the relative path is resolved against the
# kernel's current working directory.
pcos_dataset = pd.read_csv(
    filepath_or_buffer='pcos_dataset_with_missing_values.csv',
)
# Bare last expression -> rich (truncated) preview of the frame.
pcos_dataset

Unnamed: 0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),Antral_Follicle_Count,PCOS_Diagnosis
0,24.0,34.7,1,25.2,20,0
1,37.0,26.4,0,57.1,'one',0
2,32.0,23.6,0,92.7,28,0
3,28.0,28.8,0,63.1,26,0
4,25.0,22.1,1,59.8,8,0
...,...,...,...,...,...,...
996,34.0,18.4,1,95.7,23,0
997,45.0,28.9,1,28.5,7,0
998,37.0,28.3,0,32.4,28,0
999,41.0,27.3,0,95.6,9,0


In [315]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Only numeric columns are summarised: in the recorded output below
# Antral_Follicle_Count is absent because it had not yet been converted to a
# numeric dtype when this cell ran (pcos_dataset.dtypes shows the column types).
pcos_dataset.describe()

Unnamed: 0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),PCOS_Diagnosis
count,1000.0,1001.0,1001.0,1001.0,1001.0
mean,31.771,26.395305,0.53047,60.124575,0.198801
std,8.463462,4.940064,0.49932,23.174978,0.399298
min,18.0,18.1,0.0,20.0,0.0
25%,24.0,21.9,0.0,41.7,0.0
50%,32.0,26.4,1.0,60.0,0.0
75%,39.0,30.5,1.0,80.3,0.0
max,45.0,35.0,1.0,99.8,1.0


In [321]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
# NOTE(review): the recorded output lists Antral_Follicle_Count as float64, so
# this cell was apparently executed after the pd.to_numeric coercion cell that
# appears further down; on a fresh top-to-bottom run the column would
# presumably still be non-numeric at this point -- confirm with Run All.
pcos_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        1000 non-null   float64
 1   BMI                        1001 non-null   float64
 2   Menstrual_Irregularity     1001 non-null   int64  
 3   Testosterone_Level(ng/dL)  1001 non-null   float64
 4   Antral_Follicle_Count      999 non-null    float64
 5   PCOS_Diagnosis             1001 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 47.1 KB


In [319]:
# Per column: True when at least one entry is missing (NaN/None).
pcos_dataset.isna().any()

Age                           True
BMI                          False
Menstrual_Irregularity       False
Testosterone_Level(ng/dL)    False
Antral_Follicle_Count         True
PCOS_Diagnosis               False
dtype: bool

In [318]:
# Antral_Follicle_Count contains at least one non-numeric entry (row 1 shows
# 'one' in the load preview), so the column is not numeric after reading the
# CSV.  is_numeric_dtype(pcos_dataset['Antral_Follicle_Count']) can be used to
# check this.
#
# Convert the column to numbers; errors='coerce' turns every value that cannot
# be parsed into NaN instead of raising.
follicle_counts = pd.to_numeric(
    pcos_dataset['Antral_Follicle_Count'],
    errors='coerce',
)
# Tip: to list the offending raw entries, inspect the original column where
# follicle_counts.isna() is True *before* the assignment below overwrites it.
pcos_dataset['Antral_Follicle_Count'] = follicle_counts

# One simple way to fill the resulting gaps would be .fillna(0) on this
# column; this notebook uses an imputer later instead.

# Display the converted column (kept as the last expression of the cell).
pcos_dataset['Antral_Follicle_Count']

0       20.0
1        NaN
2       28.0
3       26.0
4        8.0
        ... 
996     23.0
997      7.0
998     28.0
999      9.0
1000     7.0
Name: Antral_Follicle_Count, Length: 1001, dtype: float64

In [322]:
# Sanity check: after the pd.to_numeric coercion the column should report a
# numeric dtype (expected result: True).
is_numeric_dtype(pcos_dataset['Antral_Follicle_Count'])

True

In [324]:
# Mean imputation: every missing entry is replaced by the mean of its column.
# (A fixed fill value is also possible, e.g.
#  SimpleImputer(strategy='constant', fill_value=24).)
imputer = SimpleImputer(strategy='mean')

# strategy='mean' needs numeric data, so keep only the numeric columns.
pcos_nums = pcos_dataset.select_dtypes(include=[np.number])

# Report whether any fully identical rows exist, then drop them so a repeated
# row does not count twice in the learned means.  The name is rebound instead
# of using inplace=True, which avoids mutating a frame derived from
# pcos_dataset in place; the original row index labels are kept.
print(pcos_nums.duplicated().any())
pcos_nums = pcos_nums.drop_duplicates()

# Learn one fill value per numeric column (all columns, not just 'Age').
imputer.fit(pcos_nums)

# Learned fill values, one per column in pcos_nums.columns order.  With
# strategy='mean' this equals pcos_nums.mean().values (NaNs are skipped).
imputer.statistics_

True


array([31.77877878, 26.387     ,  0.53      , 60.1595    , 17.46793587,
        0.199     ])

In [325]:
# Apply the fitted imputer: missing entries are replaced by the per-column
# values stored in imputer.statistics_.  Despite its name, pcos_imputer holds
# the transformed data, not an imputer object; with scikit-learn's default
# output settings it is a plain NumPy array without column labels or index,
# which is why the next cell rebuilds a DataFrame around it.
pcos_imputer = imputer.transform(pcos_nums)

In [327]:
# Wrap the imputed values back into a labelled frame, reusing the column names
# and row index of the frame the imputer was applied to.
pd.DataFrame(
    data=pcos_imputer,
    index=pcos_nums.index,
    columns=pcos_nums.columns,
)

Unnamed: 0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),Antral_Follicle_Count,PCOS_Diagnosis
0,24.0,34.7,1.0,25.2,20.000000,0.0
1,37.0,26.4,0.0,57.1,17.467936,0.0
2,32.0,23.6,0.0,92.7,28.000000,0.0
3,28.0,28.8,0.0,63.1,26.000000,0.0
4,25.0,22.1,1.0,59.8,8.000000,0.0
...,...,...,...,...,...,...
996,34.0,18.4,1.0,95.7,23.000000,0.0
997,45.0,28.9,1.0,28.5,7.000000,0.0
998,37.0,28.3,0.0,32.4,28.000000,0.0
999,41.0,27.3,0.0,95.6,9.000000,0.0
