In [1]:
from collections import Counter
import numpy as np
import pandas as pd

In [2]:
# Load the data

data = pd.read_csv('dataset.csv', sep=',')
pd.options.mode.chained_assignment = None
len_data = len(data)

In [3]:
# Show generic information

print('-------------------------------')
data.info()
print('-------------------------------')
print('Count of records: {}'.format(len_data))
print('-------------------------------')

-------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1665 entries, 0 to 1664
Data columns (total 80 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   ClientID                    1665 non-null   object
 1   CycleNumber                 1665 non-null   int64 
 2   Group                       1665 non-null   int64 
 3   CycleWithPeakorNot          1665 non-null   int64 
 4   ReproductiveCategory        1665 non-null   int64 
 5   LengthofCycle               1665 non-null   int64 
 6   MeanCycleLength             1665 non-null   object
 7   EstimatedDayofOvulation     1665 non-null   object
 8   LengthofLutealPhase         1665 non-null   object
 9   FirstDayofHigh              1665 non-null   object
 10  TotalNumberofHighDays       1665 non-null   object
 11  TotalHighPostPeak           1665 non-null   object
 12  TotalNumberofPeakDays       1665 non-null   object
 13  TotalDaysofFerti

In [4]:
# +----------+
# | Patients |
# +----------+


patient_data_row_counter = Counter()
for id, row in data['ClientID'].iteritems():
    patient_data_row_counter[row] += 1
per_user_rows = sorted(set(patient_data_row_counter.values()))
print('Unique patients: {}'.format(data['ClientID'].nunique()))
print('Data row count per patient between {} - {}'.format(per_user_rows[0], per_user_rows[-1]))

Unique patients: 159
Data row count per patient between 1 - 45


In [5]:
# +--------------+
# | Cycle length |
# +--------------+

print('Cycle length between {} - {}'.format(data['LengthofCycle'].min(), data['LengthofCycle'].max()))

Cycle length between 18 - 54


In [6]:
# +---------------------+
# | Luteal phase length |
# +---------------------+



clean_luteal_data = data[data['LengthofLutealPhase'] != ' ']
clean_luteal_data['LengthofLutealPhase'] = pd.to_numeric(clean_luteal_data['LengthofLutealPhase'], downcast='unsigned')
print('Luteal phase length between {} - {} days.'.format(clean_luteal_data['LengthofLutealPhase'].min(),
                                                         clean_luteal_data['LengthofLutealPhase'].max()))
print('--- {} records are good from {}.'.format(len(clean_luteal_data), len_data))

Luteal phase length between 1 - 41 days.
--- 1514 records are good from 1665.


In [7]:
# +-----------------+
# | Ovulation's day |
# +-----------------+



clean_ovulation_data = data[data['EstimatedDayofOvulation'] != ' ']
clean_ovulation_data['EstimatedDayofOvulation'] = pd.to_numeric(clean_ovulation_data['EstimatedDayofOvulation'], downcast='unsigned')
print('Estimated day of ovulation between {} - {} days.'.format(clean_ovulation_data['EstimatedDayofOvulation'].min(),
                                                                clean_ovulation_data['EstimatedDayofOvulation'].max()))
print('--- {} records are good from {}.'.format(len(clean_ovulation_data), len_data))

Estimated day of ovulation between 6 - 29 days.
--- 1515 records are good from 1665.


In [8]:
# +------------------+
# | Length of menses |
# +------------------+


clean_menses_data = data[data['LengthofMenses'] != ' ']
clean_menses_data['LengthofMenses'] = pd.to_numeric(clean_menses_data['LengthofMenses'], downcast='unsigned')
print('Length of menses between {} - {} days.'.format(clean_menses_data['LengthofMenses'].min(),
                                                      clean_menses_data['LengthofMenses'].max()))
print('--- {} records are good from {}.'.format(len(clean_menses_data), len_data))

Length of menses between 2 - 15 days.
--- 1661 records are good from 1665.


In [9]:
# +-----+
# | Age |
# +-----+


clean_age_data = data[data['AgeM'] != ' ']
clean_age_data['Age'] = pd.to_numeric(clean_age_data['Age'], downcast='unsigned')
print('Age between {} - {} years.'.format(clean_age_data['Age'].min(),
                                          clean_age_data['Age'].max()))
print('--- {} records are good from {}.'.format(len(clean_age_data), len_data))
age_of_patient = {}
for id, row in clean_age_data[['ClientID', 'Age']].iterrows():
    if row['ClientID'] not in age_of_patient.keys():
        age_of_patient[row['ClientID']] = row['Age']
    else:
        if age_of_patient[row['ClientID']] != row['Age']:
            print('[!] User {} has different age values old: {}, new {}.'.format(row['ClientID'],
                                                                                 age_of_patient[row['ClientID']],
                                                                                 row['Age']))
print('{} patients have age value.'.format(len(age_of_patient)))

Age between 21 - 43 years.
--- 142 records are good from 1665.
138 patients have age value.


In [10]:
#######################
# Creating clean data #
#######################


clean_data = data[data['LengthofLutealPhase'] != ' ']
clean_data = clean_data[clean_data['LengthofMenses'] != ' ']
clean_data = clean_data[clean_data['EstimatedDayofOvulation'] != ' ']

clean_data = clean_data[['ClientID', 'LengthofCycle', 'LengthofLutealPhase', 'LengthofMenses' , 'EstimatedDayofOvulation']]
for i, key in enumerate(patient_data_row_counter.keys()):
    clean_data = clean_data.replace(key, i)
print('-------------------------------')
clean_data.info()
print('-------------------------------')
print('Count of records: {}'.format(len(clean_data)))
print('-------------------------------')
clean_data.to_csv('clean_dataset.csv', sep=',', index=False)

-------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512 entries, 0 to 1663
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   ClientID                 1512 non-null   int64 
 1   LengthofCycle            1512 non-null   int64 
 2   LengthofLutealPhase      1512 non-null   object
 3   LengthofMenses           1512 non-null   object
 4   EstimatedDayofOvulation  1512 non-null   object
dtypes: int64(2), object(3)
memory usage: 70.9+ KB
-------------------------------
Count of records: 1512
-------------------------------
