In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from pylab import rcParams
from skimpy import clean_columns

rcParams["figure.figsize"] = (12,8)
%matplotlib inline

sns.set()

In [2]:
# Read the two source tables from the working directory:
#   fert   <- pcos_fertility.csv
#   infert <- sheet "Full_new" of pcos_no_infertility.xlsx
fert = pd.read_csv(filepath_or_buffer="pcos_fertility.csv")
infert = pd.read_excel(
    io="pcos_no_infertility.xlsx",
    sheet_name="Full_new",
)

In [3]:
# Preview the first five rows of `fert` as a sanity check on the CSV load.
fert.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),I beta-HCG(mIU/mL),II beta-HCG(mIU/mL),AMH(ng/mL)
0,1,10001,0,1.99,1.99,2.07
1,2,10002,0,60.8,1.99,1.53
2,3,10003,1,494.08,494.08,6.63
3,4,10004,0,1.99,1.99,1.22
4,5,10005,0,801.45,801.45,2.26


In [4]:
# Column dtypes and non-null counts for `fert`.
# NOTE(review): the output lists AMH(ng/mL) as object rather than float64 —
# presumably a non-numeric entry in the CSV; confirm before using it numerically.
# The two beta-HCG headers also carry stray padding whitespace.
fert.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3     I   beta-HCG(mIU/mL)  541 non-null    float64
 4   II    beta-HCG(mIU/mL)  541 non-null    float64
 5   AMH(ng/mL)              541 non-null    object 
dtypes: float64(2), int64(3), object(1)
memory usage: 25.5+ KB


In [5]:
# Preview the first five rows of `infert`, transposed so that each of its many
# columns is shown as a row instead of being cut off horizontally.
infert.head().T

Unnamed: 0,0,1,2,3,4
Sl. No,1.0,2.0,3.0,4.0,5.0
Patient File No.,1.0,2.0,3.0,4.0,5.0
PCOS (Y/N),0.0,0.0,1.0,0.0,0.0
Age (yrs),28.0,36.0,33.0,37.0,25.0
Weight (Kg),44.6,65.0,68.8,65.0,52.0
Height(Cm),152.0,161.5,165.0,148.0,161.0
BMI,19.3,24.921163,25.270891,29.674945,20.060954
Blood Group,15.0,15.0,11.0,13.0,11.0
Pulse rate(bpm),78.0,74.0,72.0,72.0,72.0
RR (breaths/min),22.0,20.0,18.0,20.0,18.0


In [6]:
# Column dtypes and non-null counts for `infert`.
infert.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 45 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3    Age (yrs)              541 non-null    int64  
 4   Weight (Kg)             541 non-null    float64
 5   Height(Cm)              541 non-null    float64
 6   BMI                     541 non-null    float64
 7   Blood Group             541 non-null    int64  
 8   Pulse rate(bpm)         541 non-null    int64  
 9   RR (breaths/min)        541 non-null    int64  
 10  Hb(g/dl)                541 non-null    float64
 11  Cycle(R/I)              541 non-null    int64  
 12  Cycle length(days)      541 non-null    int64  
 13  Marraige Status (Yrs)   540 non-null    float64
 14  Pregnant(Y/N)           541 non-null    in

In [7]:
# Left-join `fert` onto `infert` using the serial number as the key.
# Non-key columns that exist in both inputs are disambiguated with the
# "_x" (from infert) and "_y" (from fert) suffixes.
pcos = pd.merge(
    infert,
    fert,
    how="left",
    on="Sl. No",
    suffixes=("_x", "_y"),
)
pcos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541 entries, 0 to 540
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Sl. No                    541 non-null    int64  
 1   Patient File No._x        541 non-null    int64  
 2   PCOS (Y/N)_x              541 non-null    int64  
 3    Age (yrs)                541 non-null    int64  
 4   Weight (Kg)               541 non-null    float64
 5   Height(Cm)                541 non-null    float64
 6   BMI                       541 non-null    float64
 7   Blood Group               541 non-null    int64  
 8   Pulse rate(bpm)           541 non-null    int64  
 9   RR (breaths/min)          541 non-null    int64  
 10  Hb(g/dl)                  541 non-null    float64
 11  Cycle(R/I)                541 non-null    int64  
 12  Cycle length(days)        541 non-null    int64  
 13  Marraige Status (Yrs)     540 non-null    float64
 14  Pregnant(Y

In [8]:
# Column name clean-up: pass the merged frame through skimpy's clean_columns.
# The summaries below list the resulting names in snake_case
# (e.g. "PCOS (Y/N)_x" appears as "pcos_y_n_x").
pcos_clean = clean_columns(pcos)


In [9]:
# Names of the columns that contain at least one missing value, in frame order.
pcos_clean.columns[pcos_clean.isnull().any().to_numpy()].tolist()

['marraige_status_yrs', 'fast_food_y_n', 'unnamed_44']

In [10]:
# Per-column missing-value summary: absolute count ("Null") and the share of
# rows it represents, in percent ("%Null").
(
    pcos_clean.isnull()
    .sum()
    .to_frame(name="Null")
    .assign(**{"%Null": lambda counts: (counts["Null"] / pcos_clean.shape[0]) * 100})
)

Unnamed: 0,Null,%Null
sl_no,0,0.0
patient_file_no_x,0,0.0
pcos_y_n_x,0,0.0
age_yrs,0,0.0
weight_kg,0,0.0
height_cm,0,0.0
bmi,0,0.0
blood_group,0,0.0
pulse_rate_bpm,0,0.0
rr_breaths_min,0,0.0


In [11]:
# Remove columns where more than 90% of the values are missing.
# Columns are selected by their null fraction instead of hard-coding a name and
# dropping in place, which keeps the cell idempotent: on a re-run nothing is
# left above the threshold, so nothing is dropped and no KeyError is raised.
MAX_NULL_FRACTION = 0.9
null_fraction = pcos_clean.isnull().mean()
sparse_cols = null_fraction[null_fraction > MAX_NULL_FRACTION].index
pcos_clean = pcos_clean.drop(columns=sparse_cols)

# Re-display the missing-value summary to confirm what remains.
pd.DataFrame({"Null": pcos_clean.isnull().sum(), "%Null" : (pcos_clean.isnull().sum()/pcos_clean.shape[0])*100})


Unnamed: 0,Null,%Null
sl_no,0,0.0
patient_file_no_x,0,0.0
pcos_y_n_x,0,0.0
age_yrs,0,0.0
weight_kg,0,0.0
height_cm,0,0.0
bmi,0,0.0
blood_group,0,0.0
pulse_rate_bpm,0,0.0
rr_breaths_min,0,0.0


In [40]:
# Check whether the "_x" and "_y" copies of the PCOS flag hold identical data.
# NOTE(review): this is not the last expression in the cell, so the notebook
# does not display its result.
pcos_clean["pcos_y_n_x"].equals(pcos_clean["pcos_y_n_y"])

def equal_cols(col_1, col_2, df=None):
    """Return whether two columns of a DataFrame hold identical data.

    Parameters
    ----------
    col_1, col_2 : str
        Names of the two columns to compare.
    df : pandas.DataFrame, optional
        Frame that contains both columns. When omitted, the notebook-level
        ``pcos_clean`` frame is used.

    Returns
    -------
    bool
        The result of ``df[col_1].equals(df[col_2])``: ``True`` when both
        columns have the same values and dtype (missing values in the same
        positions compare equal), ``False`` otherwise.

    Raises
    ------
    KeyError
        If either column name is not present in the frame.
    """
    if df is None:
        # Looked up at call time so the function always sees the current frame.
        df = pcos_clean
    # Compare the two requested columns directly. Iterating over the frame
    # yields column labels, not pairs, so no loop is needed here.
    return bool(df[col_1].equals(df[col_2]))