In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from pylab import rcParams
from skimpy import clean_columns

# Default (width, height) applied to every matplotlib figure in this notebook.
rcParams["figure.figsize"] = (12,8)
# Render figures inline, directly below the cell that produces them.
%matplotlib inline

# Switch on seaborn's default plot styling.
# NOTE(review): confirm that sns.set() does not reset the figure size chosen
# above on the installed seaborn version.
sns.set()

In [2]:
# Read the two source tables: a CSV export and the "Full_new" sheet of the workbook.
fert = pd.read_csv(filepath_or_buffer="pcos_fertility.csv")
infert = pd.read_excel(
    io="pcos_no_infertility.xlsx",
    sheet_name="Full_new",
)

In [3]:
# Preview the first rows of the CSV table to see which columns it carries.
fert.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),I beta-HCG(mIU/mL),II beta-HCG(mIU/mL),AMH(ng/mL)
0,1,10001,0,1.99,1.99,2.07
1,2,10002,0,60.8,1.99,1.53
2,3,10003,1,494.08,494.08,6.63
3,4,10004,0,1.99,1.99,1.22
4,5,10005,0,801.45,801.45,2.26


In [4]:
# List column names, non-null counts and dtypes for `fert`.
# Watch the Dtype column: AMH(ng/mL) is reported as object rather than a numeric type.
fert.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3     I   beta-HCG(mIU/mL)  541 non-null    float64
 4   II    beta-HCG(mIU/mL)  541 non-null    float64
 5   AMH(ng/mL)              541 non-null    object 
dtypes: float64(2), int64(3), object(1)
memory usage: 25.5+ KB


In [5]:
# Transpose the preview so each `infert` column prints as a row; the sheet has
# 45 columns, which is too wide to read across.
infert.head().T

Unnamed: 0,0,1,2,3,4
Sl. No,1.0,2.0,3.0,4.0,5.0
Patient File No.,1.0,2.0,3.0,4.0,5.0
PCOS (Y/N),0.0,0.0,1.0,0.0,0.0
Age (yrs),28.0,36.0,33.0,37.0,25.0
Weight (Kg),44.6,65.0,68.8,65.0,52.0
Height(Cm),152.0,161.5,165.0,148.0,161.0
BMI,19.3,24.921163,25.270891,29.674945,20.060954
Blood Group,15.0,15.0,11.0,13.0,11.0
Pulse rate(bpm),78.0,74.0,72.0,72.0,72.0
RR (breaths/min),22.0,20.0,18.0,20.0,18.0


In [6]:
# List column names, non-null counts and dtypes for `infert`.
infert.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 45 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3    Age (yrs)              541 non-null    int64  
 4   Weight (Kg)             541 non-null    float64
 5   Height(Cm)              541 non-null    float64
 6   BMI                     541 non-null    float64
 7   Blood Group             541 non-null    int64  
 8   Pulse rate(bpm)         541 non-null    int64  
 9   RR (breaths/min)        541 non-null    int64  
 10  Hb(g/dl)                541 non-null    float64
 11  Cycle(R/I)              541 non-null    int64  
 12  Cycle length(days)      541 non-null    int64  
 13  Marraige Status (Yrs)   540 non-null    float64
 14  Pregnant(Y/N)           541 non-null    in

In [7]:
# Attach the `fert` columns to every `infert` row, matching on the serial number.
# Columns present in both tables are kept twice and tagged _x (infert) / _y (fert).
# validate="one_to_one" makes pandas raise MergeError if "Sl. No" repeats in
# either table, instead of silently duplicating patient rows in the left join.
pcos = infert.merge(
    fert,
    on="Sl. No",
    how="left",
    suffixes=("_x", "_y"),
    validate="one_to_one",
)
pcos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541 entries, 0 to 540
Data columns (total 50 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Sl. No                    541 non-null    int64  
 1   Patient File No._x        541 non-null    int64  
 2   PCOS (Y/N)_x              541 non-null    int64  
 3    Age (yrs)                541 non-null    int64  
 4   Weight (Kg)               541 non-null    float64
 5   Height(Cm)                541 non-null    float64
 6   BMI                       541 non-null    float64
 7   Blood Group               541 non-null    int64  
 8   Pulse rate(bpm)           541 non-null    int64  
 9   RR (breaths/min)          541 non-null    int64  
 10  Hb(g/dl)                  541 non-null    float64
 11  Cycle(R/I)                541 non-null    int64  
 12  Cycle length(days)        541 non-null    int64  
 13  Marraige Status (Yrs)     540 non-null    float64
 14  Pregnant(Y

In [8]:
# Column name clean-up.
# skimpy's clean_columns rewrites the headers into snake_case-style names
# (e.g. "Marraige Status (Yrs)" shows up afterwards as "marraige_status_yrs").
# The cleaned frame is stored under a separate name, `pcos_clean`.
pcos_clean = clean_columns(pcos)


In [9]:
# Names of the columns that contain at least one missing value.
pcos_clean.columns[pcos_clean.isna().any()].tolist()

['marraige_status_yrs', 'fast_food_y_n', 'unnamed_44']

In [10]:
# Per-column missing-value report: absolute count and share of all rows in percent.
pd.DataFrame(
    {
        "Null": pcos_clean.isna().sum(),
        "%Null": pcos_clean.isna().sum().div(pcos_clean.shape[0]).mul(100),
    }
)

Unnamed: 0,Null,%Null
sl_no,0,0.0
patient_file_no_x,0,0.0
pcos_y_n_x,0,0.0
age_yrs,0,0.0
weight_kg,0,0.0
height_cm,0,0.0
bmi,0,0.0
blood_group,0,0.0
pulse_rate_bpm,0,0.0
rr_breaths_min,0,0.0


In [11]:
# Remove columns where more than 90% of the values are missing.
# drop() returns a new frame that is rebound to `pcos_clean`, and errors="ignore"
# makes the cell safe to re-run: with inplace=True a second run raised KeyError
# because "unnamed_44" was already gone.
pcos_clean = pcos_clean.drop(columns="unnamed_44", errors="ignore")

Unnamed: 0,Null,%Null
sl_no,0,0.0
patient_file_no_x,0,0.0
pcos_y_n_x,0,0.0
age_yrs,0,0.0
weight_kg,0,0.0
height_cm,0,0.0
bmi,0,0.0
blood_group,0,0.0
pulse_rate_bpm,0,0.0
rr_breaths_min,0,0.0


In [12]:
# pcos_clean["pcos_y_n_x"].equals(pcos_clean["pcos_y_n_y"])

def equal_cols(col_1, col_2, df=None):
    """Report whether two columns of a DataFrame hold identical data.

    Parameters
    ----------
    col_1, col_2 : str
        Names of the columns to compare.
    df : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level
        ``pcos_clean`` so existing two-argument calls keep working.

    Returns
    -------
    str
        "<col_1> and <col_2> are equal" or "<col_1> and <col_2> are not equal".

    Notes
    -----
    The comparison uses ``Series.equals``, which treats NaNs in matching
    positions as equal and also requires the two columns to share a dtype.
    """
    # Only fall back to the global frame when the caller did not supply one.
    frame = pcos_clean if df is None else df
    verdict = "equal" if frame[col_1].equals(frame[col_2]) else "not equal"
    return f"{col_1} and {col_2} are {verdict}"

# Compare each column that came in from both tables (_x from infert, _y from fert).
merged_pairs = [
    ("pcos_y_n_x", "pcos_y_n_y"),
    ("i_beta_hcg_m_iu_m_l_x", "i_beta_hcg_m_iu_m_l_y"),
    ("ii_beta_hcg_m_iu_m_l_x", "ii_beta_hcg_m_iu_m_l_y"),
    ("amh_ng_m_l_x", "amh_ng_m_l_y"),
]
for left_col, right_col in merged_pairs:
    print(equal_cols(left_col, right_col))


pcos_y_n_x and pcos_y_n_y are equal
i_beta_hcg_m_iu_m_l_x and i_beta_hcg_m_iu_m_l_y are equal
ii_beta_hcg_m_iu_m_l_x and ii_beta_hcg_m_iu_m_l_y are not equal
amh_ng_m_l_x and amh_ng_m_l_y are not equal


In [14]:
# Removing columns that are equal as well as those from the y-dataset
# (plus patient_file_no_x).
# The result is rebound instead of dropped in place, and errors="ignore" lets the
# cell be re-run after the columns are already gone instead of raising KeyError.
redundant_cols = [
    "pcos_y_n_y",
    "i_beta_hcg_m_iu_m_l_y",
    "patient_file_no_y",
    "ii_beta_hcg_m_iu_m_l_y",
    "amh_ng_m_l_y",
    "patient_file_no_x",
]
pcos_clean = pcos_clean.drop(columns=redundant_cols, errors="ignore")

In [16]:
#address missing values


<class 'pandas.core.frame.DataFrame'>
Int64Index: 541 entries, 0 to 540
Data columns (total 43 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   sl_no                   541 non-null    int64  
 1   pcos_y_n_x              541 non-null    int64  
 2   age_yrs                 541 non-null    int64  
 3   weight_kg               541 non-null    float64
 4   height_cm               541 non-null    float64
 5   bmi                     541 non-null    float64
 6   blood_group             541 non-null    int64  
 7   pulse_rate_bpm          541 non-null    int64  
 8   rr_breaths_min          541 non-null    int64  
 9   hb_g_dl                 541 non-null    float64
 10  cycle_r_i               541 non-null    int64  
 11  cycle_length_days       541 non-null    int64  
 12  marraige_status_yrs     540 non-null    float64
 13  pregnant_y_n            541 non-null    int64  
 14  no_of_aborptions        541 non-null    in

In [26]:
# Fill the remaining gaps with each column's most frequent value.
# The result is assigned back to the frame: calling fillna(inplace=True) on the
# `pcos_clean[col]` selection is chained assignment, which warns on recent pandas
# and leaves `pcos_clean` unchanged once Copy-on-Write is enabled.
for col in ["marraige_status_yrs","fast_food_y_n"]:
    pcos_clean[col] = pcos_clean[col].fillna(pcos_clean[col].mode()[0])

In [27]:
# Summary statistics, transposed so each column of `pcos_clean` becomes one row.
# NOTE(review): the output shows pulse_rate_bpm with a minimum of 13.0 — this
# looks like a data-entry outlier; confirm before modelling.
pcos_clean.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sl_no,541.0,271.0,156.317519,1.0,136.0,271.0,406.0,541.0
pcos_y_n_x,541.0,0.327172,0.469615,0.0,0.0,0.0,1.0,1.0
age_yrs,541.0,31.430684,5.411006,20.0,28.0,31.0,35.0,48.0
weight_kg,541.0,59.637153,11.028287,31.0,52.0,59.0,65.0,108.0
height_cm,541.0,156.484835,6.033545,137.0,152.0,156.0,160.0,180.0
bmi,541.0,24.311285,4.056399,12.417882,21.641274,24.238227,26.634958,38.9
blood_group,541.0,13.802218,1.840812,11.0,13.0,14.0,15.0,18.0
pulse_rate_bpm,541.0,73.247689,4.430285,13.0,72.0,72.0,74.0,82.0
rr_breaths_min,541.0,19.243993,1.688629,16.0,18.0,18.0,20.0,28.0
hb_g_dl,541.0,11.160037,0.866904,8.5,10.5,11.0,11.7,14.8


In [29]:
# Column labels that remain after the clean-up steps above.
pcos_clean.columns

Index(['sl_no', 'pcos_y_n_x', 'age_yrs', 'weight_kg', 'height_cm', 'bmi',
       'blood_group', 'pulse_rate_bpm', 'rr_breaths_min', 'hb_g_dl',
       'cycle_r_i', 'cycle_length_days', 'marraige_status_yrs', 'pregnant_y_n',
       'no_of_aborptions', 'i_beta_hcg_m_iu_m_l_x', 'ii_beta_hcg_m_iu_m_l_x',
       'fsh_m_iu_m_l', 'lh_m_iu_m_l', 'fsh_lh', 'hip_inch', 'waist_inch',
       'waist_hip_ratio', 'tsh_m_iu_l', 'amh_ng_m_l_x', 'prl_ng_m_l',
       'vit_d_3_ng_m_l', 'prg_ng_m_l', 'rbs_mg_dl', 'weight_gain_y_n',
       'hair_growth_y_n', 'skin_darkening_y_n', 'hair_loss_y_n', 'pimples_y_n',
       'fast_food_y_n', 'reg_exercise_y_n', 'bp_systolic_mm_hg',
       'bp_diastolic_mm_hg', 'follicle_no_l', 'follicle_no_r',
       'avg_f_size_l_mm', 'avg_f_size_r_mm', 'endometrium_mm'],
      dtype='object')

### EDA

Questions to Answer

What are the factors associated with PCOS?

* Lifestyle - Exercise, Fast Foods and BMI.
* Secondary - Hip-to-waist ratio, regular cycles and cycle length, hirsutism and acne, weight gain, blood pressure.
* Primary - Number of follicles and follicle size, FSH and LH, hCG levels (Human Chorionic Gonadotropin), TSH (Thyroid-Stimulating Hormone), AMH (Anti-Müllerian Hormone), high prolactin, Random Blood Sugar (RBS).

* Hypo- or hyperthyroidism can affect FSH, LH and prolactin levels, and can also affect insulin sensitivity.