# Polycystic Ovary Syndrome (PCOS)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import pingouin as pg
#from scipy import stats

# Introduction

Polycystic ovary syndrome, or PCOS, is one of the most common causes of female infertility, affecting as many as 5 million American women who are of childbearing age. Women who have PCOS produce more male hormones than what is considered to be normal, which may impact their overall health, even past their childbearing years. Symptoms can be different for every woman, which makes it very difficult to diagnose. This analysis explores various PCOS symptoms or variables that increase the likelihood of a PCOS diagnosis or infertility. In this analysis, I will explore three specific questions:

Are there any features that are correlated with PCOS?

What are the most frequent symptoms PCOS patients exhibit?

Do non-PCOS patients exhibit similar symptoms to those diagnosed with PCOS?

# About the Data

The original data set and notebook can be found on Kaggle. The data set contains one Comma Separated Values (CSV) file and one Excel file.

PCOS_Data_without_infertility: Contains 45 columns (representing different parameters) and 541 rows (representing different patients identified by a Patient File Number). Excel file type.

PCOS_infertility: Contains 6 columns (representing different parameters) and 541 rows (representing different patients identified by a Patient File Number). CSV file type.

# Data Inspection

Before exploring the questions of interest in the data, we will inspect it to get a sense of its general structure. In the data inspection, we will complete the following tasks:

Load the data.

Describe the data (shape, structure and descriptive statistics).

Inspect the data including missing values or NaN values.

Make initial observations about the data for subsequent steps such as data cleaning and pre-processing.

Before we begin the inspection, we first import the necessary libraries for data analysis:

numpy as np: Used for linear algebra or matrix math.

pandas as pd: Used for data analysis in a tabular structure.

matplotlib.pyplot as plt: Used for plotting data.

seaborn as sns: Library built on top of Matplotlib, used for statistical data visualization.

stats from scipy: Used as the base package for pingouin

pingouin as pg: Used for inferential statistics and statistical analysis.

In [None]:
# Load the two patient tables from CSV files located in the working directory.
# NOTE(review): the notebook text describes the "without infertility" table as an
# Excel file — confirm that PCOS_fertility.csv is the intended (converted) source.
infertility_path = "PCOS_infertility.csv"
fertility_path = "PCOS_fertility.csv"
pcos_with_inf = pd.read_csv(infertility_path)
pcos_without_inf = pd.read_csv(fertility_path)

In [None]:
# Display the table loaded from PCOS_infertility.csv as the cell output.
pcos_with_inf

In [None]:
# Display the table loaded from PCOS_fertility.csv as the cell output.
pcos_without_inf

In [None]:
# (number of rows, number of columns) of the infertility table.
pcos_with_inf.shape

In [None]:
# (number of rows, number of columns) of the second table, before any cleaning.
pcos_without_inf.shape

In [None]:
# Print the column names, non-null counts and dtypes to spot missing or mistyped data.
pcos_without_inf.info()

**Missing Values**

In [None]:
# Per-column flag: True where the column contains at least one missing (NaN) value.
pcos_without_inf.isna().any()

**Dropping all the NaN values**

In [None]:
# Two-step clean-up: first discard rows that are entirely empty, then discard every
# column that still contains at least one NaN. The cleaned table is shown as output.
non_empty_rows = pcos_without_inf.dropna(axis=0, how='all')
pcos_without_inf = non_empty_rows.dropna(axis=1, how='any')
pcos_without_inf

In [None]:
# Re-run the per-column missing-value check: after the drop above, every column
# should report False (no NaN values remaining).
pcos_without_inf.isna().any()

**Removing leading and trailing whitespace from the column names**

In [None]:
# Trim leading/trailing whitespace from every column label, then show the cleaned labels.
pcos_without_inf.columns = list(map(str.strip, pcos_without_inf.columns))
pcos_without_inf.columns

In [None]:
# Pairwise correlation heatmap over the numeric columns only.
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older versions
# dropped them silently), so the numeric/boolean subset is selected explicitly first.
numeric_features = pcos_without_inf.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(20, 10))
heatmap = sns.heatmap(numeric_features.corr(), vmin=-1, vmax=1, annot=True)
# The trailing semicolon suppresses the returned Text object's repr in the cell output.
heatmap.set_title('Correlation Heatmap for PCOS', fontdict={'fontsize': 18}, pad=5);

**The correlation heatmap alone does not make clear which factors are associated with PCOS, so we will perform ANOVA testing.**

In [None]:
# Columns carried into the ANOVA step: the PCOS label column followed by the
# measurement columns used in the later cells. Names must match the stripped
# column labels exactly (including the internal spacing).
anova_columns = [
    'PCOS (Y/N)',
    'BMI',
    'RR (breaths/min)',
    'Hb(g/dl)',
    'Cycle length(days)',
    'II    beta-HCG(mIU/mL)',
    'FSH(mIU/mL)',
    'LH(mIU/mL)',
    'Waist(inch)',
    'Waist:Hip Ratio',
    'TSH (mIU/L)',
    'AMH(ng/mL)',
    'PRL(ng/mL)',
    'Vit D3 (ng/mL)',
    'PRG(ng/mL)',
    'RBS(mg/dl)',
    'BP _Systolic (mmHg)',
    'BP _Diastolic (mmHg)',
    'Follicle No. (R)',
    'Avg. F size (L) (mm)',
    'Avg. F size (R) (mm)',
    'Endometrium (mm)',
]
pcos_without_inf_anova = pcos_without_inf[anova_columns]

In [None]:
# Rename the columns to short, unit-free labels for better readability.
# Keys that are not present in the frame are ignored by DataFrame.rename, so the
# extra entries (e.g. 'Pulse rate(bpm)', 'Height (m)') are harmless.
# Fix: 'PRG(ng/mL)' previously mapped to 'PRL', which collided with the prolactin
# column ('PRL(ng/mL)' -> 'PRL') and produced two columns with the same name.
short_names = {
    'PCOS (Y/N)': 'PCOS',
    'Pulse rate(bpm)': 'Pulse rate',
    'RR (breaths/min)': 'RR',
    'Hb(g/dl)': 'HB',
    'Cycle length(days)': 'Cycle length',
    'I   beta-HCG(mIU/mL)': 'I   beta-HCG',
    'II    beta-HCG(mIU/mL)': 'II    beta-HCG',
    'FSH(mIU/mL)': 'FSH',
    'LH(mIU/mL)': 'LH',
    'TSH (mIU/L)': 'TSH',
    'AMH(ng/mL)': 'AMH',
    'PRL(ng/mL)': 'PRL',
    'Vit D3 (ng/mL)': 'Vit D3',
    'PRG(ng/mL)': 'PRG',
    'RBS(mg/dl)': 'RBS',
    'BP _Systolic (mmHg)': 'BP _Systolic',
    'BP _Diastolic (mmHg)': 'BP _Diastolic',
    'Follicle No. (R)': 'Follicle No.',
    'Avg. F size (L) (mm)': 'Avg. F size (L)',
    'Avg. F size (R) (mm)': 'Avg. F size (R)',
    'Endometrium (mm)': 'Endometrium',
    'Height (m)': 'Height',
}
pcos_without_inf_anova = pcos_without_inf_anova.rename(columns=short_names)

In [None]:
# (number of rows, number of columns) of the frame used for the ANOVA tests.
pcos_without_inf_anova.shape

Hypothesis Testing

In [None]:
# One-way ANOVA with 'PCOS' as the dependent variable and 'Age' as the grouping factor.
# NOTE(review): 'Age' is not among the columns selected into pcos_without_inf_anova —
# confirm the column exists (and its exact name) before running this cell.
# NOTE(review): `pg` requires `import pingouin as pg`, which is commented out in the import cell.
pg.anova(dv = 'PCOS', between = ['Age'], data = pcos_without_inf_anova)

In [None]:
# One-way ANOVA with 'PCOS' as the dependent variable and 'BMI' as the grouping factor.
# NOTE(review): `pg` requires `import pingouin as pg`, which is commented out in the import cell.
pg.anova(dv = 'PCOS', between = ['BMI'], data = pcos_without_inf_anova)

In [None]:
# One-way ANOVA with 'PCOS' as the dependent variable and respiratory rate as the factor.
# Fix: the frame has no 'Resp_Rate' column; 'RR (breaths/min)' was renamed to 'RR'.
# NOTE(review): `pg` requires `import pingouin as pg`, which is commented out in the import cell.
# NOTE(review): grouping by a measurement with a binary label as dv is unusual —
# confirm whether dv/between should be swapped (dv='RR', between='PCOS').
pg.anova(dv = 'PCOS', between = ['RR'], data = pcos_without_inf_anova)

In [None]:
# One-way ANOVA with 'PCOS' as the dependent variable and hemoglobin as the factor.
# Fix: the frame has no 'Hemoglobin' column; 'Hb(g/dl)' was renamed to 'HB'.
# NOTE(review): `pg` requires `import pingouin as pg`, which is commented out in the import cell.
pg.anova(dv = 'PCOS', between = ['HB'], data = pcos_without_inf_anova)

In [None]:
# One-way ANOVA with 'PCOS' as the dependent variable and waist-to-hip ratio as the factor.
# Fix: the frame has no 'Waist_Hip_Ratio' column; the column is named 'Waist:Hip Ratio'
# (it is selected under that name and not touched by the rename step).
# NOTE(review): `pg` requires `import pingouin as pg`, which is commented out in the import cell.
pg.anova(dv = 'PCOS', between = ['Waist:Hip Ratio'], data = pcos_without_inf_anova)

In [None]:
# One-way ANOVA with 'PCOS' as the dependent variable and vitamin D3 as the factor.
# Fix: the frame has no 'Vit_D' column; 'Vit D3 (ng/mL)' was renamed to 'Vit D3'.
# NOTE(review): `pg` requires `import pingouin as pg`, which is commented out in the import cell.
pg.anova(dv = 'PCOS', between = ['Vit D3'], data = pcos_without_inf_anova)

In [None]:
# One-way ANOVA with 'PCOS' as the dependent variable and 'Height' as the grouping factor.
# NOTE(review): the rename step maps 'Height (m)' -> 'Height', but 'Height (m)' is not among
# the columns selected into pcos_without_inf_anova — confirm the column exists before running.
# NOTE(review): `pg` requires `import pingouin as pg`, which is commented out in the import cell.
pg.anova(dv = 'PCOS', between = ['Height'], data = pcos_without_inf_anova)

In [None]:
# Distribution of the right-ovary follicle count, coloured by PCOS status, with a KDE overlay.
# Fix: `PCOS_woinf_ANOVA` and "Foll_No_R" are not defined anywhere in the notebook; the frame is
# `pcos_without_inf_anova` and 'Follicle No. (R)' was renamed to 'Follicle No.'.
sns.displot(data = pcos_without_inf_anova, x = "Follicle No.", hue = 'PCOS', kde = True).set(title = "PCOS vs. non-PCOS Right Ovary Follicle Count")

In [None]:
# Distribution of menstrual cycle length, coloured by PCOS status, with a KDE overlay.
# Fix: `PCOS_woinf_ANOVA` and "Cycle_Length" are not defined anywhere in the notebook; the frame
# is `pcos_without_inf_anova` and 'Cycle length(days)' was renamed to 'Cycle length'.
sns.displot(data = pcos_without_inf_anova, x = "Cycle length", hue = 'PCOS', kde = True).set(title = "PCOS vs. non-PCOS Menstrual Cycle Duration")

In [None]:
# Distribution of Anti-Mullerian Hormone, coloured by PCOS status, with a KDE overlay.
# Fix: `PCOS_woinf_ANOVA` and "Anti_Mull_Horm" are not defined anywhere in the notebook; the
# frame is `pcos_without_inf_anova` and 'AMH(ng/mL)' was renamed to 'AMH'.
# NOTE(review): assumes 'AMH' was parsed as a numeric column — verify its dtype.
sns.displot(data = pcos_without_inf_anova, x = "AMH", hue = 'PCOS', kde = True).set(title = "PCOS vs. non-PCOS Anti-Mullerian Hormone")

In [None]:
# Distribution of patient age, coloured by PCOS status, with a KDE overlay.
# Fix: `PCOS_woinf_ANOVA` is not defined anywhere in the notebook; the frame built for this
# analysis is `pcos_without_inf_anova`.
# NOTE(review): no 'Age' column is selected into pcos_without_inf_anova — add the age column
# to the selection (and rename it to 'Age') or adjust `x` before running this cell.
sns.displot(data = pcos_without_inf_anova, x = "Age", hue = 'PCOS', kde = True).set(title = "PCOS vs. non-PCOS Ages")

In [None]:
# Distribution of prolactin levels, coloured by PCOS status, with a KDE overlay.
# Fix: `PCOS_woinf_ANOVA` and "Prolactin" are not defined anywhere in the notebook; the frame
# is `pcos_without_inf_anova` and 'PRL(ng/mL)' was renamed to 'PRL'.
# NOTE(review): assumes 'PRL' is a unique column label in the frame — verify the rename step
# does not map a second column to the same name.
sns.displot(data = pcos_without_inf_anova, x = "PRL", hue = 'PCOS', kde = True).set(title = "PCOS vs. non-PCOS Prolactin Levels")