# 1. Libraries used

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy.stats import chi2_contingency

# 2. Data Acquisition
---



## 2.1 Download the data directly



DATA SOURCE : https://www.kaggle.com/datasets/prasoonkottarathil/polycystic-ovary-syndrome-pcos/download?datasetVersionNumber=3

## 2.2 Code for converting the above downloaded data into a dataframe

In [2]:
import os;
os.listdir('/kaggle/input/polycystic-ovary-syndrome-pcos/')

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/kaggle/input/polycystic-ovary-syndrome-pcos/'

In [None]:
pcos = pd.read_excel('/kaggle/input/polycystic-ovary-syndrome-pcos/PCOS_data_without_infertility.xlsx', sheet_name="Full_new") 

## 2.3 Confirm the data has been downloaded correctly by displaying the first 5 and last 5 records.

In [None]:
pcos.head() #first 5 records in the data set

In [None]:
pcos.tail() #last 5 records in the data set

## 2.4 Display the column headings, statistical information, description and statistical summary of the data.

In [None]:
pcos.columns #column heading

In [None]:
pcos.describe(include='all').transpose() #statistical summary of the data

In [None]:
pcos.info() #data type of the attributes

In [None]:
pcos.size #size of the data set

In [None]:
pcos.shape #shape of the dataset (Rows , columns)

In [None]:
pcos.info #shape of the dataset (Row & columns)

In [None]:
pcos.isnull #null data

Description & Statistical summary from the data 

## 2.5 Observations from the above. 


The data set is 24345 in size, with 541 rows and 45 columns. The majority of the attributes in our data set are integers and floats. We can also see some object data types, but based on the data set, we can disregard the entire attribute because it contains no useful information for the analysis.

# 3. Data Preparation

## 3.1 Check for 

* duplicate data
* missing data
* data inconsistencies


In [None]:
pcos.duplicated() #Duplicate Data

In [None]:
pcos.duplicated().sum() #No Duplicate data found

In [None]:
pcos.isna().sum() #missing data

In [None]:
(pcos.isna().sum() > 0).values

In [None]:
# drop the unnecessary columns
pcos =pcos.drop(['Sl. No', 'Patient File No.','Weight (Kg)',
       'Height(Cm) ','RR (breaths/min)', 'Cycle(R/I)',
       'Marraige Status (Yrs)','  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)', 'FSH(mIU/mL)',
       'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)',
       'AMH(ng/mL)', 'PRL(ng/mL)','PRG(ng/mL)', 'RBS(mg/dl)', 'hair growth(Y/N)',
       'BP _Systolic (mmHg)', 'BP _Diastolic (mmHg)', 'Endometrium (mm)','Unnamed: 44','Vit D3 (ng/mL)'],axis=1)

In [None]:
pcos.columns #Updated dataframe (Pcos)

In [None]:
num_columns=pcos.select_dtypes(exclude='object').columns

In [None]:
sns.boxplot(data=pcos.filter(num_columns), orient= "h")
plt.show()
plt.close()

## 3.2 Apply techiniques
* to remove duplicate data
* to impute or remove missing data
* to remove data inconsistencies


In [None]:
pcos.duplicated().sum() #no duplicates.

In [None]:
mask = pcos.isna().any(axis=1)
print(mask)

In [None]:
indexes=mask.index[mask].tolist()
print(indexes)

In [None]:
pcos.drop([156, 458], inplace=True) #removed missing data 

In [None]:
pcos.shape

In [None]:
pcos.isna().sum() #checking for missing data.

In [None]:
num_columns=pcos.select_dtypes(exclude='object').columns

In [None]:
sns.boxplot(data=pcos.filter(num_columns), orient= "h")
plt.show()
plt.close()

In [None]:
pcos.describe().transpose()

In [None]:
iqr =pcos.describe(percentiles=[0.25, 0.75]).loc['75%'].subtract(pcos.describe(percentiles=[0.25, 0.75]).loc['25%'])
print(iqr)

## 3.3 Encode categorical data

## 3.4 Text data

1. Remove special characters
2. Change the case (up-casing and down-casing).
3. Tokenization — process of discretizing words within a document.
4. Filter Stop Words.

## 3.4 Report

Duplicate Data:
    There is no duplicate data in the data set so ignoring this part.
Missing Data:
    Indexes 156, 458 had missing data, they're missing completely at random (MCAR), and it very minimal and satisfied the conditon to remove the data from the data set. So we removed the indexes completely.
Data inconsistencies:
    The data set is consistent throughout, so we ignored this part.

## 3.5 Target variables.

* Separate the data from the target such that the dataset is in the form of (X,y) or (Features, Label)

* Discretize / Encode the target variable or perform one-hot encoding on the target or any other as and if required.

* Report the observations


our target variable is pcos, but it is already encoded so we'll move on to data visualization.

# 4. Data Exploration using various plots



## 4.1 Scatter plot of each quantitative attribute with the target.

In [None]:
sns.scatterplot(x='PCOS (Y/N)', y=' Age (yrs)', data=pcos)
plt.show()
plt.close()

We have two cases, one with pcos and one without pcos, a scatter plot cannot provide enough information for this analysis, so we are plotting a relationship matrix, which clearly shows the relationship between the attributes.

In [None]:
fig, ax = plt.subplots(4,2, figsize=(35,45))
plt.style.use("classic")

sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['Pregnant(Y/N)'], palette="viridis", kde=True, ax=ax[0,0])
ax[0,0].set_xlabel('PCOS (Y/N)',fontsize=20)
sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['No. of aborptions'], palette="viridis", kde=True, ax=ax[0,1])
ax[0,1].set_xlabel('PCOS (Y/N)',fontsize=20)
sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['Weight gain(Y/N)'], palette="viridis", kde=True, ax=ax[1,0])
ax[1,0].set_xlabel('PCOS (Y/N)',fontsize=20)
sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['Skin darkening (Y/N)'], palette="viridis", kde=True, ax=ax[1,1])
ax[1,1].set_xlabel('PCOS (Y/N)',fontsize=20)
sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['Hair loss(Y/N)'], palette="viridis", kde=True, ax=ax[2,0])
ax[2,0].set_xlabel('PCOS (Y/N)',fontsize=20)
sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['Pimples(Y/N)'], palette="viridis", kde=True, ax=ax[2,1])
ax[2,1].set_xlabel('PCOS (Y/N)',fontsize=20)
sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['Fast food (Y/N)'], palette="viridis", kde=True, ax=ax[3,0])
ax[3,0].set_xlabel('PCOS (Y/N)',fontsize=20)
sns.histplot(x = pcos['PCOS (Y/N)'], hue = pcos['Reg.Exercise(Y/N)'], palette="viridis", kde=True, ax=ax[3,1])
ax[3,1].set_xlabel('PCOS (Y/N)',fontsize=20)

In [None]:
pcos.columns

In [None]:
for i in [' Age (yrs)', 'BMI', 'Blood Group', 'Pulse rate(bpm) ',
       'Hb(g/dl)', 'Cycle length(days)', 'Pregnant(Y/N)', 'No. of aborptions',
       'Waist:Hip Ratio', 'TSH (mIU/L)', 'Weight gain(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'Follicle No. (L)',
       'Follicle No. (R)', 'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)']:
  sns.set(rc = {'figure.figsize':(15,15)})
  pcos[pcos['PCOS (Y/N)'] == 1][i].value_counts().plot.bar()
  plt.title(i)
  plt.show()

## 4.2 EDA using visuals


In [None]:
#correlation plot.

corrmat = pcos.corr()
plt.subplots(figsize=(20,20))
sns.heatmap(corrmat,cmap="Pastel1", square=True);

In [None]:
plt.figure(figsize=(14,14))
k = 12 #number of variables with positive for heatmap
l = 3 #number of variables with negative for heatmap
cols_p = corrmat.nlargest(k, "PCOS (Y/N)")["PCOS (Y/N)"].index 
cols_n = corrmat.nsmallest(l, "PCOS (Y/N)")["PCOS (Y/N)"].index
cols = cols_p.append(cols_n) 

cm = np.corrcoef(pcos[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True,cmap="Pastel1", annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
corrmat["PCOS (Y/N)"].sort_values(ascending=False)

In [None]:
sns.pairplot(pcos)
plt.show()
plt.close()

In [None]:
pcos.columns

# 5. Data Wrangling



## 5.1 Univariate Filters 

In [None]:
pcos.columns

In [None]:
#Chi-Squared
def perform_chi2_test(data, col):
    # Create a contingency table for the column and the target variable
    contingency_table = pd.crosstab(data[col], data['PCOS (Y/N)'])
    
    # Perform the chi-square test
    chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
    
    # Print the results
    print(f"Column: {col}")
    print(f"Chi-square statistic: {chi2_stat}")
    print(f"P-value: {p_val}")
    print(f"Degrees of freedom: {dof}")
    print(f"Expected frequencies: {expected}")
    print("\n")


In [None]:
#Chit-Squared
for col in pcos.columns:
    perform_chi2_test(pcos, col)

In [None]:
#Mutual Information(Information Gain)
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
X = pcos.drop(['PCOS (Y/N)'], axis=1)
y = pcos['PCOS (Y/N)']
mi = mutual_info_classif(X, y)
mi_pcos = pd.DataFrame({'feature': X.columns, 'mutual_information': mi})
mi_pcos = mi_pcos.sort_values('mutual_information', ascending=False)
print(mi_pcos)

## 5.2 Report observations


In Chi-Squared "Age" and "Waist:hip Ratio" have p-value less than assumed 0.05 which means null hypothesis is satisfied for these two columns alone.
In Mutual Inforamation Follicle No. (R) have the highest value and more information in feature for predicting the PCOS. Addition to this other factors in descending contributes tot he relationship.

# 6. Implement Machine Learning Techniques

## 6.1 ML technique 1 + Justification

In [None]:
#Classification -- Decision Tree classifier
# create model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.inspection import permutation_importance
x = pcos.drop('PCOS (Y/N)', axis=1)
y = pcos['PCOS (Y/N)']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

In [None]:
dtc = DecisionTreeClassifier() 
dtc.fit(x_train, y_train) 
dtc_predicted_train = dtc.predict(x_train) 
dtc_predicted_test = dtc.predict(x_test) 
dtc_train_acc = accuracy_score(y_train, dtc_predicted_train) 
dtc_test_acc = accuracy_score(y_test, dtc_predicted_test) 
dtc_classification_report = classification_report(y_test, dtc_predicted_test)
dtc_confusion_matrix = confusion_matrix(y_test, dtc_predicted_test)

# print accuracy score, classification report, and confusion matrix
print(f"Training phase accuracy: {dtc_train_acc}")
print(f"Test phase accuracy: {dtc_test_acc}")
print(f"Classification Report : \n {dtc_classification_report}")
print(f"Confusion Matrix : \n{dtc_confusion_matrix}\n")

# consfusion matrix cross table
pd.crosstab(y_test, dtc_predicted_test, rownames=['True'], colnames=['Predicted'], margins=True)

## 6.2 ML technique 2 + Justification

In [None]:

#K-NN
knnc = KNeighborsClassifier() 
knnc.fit(x_train, y_train) 
knnc_predicted_train = knnc.predict(x_train) 
knnc_predicted_test = knnc.predict(x_test) 
knnc_train_acc = accuracy_score(y_train, knnc_predicted_train) 
knnc_test_acc = accuracy_score(y_test, knnc_predicted_test) 
knnc_classification_report = classification_report(y_test, knnc_predicted_test)
knnc_confusion_matrix = confusion_matrix(y_test, knnc_predicted_test)

# print accuracy score, classification report, and confusion matrix
print(f"Training phase accuracy: {knnc_train_acc}")
print(f"Test phase accuracy: {knnc_test_acc}")
print(f"Classification Report : \n {knnc_classification_report}")
print(f"Confusion Matrix : \n{knnc_confusion_matrix}\n")

# consfusion matrix cross table
pd.crosstab(y_test, knnc_predicted_test, rownames=['True'], colnames=['Predicted'], margins=True)

## 7. Conclusion 


#ML1 Decision tree result explained
The decision tree model has been trained and tested on a dataset. The results obtained are as follows:

Training phase accuracy: 1.0 - This means that the decision tree algorithm was able to correctly classify all the instances in the training dataset. However, a perfect accuracy on the training dataset does not necessarily guarantee a good performance on unseen data.

Test phase accuracy: 0.7962962962962963 - This means that the decision tree model was able to correctly classify 79.63% of the instances in the test dataset. The test phase accuracy is a better measure of the performance of the model on unseen data.

Classification Report: The classification report shows various metrics such as precision, recall, and F1 score for each class (0 and 1) in the dataset. The precision for class 0 is 0.88, which means that when the model predicts a sample as class 0, it is correct 88% of the time. The recall for class 0 is 0.84, which means that the model is able to correctly identify 84% of the instances of class 0 in the dataset. The F1-score for class 0 is 0.86, which is the harmonic mean of precision and recall. Similarly, the precision, recall, and F1-score for class 1 are 0.59, 0.68, and 0.63, respectively.

Confusion Matrix: The confusion matrix shows the number of true positives, true negatives, false positives, and false negatives predicted by the model. In this case, the confusion matrix shows that out of the 80 instances of class 0, the model correctly predicted 67 as class 0 and 13 as class 1. Out of the 28 instances of class 1, the model correctly predicted 19 as class 1 and 9 as class 0.

In summary, the decision tree model achieved a reasonable performance on the test dataset with an accuracy of 79.63%. However, the precision, recall, and F1-score for class 1 are relatively low compared to class 0, indicating that the model is not as good at identifying instances of class 1 as it is for class 0. The confusion matrix provides more detailed information about the performance of the model on each class.

#ML2 KNN results explained.

The KNN model has been trained and tested on a dataset. The results obtained are as follows:

Training phase accuracy: 0.8955916473317865 - This means that the KNN algorithm was able to correctly classify 89.56% of the instances in the training dataset.

Test phase accuracy: 0.8240740740740741 - This means that the KNN model was able to correctly classify 82.41% of the instances in the test dataset.

Classification Report: The classification report shows various metrics such as precision, recall, and F1 score for each class (0 and 1) in the dataset. The precision for class 0 is 0.86, which means that when the model predicts a sample as class 0, it is correct 86% of the time. The recall for class 0 is 0.91, which means that the model is able to correctly identify 91% of the instances of class 0 in the dataset. The F1-score for class 0 is 0.88, which is the harmonic mean of precision and recall. Similarly, the precision, recall, and F1-score for class 1 are 0.70, 0.57, and 0.63, respectively.

Confusion Matrix: The confusion matrix shows the number of true positives, true negatives, false positives, and false negatives predicted by the model. In this case, the confusion matrix shows that out of the 80 instances of class 0, the model correctly predicted 73 as class 0 and 7 as class 1. Out of the 28 instances of class 1, the model correctly predicted 16 as class 1 and 12 as class 0.

In summary, the KNN model achieved a reasonable performance on the test dataset with an accuracy of 82.41%. The precision, recall, and F1-score for class 1 are relatively lower compared to class 0, indicating that the model is not as good at identifying instances of class 1 as it is for class 0. The confusion matrix provides more detailed information about the performance of the model on each class.

## 8. Solution
