In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Read the 'Full_new' sheet of the workbook into a DataFrame.
full_new_df = pd.read_excel('dataset.xlsx', sheet_name='Full_new')

# Columns stored with object dtype are re-parsed as numbers; any cell that
# cannot be parsed becomes NaN (errors='coerce').
is_object_column = full_new_df.dtypes.eq('object')
non_numeric_columns = full_new_df.columns[is_object_column]
coerced_columns = {
    col: pd.to_numeric(full_new_df[col], errors='coerce')
    for col in non_numeric_columns
}
for col, numeric_values in coerced_columns.items():
    full_new_df[col] = numeric_values

In [3]:
# Columns that identify a patient or hold the label; none of them is a feature.
id_columns = ['Sl. No', 'Patient File No.']
target_column = 'PCOS (Y/N)'

# Rows without a label cannot be used for supervised training. If the label
# column were mean-imputed together with the features, the int cast below
# would truncate the fractional mean (to 0 for 0/1 labels) and silently
# mislabel those rows, so they are dropped instead.
labelled_df = full_new_df.dropna(subset=[target_column]).reset_index(drop=True)
feature_columns = [
    col for col in labelled_df.columns
    if col not in id_columns + [target_column]
]

# Replace missing feature values with the column mean (features only).
# NOTE(review): the imputer is fitted on every row before the train/test
# split, so test-set statistics leak into training — consider fitting on the
# training rows only (e.g. inside a Pipeline).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = pd.DataFrame(
    imputer.fit_transform(labelled_df[feature_columns]),
    columns=feature_columns,
    index=labelled_df.index,
)
y = labelled_df[target_column].astype(int)

# Combined frame in the original column order, kept for any cell that reads it.
full_new_df_imputed = pd.concat(
    [labelled_df[id_columns], X, y], axis=1
)[list(full_new_df.columns)]

In [4]:
# Score every feature against the label with the ANOVA F-test (f_classif).
# NOTE(review): scoring uses all rows, including those that later end up in
# the test split — confirm this is acceptable for the reported metrics.
bestfeatures = SelectKBest(score_func=f_classif, k=10)
fit = bestfeatures.fit(X, y)

# One row per feature: its name and its F-score.
featureScores = pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})

# Keep the ten highest-scoring features, ordered from best to worst score.
top_rows = featureScores.nlargest(10, 'Score')
best_features = top_rows['Feature'].values
X_selected = X[best_features]

  f = msb / msw


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression allowed up to 1000 solver iterations.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [6]:
# Persist the fitted logistic-regression model so it can be reloaded later.
model_filename = 'model.joblib'
joblib.dump(logreg, filename=model_filename)
print(f"Trained model saved as {model_filename}")

Trained model saved as model.joblib


In [7]:
# Evaluate the logistic-regression model on the held-out test rows.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Human-readable names for class 0 and class 1 in the per-class report.
class_labels = ['Non-PCOS', 'PCOS']
report = classification_report(y_test, y_pred, target_names=class_labels)

In [8]:
# Show the overall accuracy followed by the per-class precision/recall table.
print(
    f'Accuracy of the Logistic Regression model: {accuracy:.2f}',
    'Classification Report:',
    report,
    sep='\n',
)

Accuracy of the Logistic Regression model: 0.89
Classification Report:
              precision    recall  f1-score   support

    Non-PCOS       0.91      0.94      0.92        77
        PCOS       0.83      0.78      0.81        32

    accuracy                           0.89       109
   macro avg       0.87      0.86      0.86       109
weighted avg       0.89      0.89      0.89       109



In [9]:
# Show the first five test rows side by side with the model's prediction.
X_head = X_test.head()
y_head_pred = logreg.predict(X_head)
input_output_df = X_head.assign(**{'Predicted PCOS (Y/N)': y_head_pred})
print(input_output_df)

     Follicle No. (R)  Follicle No. (L)  Skin darkening (Y/N)  \
229               5.0               6.0                   1.0   
73                4.0               3.0                   0.0   
352               5.0               4.0                   1.0   
86               10.0               8.0                   0.0   
470              10.0               8.0                   1.0   

     hair growth(Y/N)  Weight gain(Y/N)  Cycle(R/I)  Fast food (Y/N)  \
229               0.0               1.0         2.0              1.0   
73                0.0               1.0         2.0              0.0   
352               1.0               0.0         2.0              1.0   
86                0.0               0.0         4.0              0.0   
470               1.0               0.0         4.0              1.0   

     Pimples(Y/N)  AMH(ng/mL)  Weight (Kg)  Predicted PCOS (Y/N)  
229           0.0       12.00         74.3                     0  
73            0.0        0.35         70.0

In [10]:
# Feature names, in the order the prediction helper passes them to the model.
# NOTE(review): this list is hard-coded; it must stay in the same order as the
# columns the model was trained on.
selected_features = [
    'Follicle No. (R)',
    'Follicle No. (L)',
    'Skin darkening (Y/N)',
    'hair growth(Y/N)',
    'Weight gain(Y/N)',
    'Cycle(R/I)',
    'Fast food (Y/N)',
    'Pimples(Y/N)',
    'AMH(ng/mL)',
    'Weight (Kg)',
]

In [11]:
# Reload the model from the file named by `model_filename`, so the prediction
# helper below works from the saved artifact rather than the in-memory `logreg`.
# joblib files are pickle-based: only load files from a trusted source.
loaded_model = joblib.load(model_filename)

In [12]:
def predict_pcos(input_values):
    """Classify one patient record with the reloaded model.

    Args:
        input_values: Feature values for a single record, given in the same
            order as the module-level ``selected_features`` list.

    Returns:
        ``'PCOS'`` when the model predicts class 1, otherwise ``'Non-PCOS'``.
    """
    # Wrap the record in a one-row frame so the model sees named columns.
    single_row = pd.DataFrame([input_values], columns=selected_features)
    predicted_class = loaded_model.predict(single_row)[0]
    if predicted_class == 1:
        return 'PCOS'
    return 'Non-PCOS'

In [13]:
# Same values as row 229 in the X_test.head() printout above, for which the
# model predicted class 0.
input_example = [
    5,     # Follicle No. (R)
    6,     # Follicle No. (L)
    1,     # Skin darkening (Y/N)
    0,     # hair growth(Y/N)
    1,     # Weight gain(Y/N)
    2,     # Cycle(R/I)
    1,     # Fast food (Y/N)
    0,     # Pimples(Y/N)
    12,    # AMH(ng/mL)
    74.3,  # Weight (Kg)
]
prediction_example = predict_pcos(input_example)
print('The prediction for the input is:', prediction_example)

The prediction for the input is: Non-PCOS


In [14]:
# A second hand-written record, this one with higher follicle counts.
input_example = [
    15,    # Follicle No. (R)
    13,    # Follicle No. (L)
    0,     # Skin darkening (Y/N)
    0,     # hair growth(Y/N)
    0,     # Weight gain(Y/N)
    2,     # Cycle(R/I)
    1,     # Fast food (Y/N)
    1,     # Pimples(Y/N)
    6.63,  # AMH(ng/mL)
    68.8,  # Weight (Kg)
]
prediction_example = predict_pcos(input_example)
print('The prediction for the input is:', prediction_example)

The prediction for the input is: PCOS


In [15]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the same split as the
# logistic regression. probability=True is requested although this cell only
# reports accuracy.
svcmodel = SVC(kernel='rbf', probability=True, random_state=0)
svcmodel.fit(X_train, y_train)

# Score the fitted classifier on the held-out test rows.
svcacc = svcmodel.score(X_test, y_test)
print(svcacc)

0.8073394495412844


In [16]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the same train/test split. The model is fitted
# exactly once; a second identical fit() call would only repeat the work.
model_naive = MultinomialNB().fit(X_train, y_train)

# Score the fitted classifier on the held-out test rows.
nbacc = model_naive.score(X_test, y_test)
print(nbacc)
 

0.7981651376146789


In [20]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic; without a fixed seed the reported accuracy
# changes from run to run. Use the same seed as the train/test split.
clf = RandomForestClassifier(random_state=42)

# Train on the training rows, then predict and score on the held-out rows.
clf.fit(X_train, y_train)
B_pred = clf.predict(X_test)
racc = clf.score(X_test, y_test)
print("Accuracy:", racc)

Accuracy: 0.8623853211009175


In [21]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier trained on the same split as the other models.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Results go into KNN-specific names so that `y_pred` and `accuracy` from the
# logistic-regression evaluation are not overwritten; re-running that report
# cell would otherwise print KNN numbers under the logistic-regression label.
knn_pred = knn.predict(X_test)
knnacc = accuracy_score(y_test, knn_pred)
print("Accuracy:", knnacc)

Accuracy: 0.8165137614678899


In [22]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, fitted on the training rows.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7889908256880734
