Step 1: Import necessary libraries

In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np


Step 2: Load the dataset

In [2]:
# Step 2: Read the menstrual-cycle CSV into a DataFrame.
# The location is a relative path; change it here if your folder layout differs.
dataset_path = '../ml_model_dataset/menstrual_cycle_dataset_big.csv'
df = pd.read_csv(dataset_path)

# Report (rows, columns), then end the cell on the preview so it is rendered.
print("Dataset shape:", df.shape)
df.head(5)


Dataset shape: (2000, 8)


Unnamed: 0,age,cycle_length,period_duration,flow_intensity,pms_symptoms_score,spotting_between,stress_level,is_abnormal
0,26,28,5,0,10,0,0,0
1,42,40,6,1,5,1,6,1
2,21,15,7,1,6,0,9,1
3,25,30,6,1,5,0,5,0
4,41,33,1,1,8,0,10,1


 Step 3: Check for missing values

In [3]:
# Step 3: Count the null entries in every column (all zeros means nothing to impute)
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)


Missing values in each column:
 age                   0
cycle_length          0
period_duration       0
flow_intensity        0
pms_symptoms_score    0
spotting_between      0
stress_level          0
is_abnormal           0
dtype: int64


Step 4: Separate features (X) and target (y)

In [4]:
# Step 4: Split the frame into model inputs (X) and the label (y)
y = df["is_abnormal"]              # Label column: 0 = normal, 1 = abnormal
X = df.drop(columns="is_abnormal")  # Every remaining column is a feature

# Preview the first rows of the features ...
print("Feature sample:", X.head(), sep="\n")

# ... and of the label
print("\nTarget sample:", y.head(), sep="\n")


Feature sample:
   age  cycle_length  period_duration  flow_intensity  pms_symptoms_score  \
0   26            28                5               0                  10   
1   42            40                6               1                   5   
2   21            15                7               1                   6   
3   25            30                6               1                   5   
4   41            33                1               1                   8   

   spotting_between  stress_level  
0                 0             0  
1                 1             6  
2                 0             9  
3                 0             5  
4                 0            10  

Target sample:
0    0
1    1
2    1
3    0
4    1
Name: is_abnormal, dtype: int64


Step 5: Split the dataset into training and testing sets

In [5]:
# Step 5: Hold out part of the data for evaluation
from sklearn.model_selection import train_test_split

# test_size=0.2 keeps 20% for testing and 80% for training;
# the fixed random_state makes the shuffle repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (1600, 7)
Testing set size: (400, 7)


Step 6: Scale the features (Standardization)

In [6]:
# Step 6: Standardize the features with StandardScaler
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Peek at the first five scaled training rows
print("Sample of scaled training data:", X_train_scaled[:5], sep="\n")


Sample of scaled training data:
[[-0.90344125  0.36181146  1.26053946 -0.00450766  1.58702004 -0.42521972
  -0.62111746]
 [ 1.28476907  0.50877727 -0.26854555  1.43794389  1.58702004 -0.42521972
  -0.30064408]
 [-1.54703252 -1.4017783   0.24114945 -0.00450766 -0.60629492 -0.42521972
   0.66077604]
 [ 1.67092384  1.53753797 -0.26854555 -1.44695921  1.58702004 -0.42521972
   1.30172279]
 [-1.28959601  0.94967471  1.26053946 -0.00450766 -1.23295634 -0.42521972
  -1.26206421]]


Step 7: Train a Machine Learning Model (Random Forest)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build the classifier and fit it on the scaled training split in one step
# (fit() returns the estimator itself); random_state pins the forest's randomness.
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Classify the held-out rows
y_pred = model.predict(X_test_scaled)

# Overall fraction of correct predictions
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {acc:.2f}")

# Per-class precision / recall / F1
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy on test set: 0.96

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       226
           1       0.97      0.93      0.95       174

    accuracy                           0.96       400
   macro avg       0.96      0.95      0.96       400
weighted avg       0.96      0.96      0.96       400



Save the trained model and scaler

In [8]:
import os

import joblib

# joblib.dump does not create missing parent folders, so make sure the output
# directory exists first (exist_ok=True makes re-running this cell harmless).
os.makedirs('../ml_model', exist_ok=True)

# Save model
joblib.dump(model, '../ml_model/random_forest_model.joblib')

# Save scaler (kept as the last expression so the written path is displayed)
joblib.dump(scaler, '../ml_model/scaler.joblib')


['../ml_model/scaler.joblib']

Load the saved model and scaler and test a prediction

In [9]:
import numpy as np

# Load model and scaler back from disk.
# NOTE: joblib files are pickle-based; only load artifacts you produced or trust.
loaded_model = joblib.load('../ml_model/random_forest_model.joblib')
loaded_scaler = joblib.load('../ml_model/scaler.joblib')

# Example new data (replace with your actual input).
# The scaler was fitted on a DataFrame, so pass a DataFrame with the same column
# names in the same order as the training features. This makes the feature order
# explicit (instead of relying on array position) and avoids scikit-learn's
# "X does not have valid feature names" warning.
new_data = pd.DataFrame([{
    "age": 30,
    "cycle_length": 27,
    "period_duration": 5,
    "flow_intensity": 1,
    "pms_symptoms_score": 4,
    "spotting_between": 0,
    "stress_level": 3,
}])

# Scale the input features with the statistics learned during training
new_data_scaled = loaded_scaler.transform(new_data)

# Predict
prediction = loaded_model.predict(new_data_scaled)

print("Prediction (0=Normal, 1=Abnormal):", prediction[0])


Prediction (0=Normal, 1=Abnormal): 0




Try different inputs

In [10]:
import numpy as np

# Function to test input and print prediction
def test_cycle_input(age, cycle_length, period_duration, flow_intensity, pms_symptoms_score, spotting_between, stress_level):
    """Scale one observation, classify it with the loaded model, and print the result.

    The values are wrapped in a DataFrame whose columns carry the training
    feature names in training order. The scaler was fitted on a DataFrame, so
    this lets scikit-learn check the feature names instead of trusting array
    position, and avoids its "X does not have valid feature names" warning.

    Args:
        age: Age in years.
        cycle_length: Cycle length in days.
        period_duration: Period duration in days.
        flow_intensity: Flow intensity code.
        pms_symptoms_score: PMS symptoms score.
        spotting_between: 1 if spotting occurs between periods, else 0.
        stress_level: Stress level score.

    Returns:
        None. The input row and the predicted label are printed.
    """
    input_data = pd.DataFrame(
        [[age, cycle_length, period_duration, flow_intensity, pms_symptoms_score, spotting_between, stress_level]],
        columns=[
            "age",
            "cycle_length",
            "period_duration",
            "flow_intensity",
            "pms_symptoms_score",
            "spotting_between",
            "stress_level",
        ],
    )
    input_scaled = loaded_scaler.transform(input_data)
    pred = loaded_model.predict(input_scaled)[0]
    label = "Abnormal" if pred == 1 else "Normal"
    # Print the raw values as a flat array, same layout as before
    print(f"Input: {input_data.to_numpy().flatten()}")
    print(f"Prediction: {label}\n")

# Examples to try, in the argument order expected by test_cycle_input
example_inputs = [
    (30, 27, 5, 1, 4, 0, 3),  # Expected: Normal
    (22, 40, 6, 2, 8, 1, 9),  # Expected: Abnormal (long cycle, heavy flow, spotting, high stress)
    (28, 19, 4, 0, 2, 0, 2),  # Expected: Abnormal (short cycle)
    (35, 30, 9, 1, 6, 0, 4),  # Expected: Abnormal (long period duration)
]
for example in example_inputs:
    test_cycle_input(*example)


Input: [30 27  5  1  4  0  3]
Prediction: Normal

Input: [22 40  6  2  8  1  9]
Prediction: Abnormal

Input: [28 19  4  0  2  0  2]
Prediction: Abnormal

Input: [35 30  9  1  6  0  4]
Prediction: Abnormal





Interactive input + prediction code

In [None]:
# Make sure you have loaded your model and scaler as before

def _prompt_int(prompt, minimum=None, maximum=None):
    """Ask with `prompt` until the reply is an integer within [minimum, maximum]."""
    while True:
        raw = input(prompt)
        try:
            value = int(raw)
        except ValueError:
            print(f"  '{raw}' is not a whole number, please try again.")
            continue
        if minimum is not None and value < minimum:
            print(f"  Value must be at least {minimum}, please try again.")
            continue
        if maximum is not None and value > maximum:
            print(f"  Value must be at most {maximum}, please try again.")
            continue
        return value


def get_user_input_and_predict():
    """Interactively collect one observation, classify it, and print the verdict.

    Each answer is re-requested until it is a valid integer; coded fields are
    additionally limited to the range stated in their prompt, and counts of
    years/days must be non-negative. The values are passed to the scaler as a
    DataFrame carrying the training feature names in training order, so
    scikit-learn can check the names instead of relying on array position.

    Returns:
        None. The predicted label is printed.
    """
    print("Enter menstrual cycle data:")
    age = _prompt_int("Age (years): ", minimum=0)
    cycle_length = _prompt_int("Cycle length (days): ", minimum=0)
    period_duration = _prompt_int("Period duration (days): ", minimum=0)
    flow_intensity = _prompt_int("Flow intensity (0=Light,1=Medium,2=Heavy): ", 0, 2)
    pms_symptoms_score = _prompt_int("PMS symptoms score (0-10): ", 0, 10)
    spotting_between = _prompt_int("Spotting between periods? (0=No, 1=Yes): ", 0, 1)
    stress_level = _prompt_int("Stress level (0-10): ", 0, 10)

    # Build a named-column frame (same names/order as the training features) and scale
    input_data = pd.DataFrame(
        [[age, cycle_length, period_duration, flow_intensity, pms_symptoms_score, spotting_between, stress_level]],
        columns=[
            "age",
            "cycle_length",
            "period_duration",
            "flow_intensity",
            "pms_symptoms_score",
            "spotting_between",
            "stress_level",
        ],
    )
    input_scaled = loaded_scaler.transform(input_data)

    # Predict
    pred = loaded_model.predict(input_scaled)[0]
    label = "Abnormal" if pred == 1 else "Normal"

    print(f"\nPrediction: Your menstrual cycle is likely: {label}")

# Call the function: starts the interactive prompts; the cell waits on input()
# for each question before printing the prediction.
get_user_input_and_predict()


In [None]:
# Offer the saved artifacts as browser downloads when running in Google Colab.
# The google.colab package cannot be imported in other environments, so the
# import is guarded to keep this cell from raising there.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download.")
else:
    # Download the model file
    files.download('../ml_model/random_forest_model.joblib')

    # Download the scaler file
    files.download('../ml_model/scaler.joblib')
