In [6]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd


In [7]:
import os

In [8]:
# from google.colab import drive
# drive.mount('/content/drive')


In [9]:
mcd = pd.read_csv('MCD.csv')


In [10]:
mcd.columns

Index(['ClientID', 'CycleNumber', 'Group', 'CycleWithPeakorNot',
       'ReproductiveCategory', 'LengthofCycle', 'MeanCycleLength',
       'EstimatedDayofOvulation', 'LengthofLutealPhase', 'FirstDayofHigh',
       'TotalNumberofHighDays', 'TotalHighPostPeak', 'TotalNumberofPeakDays',
       'TotalDaysofFertility', 'TotalFertilityFormula', 'LengthofMenses',
       'MeanMensesLength', 'MensesScoreDayOne', 'MensesScoreDayTwo',
       'MensesScoreDayThree', 'MensesScoreDayFour', 'MensesScoreDayFive',
       'MensesScoreDaySix', 'MensesScoreDaySeven', 'MensesScoreDayEight',
       'MensesScoreDayNine', 'MensesScoreDayTen', 'MensesScoreDay11',
       'MensesScoreDay12', 'MensesScoreDay13', 'MensesScoreDay14',
       'MensesScoreDay15', 'TotalMensesScore', 'MeanBleedingIntensity',
       'NumberofDaysofIntercourse', 'IntercourseInFertileWindow',
       'UnusualBleeding', 'PhasesBleeding', 'IntercourseDuringUnusBleed',
       'Age', 'AgeM', 'Maristatus', 'MaristatusM', 'Yearsmarried', 'Wedding

In [11]:
# List of attributes to keep
attributes_to_keep = [
    'LengthofCycle',
    'MeanCycleLength',
    'LengthofMenses',
    'Age',
    'Height',
    'Weight',
    'Numberpreg',
    'Miscarriages',
    'Abortions',
    'Livingkids',
    'NumberofDaysofIntercourse',
    'BMI',
    'TotalDaysofFertility'# Keeping BMI as it's calculated from Height and Weight
]

# Dropping all other columns that are not in the list of attributes to keep
mcd = mcd[attributes_to_keep]

# Display the remaining columns
print(mcd.columns)

Index(['LengthofCycle', 'MeanCycleLength', 'LengthofMenses', 'Age', 'Height',
       'Weight', 'Numberpreg', 'Miscarriages', 'Abortions', 'Livingkids',
       'NumberofDaysofIntercourse', 'BMI', 'TotalDaysofFertility'],
      dtype='object')


In [12]:
mcd


Unnamed: 0,LengthofCycle,MeanCycleLength,LengthofMenses,Age,Height,Weight,Numberpreg,Miscarriages,Abortions,Livingkids,NumberofDaysofIntercourse,BMI,TotalDaysofFertility
0,29,27.33,5,36,63,120,3,0,0,3,5,21.254724111867,9
1,27,,5,,,,,,,,6,,6
2,29,,5,,,,,,,,5,,5
3,27,,5,,,,,,,,3,,6
4,28,,5,,,,,,,,5,,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1660,29,,8,,,,,,,,8,,10
1661,28,,6,,,,,,,,11,,9
1662,28,,5,,,,,,,,7,,9
1663,40,,6,,,,,,,,3,,


In [13]:
mcd.dtypes

LengthofCycle                 int64
MeanCycleLength              object
LengthofMenses               object
Age                          object
Height                       object
Weight                       object
Numberpreg                   object
Miscarriages                 object
Abortions                    object
Livingkids                   object
NumberofDaysofIntercourse    object
BMI                          object
TotalDaysofFertility         object
dtype: object

In [14]:

# Convert columns to numeric
numeric_columns = [
    'MeanCycleLength',
    'LengthofMenses',
    'Age',
    'Height',
    'Weight',
    'Numberpreg',
    'Miscarriages',
    'Abortions',
    'Livingkids',
    'NumberofDaysofIntercourse',
    'BMI'
]

for col in numeric_columns:
    mcd[col] = pd.to_numeric(mcd[col], errors='coerce')

# Check the data types after conversion
print(mcd.dtypes)

LengthofCycle                  int64
MeanCycleLength              float64
LengthofMenses               float64
Age                          float64
Height                       float64
Weight                       float64
Numberpreg                   float64
Miscarriages                 float64
Abortions                    float64
Livingkids                   float64
NumberofDaysofIntercourse    float64
BMI                          float64
TotalDaysofFertility          object
dtype: object


In [15]:
mcd[numeric_columns] = mcd[numeric_columns].fillna(mcd[numeric_columns].median())

In [16]:
mcd

Unnamed: 0,LengthofCycle,MeanCycleLength,LengthofMenses,Age,Height,Weight,Numberpreg,Miscarriages,Abortions,Livingkids,NumberofDaysofIntercourse,BMI,TotalDaysofFertility
0,29,27.33,5.0,36.0,63.0,120.0,3.0,0.0,0.0,3.0,5.0,21.254724,9
1,27,29.50,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,6.0,24.138503,6
2,29,29.50,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,5.0,24.138503,5
3,27,29.50,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,3.0,24.138503,6
4,28,29.50,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,5.0,24.138503,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1660,29,29.50,8.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,8.0,24.138503,10
1661,28,29.50,6.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,11.0,24.138503,9
1662,28,29.50,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,7.0,24.138503,9
1663,40,29.50,6.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,3.0,24.138503,


In [17]:
if 'BMI' not in mcd.columns:
    mcd['BMI'] = mcd['Weight'] / (mcd['Height'] ** 2)

def simulate_stress(length_of_cycle, mean_cycle_length):
    variation = abs(length_of_cycle - mean_cycle_length)
    if variation < 2:
        return 'Low'
    elif 2 <= variation < 5:
        return 'Medium'
    else:
        return 'High'

mcd['StressLevel'] = mcd.apply(lambda row: simulate_stress(row['LengthofCycle'], row['MeanCycleLength']), axis=1)
def simulate_activity(bmi):
    if bmi < 18.5:
        return 'Active'
    elif 18.5 <= bmi < 24.9:
        return 'Moderate'
    else:
        return 'Sedentary'

mcd['PhysicalActivity'] = mcd['BMI'].apply(simulate_activity)

# Save the updated dataset
mcd.to_csv('MCD_updated.csv', index=False)

print(mcd)

      LengthofCycle  MeanCycleLength  LengthofMenses   Age  Height  Weight  \
0                29            27.33             5.0  36.0    63.0   120.0   
1                27            29.50             5.0  30.5    65.0   145.5   
2                29            29.50             5.0  30.5    65.0   145.5   
3                27            29.50             5.0  30.5    65.0   145.5   
4                28            29.50             5.0  30.5    65.0   145.5   
...             ...              ...             ...   ...     ...     ...   
1660             29            29.50             8.0  30.5    65.0   145.5   
1661             28            29.50             6.0  30.5    65.0   145.5   
1662             28            29.50             5.0  30.5    65.0   145.5   
1663             40            29.50             6.0  30.5    65.0   145.5   
1664             24            29.50             5.0  30.5    65.0   145.5   

      Numberpreg  Miscarriages  Abortions  Livingkids  \
0     

In [18]:
mcd.isnull().sum().sum()

np.int64(0)

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
mcd['StressLevel'] = le.fit_transform(mcd['StressLevel'])
mcd['PhysicalActivity'] = le.fit_transform(mcd['PhysicalActivity'])


In [20]:
X = mcd.drop('TotalDaysofFertility', axis=1)
y = mcd['TotalDaysofFertility']

In [21]:
mcd['StressLevel']


0       1
1       2
2       1
3       2
4       1
       ..
1660    1
1661    1
1662    1
1663    0
1664    0
Name: StressLevel, Length: 1665, dtype: int64

In [22]:
mcd['PhysicalActivity']

0       1
1       1
2       1
3       1
4       1
       ..
1660    1
1661    1
1662    1
1663    1
1664    1
Name: PhysicalActivity, Length: 1665, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [24]:
y_train

44      14
1128     5
1008    17
1279     8
914      6
        ..
604     13
1239     7
1025     7
270      8
945      6
Name: TotalDaysofFertility, Length: 1332, dtype: object

In [25]:
import pandas as pd

# Sample DataFrame creation (replace with your actual DataFrame)
# df = pd.DataFrame(...)  # Load or create your DataFrame

# Check for any non-numeric values in the DataFrame
non_numeric = mcd.applymap(lambda x: isinstance(x, str))
if non_numeric.any().any():
    print("There are strings in the DataFrame.")
    # Get the locations of the strings
    print(non_numeric[non_numeric].index.tolist())
else:
    print("No strings found in the DataFrame.")


There are strings in the DataFrame.
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214

  non_numeric = mcd.applymap(lambda x: isinstance(x, str))


In [26]:
X_train

Unnamed: 0,LengthofCycle,MeanCycleLength,LengthofMenses,Age,Height,Weight,Numberpreg,Miscarriages,Abortions,Livingkids,NumberofDaysofIntercourse,BMI,StressLevel,PhysicalActivity
44,28,29.5,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,5.0,24.138503,1,1
1128,26,29.5,4.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,1.0,24.138503,2,1
1008,27,29.5,6.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,10.0,24.138503,2,1
1279,32,29.5,2.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,0.0,24.138503,2,1
914,28,29.5,7.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,0.0,24.138503,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,36,29.5,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,4.0,24.138503,0,1
1239,28,29.5,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,0.0,24.138503,1,1
1025,32,29.5,5.0,30.5,65.0,145.5,2.0,0.0,0.0,2.0,4.0,24.138503,2,1
270,25,25.8,5.0,31.0,68.0,170.0,3.0,1.0,0.0,2.0,5.0,25.845588,1,2


In [27]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [28]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Ensure X_train_scaled and y_train are pandas DataFrames/Series
if isinstance(X_train_scaled, np.ndarray):
    X_train_scaled = pd.DataFrame(X_train_scaled)
if isinstance(y_train, np.ndarray):
    y_train = pd.Series(y_train)

# Print shapes to check for consistency
print("Original X_train_scaled shape:", X_train_scaled.shape)
print("Original y_train shape:", y_train.shape)

# Align indices of X_train_scaled and y_train by finding the common indices
common_index = X_train_scaled.index.intersection(y_train.index)
X_train_scaled = X_train_scaled.loc[common_index]
y_train = y_train.loc[common_index]

# Check for missing values
print("Missing values in X_train_scaled:", X_train_scaled.isnull().sum().sum())
print("Missing values in y_train:", y_train.isnull().sum())

# Remove rows with missing values in X_train_scaled
X_train_scaled = X_train_scaled.dropna()
y_train = y_train.loc[X_train_scaled.index]  # Align y_train with cleaned X_train_scaled

# Convert y_train to numeric, handling errors
y_train = pd.to_numeric(y_train, errors='coerce')

# Remove rows with NaN values in y_train after conversion
y_train = y_train.dropna()
X_train_scaled = X_train_scaled.loc[y_train.index]  # Align X_train_scaled with cleaned y_train

# Check shapes again after cleaning
print("Cleaned X_train_scaled shape:", X_train_scaled.shape)
print("Cleaned y_train shape:", y_train.shape)

# Fit the model
mcd = LinearRegression()

# Fit the model to the cleaned data
mcd.fit(X_train_scaled, y_train)

# Print a success message
print("Model training completed successfully.")


Original X_train_scaled shape: (1332, 14)
Original y_train shape: (1332,)
Missing values in X_train_scaled: 0
Missing values in y_train: 0
Cleaned X_train_scaled shape: (1047, 14)
Cleaned y_train shape: (1047,)
Model training completed successfully.


In [29]:
print(y_train.unique())


[ 9.  5.  6.  8. 11. 10. 15.  7. 13. 14.  0. 12.  4.  2. 16. 18. 19. 17.
  3. 22. 21.  1. 27. 20.]


In [30]:
# Replace empty strings with NaN
y_train = y_train.replace(' ', np.nan)

# Drop rows with NaN values
y_train = y_train.dropna()

# Convert to numeric values
y_train = pd.to_numeric(y_train, errors='coerce')


In [31]:
print(y_train.dtypes)


float64


In [32]:
mcd.fit(X_train_scaled, y_train)

In [33]:
mcd

In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Ensure y_pred is defined - this is just an example,
# you'll need to generate predictions from your model
y_pred = mcd.predict(X_test_scaled) # Assuming you have X_test_scaled and mcd defined

# Convert y_test and y_pred to pandas Series if they are numpy arrays
if isinstance(y_test, np.ndarray):
    y_test = pd.Series(y_test)
if isinstance(y_pred, np.ndarray):
    y_pred = pd.Series(y_pred)

# Check for non-numeric values in y_test and y_pred
print("Unique values in y_test:", y_test.unique())
print("Unique values in y_pred:", y_pred.unique())

# Replace non-numeric values with NaN, then drop NaNs
y_test = pd.to_numeric(y_test, errors='coerce')
y_pred = pd.to_numeric(y_pred, errors='coerce')

# Drop NaN values
valid_indices = ~y_test.isna() & ~y_pred.isna()
y_test = y_test[valid_indices]
y_pred = y_pred[valid_indices]

# Print shapes to ensure they match
print("Cleaned y_test shape:", y_test.shape)
print("Cleaned y_pred shape:", y_pred.shape)

# Calculate Mean Squared Error and R^2 Score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Unique values in y_test: ['9' '6' '7' '5' '8' '13' '11' '0' '10' '15' '14' '12' '16' '2' '20' ' '
 '24' '19' '17' '4']
Unique values in y_pred: [ 7.65458113  8.09738877  8.10720344  7.69051459  8.08241486  8.1576573
  8.12276183  8.10808015  7.759983    7.74554324  8.05741468  7.62471394
  7.79458624  8.11265492  8.07297307  8.06781382  7.66572602  8.05799916
  7.64971411  8.10778791  7.71035552  7.69546224  8.52468836  7.76051715
  7.77025118  8.4647927   7.7454626   7.57588253  8.46141719  8.07762848
  7.69463586  7.76463841  8.17770983  8.57317999  7.70931753  7.75011801
  7.73078092  8.05246703  8.5195291   7.63577819  7.67067367  8.51636519
  8.12255022  8.13257649  7.35950876  7.85981764  8.13249585  8.03778536
  7.62030044  7.74038398  7.68460959  7.98246275  7.71530317  7.63061894
  8.02281144  7.68048833  8.07784008  7.77437244  7.72482559  7.66522218
  7.76043651  8.06315841  8.12821331  8.08299934  8.41125556  8.08794699
  7.67524844  8.89843523  8.05770692  7.69538161  8.50

In [35]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42)
}

# Step 7: Train and evaluate each model
best_model = None
best_score = float('inf')

for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test_scaled)

    # Ensure both arrays have the same length
    y_pred = y_pred[:len(y_test)] # Trim y_pred to match y_test length

    # Calculate the Mean Squared Error and R^2 score
    mse = mean_squared_error(y_test, y_pred) # Use y_test, not y_test_original
    r2 = r2_score(y_test, y_pred)

    # Print the results
    print(f"{name} - MSE: {mse:.4f}, R^2: {r2:.4f}")

    # Select the best model based on MSE
    if mse < best_score:
        best_score = mse
        best_model = model

LinearRegression - MSE: 6.5849, R^2: 0.0084
Ridge - MSE: 6.5864, R^2: 0.0082
Lasso - MSE: 6.6459, R^2: -0.0008
DecisionTree - MSE: 10.7769, R^2: -0.6228
RandomForest - MSE: 9.3639, R^2: -0.4101


In [36]:
# Fine-tune the best model
if isinstance(best_model, (RandomForestRegressor, DecisionTreeRegressor)):
    param_grid = {
        'RandomForest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        },
        'DecisionTree': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        }
    }

    grid_search = GridSearchCV(best_model, param_grid[type(best_model).__name__], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)

    print(f"Best Parameters: {grid_search.best_params_}")

    # Use the best model from GridSearchCV
    best_model = grid_search.best_estimator_

# Evaluate the final tuned model
y_pred = best_model.predict(X_test_scaled)

# Ensure both arrays have the same length - this is crucial!
y_pred = y_pred[:len(y_test)] # Trim y_pred to match y_test length

final_mse = mean_squared_error(y_test, y_pred)
final_r2 = r2_score(y_test, y_pred) # Calculate and assign final R^2 score

print(f"Final MSE: {final_mse:.4f}, Final R^2: {final_r2:.4f}") # Print the final MSE and R^2 score

Final MSE: 6.5849, Final R^2: 0.0084


In [37]:
# Print the results of the final model
print(f"Best Model: {type(best_model).__name__}")
print(f"Final Test Set MSE: {final_mse:.4f}")
print(f"Final Test Set R^2: {final_r2:.4f}")


Best Model: LinearRegression
Final Test Set MSE: 6.5849
Final Test Set R^2: 0.0084


In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load your data
mcd = pd.read_csv('MCD.csv')
attributes_to_keep = [
    'LengthofCycle',
    'MeanCycleLength',
    'LengthofMenses',

    'Age',
    'Height',
    'Weight',
    'Numberpreg',
    'Miscarriages',
    'Abortions',
    'Livingkids',
    'NumberofDaysofIntercourse',
    'BMI',
    'TotalDaysofFertility'
]
mcd = mcd[attributes_to_keep]
numeric_columns = attributes_to_keep[:-1]  # Exclude target column

# Convert columns to numeric and handle missing values
mcd[numeric_columns] = mcd[numeric_columns].apply(pd.to_numeric, errors='coerce')
mcd[numeric_columns] = mcd[numeric_columns].fillna(mcd[numeric_columns].median())

X = mcd.drop('TotalDaysofFertility', axis=1)
y = mcd['TotalDaysofFertility']

# Convert target variable to numeric, handling errors
y = pd.to_numeric(y, errors='coerce')

# Drop rows with NaN values in target variable
X = X.loc[~y.isna()]
y = y.dropna()

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit the model
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

def preprocess_features(features):
    """
    Preprocess the input features.
    """
    features_df = pd.DataFrame([features])
    required_columns = X.columns.tolist()  # Use columns from the original data
    for col in required_columns:
        if col not in features_df.columns:
            features_df[col] = 0  # or another default value
    features_df = features_df[required_columns]
    features_scaled = scaler.transform(features_df)
    return features_scaled

def predict_total_days_of_fertility(features):
    """
    Predicts TotalDaysofFertility based on the input features.
    """
    preprocessed_features = preprocess_features(features)
    prediction = model.predict(preprocessed_features)
    return prediction[0]  # Return the prediction

def get_user_input():
    """
    Prompt the user for input and return it as a dictionary.
    """
    def prompt_for_float(prompt):
        while True:
            try:
                return float(input(prompt))
            except ValueError:
                print("Invalid input. Please enter a numeric value.")
    
    features = {}
    features['LengthofCycle'] = prompt_for_float("Enter Length of Cycle: ")
    features['MeanCycleLength'] = prompt_for_float("Enter Mean Cycle Length: ")
    features['LengthofMenses'] = prompt_for_float("Enter Length of Menses: ")
    features['Age'] = prompt_for_float("Enter Age: ")
    features['Height'] = prompt_for_float("Enter Height (in meters): ")
    features['Weight'] = prompt_for_float("Enter Weight (in kg): ")
    features['Numberpreg'] = prompt_for_float("Enter Number of Pregnancies: ")
    features['Miscarriages'] = prompt_for_float("Enter Number of Miscarriages: ")
    features['Abortions'] = prompt_for_float("Enter Number of Abortions: ")
    features['Livingkids'] = prompt_for_float("Enter Number of Living Kids: ")
    features['NumberofDaysofIntercourse'] = prompt_for_float("Enter Number of Days of Intercourse: ")
    features['BMI'] = prompt_for_float("Enter BMI: ")

    return features


# Example usage
user_features = get_user_input()
predicted_fertility_days = predict_total_days_of_fertility(user_features)
print(f"Predicted TotalDaysofFertility: {predicted_fertility_days:.2f}")


Predicted TotalDaysofFertility: 8.57


In [40]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle

# Assuming 'mcd' is your DataFrame
mcd = pd.read_csv('MCD.csv')
numeric_columns = [
    'LengthofCycle', 'MeanCycleLength', 'LengthofMenses',
    'Age', 'Height', 'Weight', 'Numberpreg', 'Miscarriages',
    'Abortions', 'Livingkids', 'NumberofDaysofIntercourse', 'BMI'
]
mcd[numeric_columns] = mcd[numeric_columns].apply(pd.to_numeric, errors='coerce')
mcd[numeric_columns] = mcd[numeric_columns].fillna(mcd[numeric_columns].median())

X = mcd[numeric_columns]

# Initialize and fit the scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [41]:
import pickle
from sklearn.ensemble import RandomForestRegressor

# Assuming 'X_train_scaled' and 'y_train' are prepared
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)

# Save the model
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
