<h3>The following Notebook defines the model used for pregnancy risk classification</h3>
<p> This file should only be used for experimental or referential purposes, and not for updating the model deployed </p>

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [12]:
# Load the dataset from a CSV file
data = pd.read_csv('Maternal Health Risk Data Set.csv')

In [13]:
# Check for missing or invalid data and handle it appropriately
data.dropna(inplace=True)
if any(data.isnull().any()):
  raise ValueError('Invalid data detected')

# Split the data into feature columns and the target column
X = data[['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'BodyTemp', 'HeartRate']]
y = data['RiskLevel']

# Normalize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Check for imbalanced data and balance it using oversampling or undersampling
if len(y[y == 'high risk']) / len(y) < 0.1:
  # Use oversampling to balance the data
  from imblearn.over_sampling import SMOTE
  smote = SMOTE(random_state=0)
  X_resampled, y_resampled = smote.fit_sample(X, y)
else:
  # Use undersampling to balance the data
  from imblearn.under_sampling import RandomUnderSampler
  rus = RandomUnderSampler(random_state=0)
  X_resampled, y_resampled = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [14]:
print(X_test)

[[-0.36173812  1.45702716  1.6960543  -0.25091417 -0.4852155   0.70481475]
 [-0.65874416  0.36976548  0.25502279 -0.55468943 -0.4852155   0.21005383]
 [-0.65874416 -2.07657332 -1.90652448 -0.37242428 -0.4852155  -1.76898989]
 ...
 [ 1.12329206 -1.26112705 -1.18600873  2.81721597  1.70343448 -0.53208757]
 [-0.06473208  0.91339632 -0.46549297 -0.37242428 -0.4852155   0.45743429]
 [-0.9557502   0.36976548  0.25502279 -0.61544449  2.43298447  0.21005383]]


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [16]:
# Create and train the Random Forest model
model = RandomForestClassifier(n_estimators=200)

# Tune the hyperparameters of the model using GridSearchCV
param_grid = {'max_depth': [5, 10, 20], 'min_samples_split': [2, 5, 10]}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Use the best parameters from the grid search to train the model
model = grid_search.best_estimator_
model.fit(X_train, y_train)

# Make predictions on the testing data
predictions = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average='weighted')

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.89
F1 Score: 0.89


In [17]:
#Testing the model using a random sample
import numpy
arr = [-0.80724718, -1.26112705, -0.82575085,  0.69078914,  1.70343448, -0.53208757]
testData = numpy.array(arr)
print(testData)

[-0.80724718 -1.26112705 -0.82575085  0.69078914  1.70343448 -0.53208757]


In [20]:
#Necessary to reshape the input sample
prediction = model.predict(testData.reshape(1,-1))
print(prediction)

['high risk']


<b>DO NOT RUN THIS CELL UNLESS YOU WANT TO UPDATE THE MODEL. EVEN IF YOU DO, DO NOT PUSH IT TO THE REPO</b>

In [23]:
# Save the model
import pickle
with open('model.pkl', 'wb') as file:
  pickle.dump(model, file)
