In [18]:
# Import algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost import XGBClassifier

# Import validation 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score,confusion_matrix, mean_absolute_error , r2_score , mean_squared_error, mean_absolute_percentage_error
from sklearn.metrics import precision_recall_fscore_support

# Import other 
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

from ipynb.fs.full.DataVis import select_features;

Load the data

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X = pd.read_csv('./Data/fetal_health.csv')

Select the prediction target, remove the rows with missing target

In [20]:
# Remove rows with missing target, separate target from predictors
y = X.fetal_health              
X.drop(['fetal_health'], axis=1, inplace=True)

Break off validation set from training data

### Create baseline models
No preprocessing, normalisation, etc.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
def score_base_model(xtrain, ytrain, ytest,model):
    model.fit(xtrain, ytrain)
    y_preds = model.predict(X_test)
    base_score = accuracy_score(ytest, y_preds)
    return base_score.mean()

In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.gaussian_process.kernels import RBF

baseline_model_rf = RandomForestClassifier(random_state=0)

le = LabelEncoder()
y_train_xgb = y_train.copy()
y_test_xgb = y_test.copy()
y_train_xgb = le.fit_transform(y_train_xgb)
y_test_xgb = le.fit_transform(y_test_xgb)
baseline_model_XGboost = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=4)

baseline_model_GaussianNB = GaussianNB()

baseline_GaussianNB_score = score_base_model(X_train,y_train,y_test, baseline_model_GaussianNB)
baseline_rf_score = score_base_model(X_train,y_train,y_test,baseline_model_rf)
baseline_XGboost_score = score_base_model(X_train,y_train_xgb,y_test_xgb,baseline_model_XGboost)

In [24]:
print("Random Forest model accuracy: {}%".format((baseline_rf_score*100).round()))
print("XG boost model accuracy: {}%".format((baseline_XGboost_score*100).round()))
print("Gaussian NB model accuracy: {}%".format((baseline_GaussianNB_score*100).round()))

Random Forest model accuracy: 94.0%
XG boost model accuracy: 95.0%
Gaussian NB model accuracy: 82.0%


### Models using different feature selection methods

In [25]:
X_model_1, fs = select_features(X.copy(), y, method="KBest")

X_model_1_train, X_model_1_test, y_model_1_train, y_model_1_test = train_test_split(X_model_1, y)

le = LabelEncoder()
y_train_xgb = le.fit_transform(y_model_1_train)
y_test_xgb = le.fit_transform(y_model_1_test)
model_1_xgb = XGBClassifier(n_estimators=1000,learning_rate=0.05,n_jobs=4)
model_1_xgb.fit(X_model_1_train,y_train_xgb)
predictions_xgb = model_1_rf.predict(X_model_1_test)
score_model_1_xgb = accuracy_score(y_test_xgb, predictions_xgb)

In [26]:
print(score_model_1_xgb)

0.005639097744360902


In [10]:
X_model_1.shape

(2126, 18)