In [1]:
from dataclasses import dataclass
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from sklearn.metrics import auc
from sklearn.metrics import roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Set model parameters

In [2]:
# Set outcome
outcome = 'pH'

# Threshold to define outcome as abnormal or not
outcome_threshold = 7.15

## Load data

In [3]:
# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Singleton object for storing paths to data and database.'''

    data = './ctu-data_csv'
    meta = 'metadata.csv'


paths = Paths()

In [4]:
# Set up dictionary for CSV files
csv_files = dict()

# Load files into dictionary, but remove metadata
filenames = os.listdir(paths.data)
filenames.remove(paths.meta)
for file in filenames:
    # Get filename and file extension
    filename, file_extension = os.path.splitext(os.path.join(paths.data, file))
    # Load data and save to dict with filename (without path/csv) as index
    if file_extension == '.csv':
        filename_short = filename.split(os.sep)[-1]
        csv_files[filename_short] =  pd.read_csv(os.path.join(paths.data, file))
        
print(csv_files.keys())

dict_keys(['1012', '1006', '1210', '1204', '2043', '1238', '1402', '1364', '1370', '1416', '1358', '1199', '1166', '1172', '1173', '1167', '1198', '1359', '1371', '1417', '1403', '1365', '1239', '2042', '1205', '1211', '1007', '1013', '1005', '1011', '1039', '1207', '1213', '2040', '1398', '1415', '1373', '1367', '1401', '1429', '1171', '1165', '1159', '1158', '1164', '1170', '1428', '1366', '1400', '1414', '1372', '1399', '2041', '1212', '1206', '1038', '1010', '1004', '1028', '1014', '2045', '1202', '1216', '1389', '1438', '1376', '1410', '1404', '1362', '1148', '1174', '1160', '1161', '1175', '1149', '1405', '1363', '1377', '1411', '1439', '1388', '1217', '1203', '2044', '1015', '1001', '1029', '1017', '1003', '1229', '2046', '1215', '1201', '1349', '1361', '1407', '1413', '1375', '1188', '1163', '1177', '1176', '1162', '1189', '1412', '1374', '1360', '1406', '1348', '1200', '1214', '1228', '1002', '1016', '1071', '1065', '1059', '1298', '1273', '1501', '2008', '1267', '2020', '2034

In [5]:
# Load meta data and transform
metadata = pd.read_csv(os.path.join(paths.data, paths.meta),
                       index_col='parameter')
metadata.head()

Unnamed: 0_level_0,1220,1234,1208,1038,1004,1010,1022,1036,2041,1206,...,1079,1290,1284,1077,1063,1088,1253,1247,2028,2014
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pH,7.3,7.29,7.23,7.33,7.3,7.35,7.28,7.08,7.37,7.24,...,7.17,7.36,7.16,7.22,7.16,7.25,7.32,7.32,7.18,7.32
BDecf,3.52,2.5,5.84,2.72,5.19,5.2,1.53,8.11,3.69,2.06,...,7.91,3.88,5.07,6.69,5.56,2.58,0.89,-0.86,4.82,2.28
pCO2,6.0,6.5,6.6,5.7,5.5,4.7,7.0,9.3,4.8,7.7,...,7.1,4.9,8.5,6.5,8.3,7.3,6.4,6.9,8.1,6.0
BE,-4.7,-4.2,-7.4,-4.0,-6.4,-5.9,-3.0,-11.2,-3.1,-4.0,...,-9.9,-4.1,-7.3,-8.0,-7.9,-4.5,-1.7,-1.0,-7.2,-3.2
Apgar1,9.0,8.0,9.0,10.0,8.0,8.0,10.0,8.0,9.0,9.0,...,8.0,9.0,10.0,6.0,9.0,8.0,8.0,9.0,8.0,10.0


## calculate expert feature

In [12]:
from utils.calculate_expert_feature import calculate_features
from utils.clean_fhr import clean_fhr
fhr_dict = {}
feature_dict = {}
# ct = 0
for key, value in csv_files.items():
    # ct += 1
    # print(f'{key}/{value}')
    fhr_dict[key] = clean_fhr(value.FHR)
    feature_dict[key] = calculate_features(fhr_dict[key])
    # if True:
        # print(f'{ct}/{len(csv_files.items())}')
        


KeyboardInterrupt: 

## Define x and y

In [7]:
# Set x as the signals from the dictionary
X_feature = list(feature_dict.values())
print(type(X_feature[0]))

# Y is a boolean, true/false for each depending on outcome
# We set order of metadata according to dictionary keys so they match up
y = (metadata[X_feature.keys()].loc[outcome] < outcome_threshold).values
y = y.astype(np.int8)
print(y)

NameError: name 'feature_dict' is not defined

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X_feature, y, test_size = 0.25, random_state=42)

print(np.bincount(y_train))
print(np.bincount(y_test))

NameError: name 'X_feature' is not defined

## Scale data

In [9]:
def scale_data(X_train, X_test):
    """Scale data 0-1 based on min and max in training set"""
    
    # Initialise a new scaling object for normalising input data
    sc = MinMaxScaler()

    # Set up the scaler just on the training set
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_sc = sc.transform(X_train)
    test_sc = sc.transform(X_test)
    
    return train_sc, test_sc

In [10]:
# Scale X data
X_train_sc, X_test_sc = scale_data(X_train, X_test)

n_classes = len(np.unique(y_train))
print('the number of train data', X_train_sc.shape)
print('the number of test data', X_test_sc.shape)
print(type(X_train_sc))

NameError: name 'X_train' is not defined

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from utils import evaluate
# from xgboost import XGBClassifier
# KNN
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train_sc, y_train)

y_pred = model_knn.predict(X_test_sc)
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

evaluate.evaluate_model(model_knn, X_test_sc, y_test)

NameError: name 'X_train_sc' is not defined

## FHR classify base on SVM 

In [12]:
model_svc = SVC(kernel='linear')
model_svc.fit(X_train_sc, y_train)

evaluate.evaluate_model(model_svc, X_test_sc, y_test)

NameError: name 'X_train_sc' is not defined

In [13]:
model = SVC(kernel='rbf')
model.fit(X_train_sc, y_train)

evaluate.evaluate_model(model, X_test_sc, y_test)

NameError: name 'X_train_sc' is not defined

## Random Forest Classifier

In [14]:
model_rfc = RandomForestClassifier()
model_rfc.fit(X_train_sc, y_train)

evaluate.evaluate_model(model_rfc, X_test_sc, y_test)

NameError: name 'X_train_sc' is not defined

## XGBoost

In [15]:
from xgboost import XGBClassifier
model_xg = XGBClassifier(eval_metric='mlogloss', objective='binary:logistic', use_label_encoder=False)
model_xg.fit(X_train_sc, y_train)

evaluate.evaluate_model(model_xg, X_test_sc, y_test)

ModuleNotFoundError: No module named 'xgboost'

In [16]:
evaluate.plot_auc_roc([('XGBoost', model_xg), ('RandomForest', model_rfc), ('SupportVectorMachines', model_svc), ('KNN', model_knn)], X_test_sc, y_test)

NameError: name 'model_xg' is not defined