## Task 1: A classification example: fetal heart condition diagnosis
***
## Group Members:
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
### Chukwudumebi Ubogu, e-mail: gusuboch@student.gu.se

### Step 1. Reading the data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CTG dataset, skipping the first line of the file.
data = pd.read_csv('CTG.csv', skiprows=1)

# Keep the 21 feature columns plus the NSP diagnosis column, and discard
# every row that has a missing value in any of them.
selected_cols = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP',
    'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance', 'Tendency',
    'NSP',
]
data = data.loc[:, selected_cols].dropna()

# Randomly reorder all rows; the fixed seed keeps the order repeatable.
data_shuffled = data.sample(frac=1.0, random_state=0)

# Input part X: everything except the NSP diagnosis column.
X = data_shuffled.drop(columns='NSP')

# Map the numeric NSP diagnosis code to a human-readable label.
def to_label(y):
    """Translate an NSP diagnosis code into its class name.

    Args:
        y: The diagnosis code; anything accepted by ``int()`` (e.g. ``1``
           or ``1.0``). Valid codes are 1, 2 and 3.

    Returns:
        'normal' for 1, 'suspect' for 2, 'pathologic' for 3.

    Raises:
        IndexError: If the code is not 1, 2 or 3. Plain list indexing would
            silently return None for 0 and wrap around for negative codes
            (e.g. -1 -> 'pathologic'), so the range is checked explicitly.
    """
    labels = ('normal', 'suspect', 'pathologic')
    code = int(y)
    if not 1 <= code <= len(labels):
        raise IndexError(f"unknown NSP diagnosis code: {y!r}")
    # Codes are 1-based, the tuple is 0-based.
    return labels[code - 1]

# Output part Y: the NSP code of every row translated to its class name.
Y = data_shuffled['NSP'].map(to_label)

# Hold out 20% of the rows as the test set; the fixed seed keeps the
# partition repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Preview the first rows of the feature matrix X; as the last expression
# of the cell, the result is rendered as the cell output.
X.head()

Unnamed: 0,LB,AC,FM,UC,DL,DS,DP,ASTV,MSTV,ALTV,...,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency
658,130.0,1.0,0.0,3.0,0.0,0.0,0.0,24.0,1.2,12.0,...,35.0,120.0,155.0,1.0,0.0,134.0,133.0,135.0,1.0,0.0
1734,134.0,9.0,1.0,8.0,5.0,0.0,0.0,59.0,1.2,0.0,...,109.0,80.0,189.0,6.0,0.0,150.0,146.0,150.0,33.0,0.0
1226,125.0,1.0,0.0,4.0,0.0,0.0,0.0,43.0,0.7,31.0,...,21.0,120.0,141.0,0.0,0.0,131.0,130.0,132.0,1.0,0.0
1808,143.0,0.0,0.0,1.0,0.0,0.0,0.0,69.0,0.3,6.0,...,27.0,132.0,159.0,1.0,0.0,145.0,144.0,146.0,1.0,0.0
825,152.0,0.0,0.0,4.0,0.0,0.0,0.0,62.0,0.4,59.0,...,25.0,136.0,161.0,0.0,0.0,159.0,156.0,158.0,1.0,1.0


In [5]:
# Show the shape of the input data X as (rows, columns).
X.shape

(2126, 21)

In [6]:
# Count how many samples carry each diagnosis label.
Y.value_counts()

normal        1655
suspect        295
pathologic     176
Name: NSP, dtype: int64

### Step 2. Training the baseline classifier

In [7]:
from sklearn.dummy import DummyClassifier

# Baseline model configured with the 'most_frequent' strategy; it serves as
# the reference point for the classifiers evaluated afterwards.
clf = DummyClassifier(strategy='most_frequent')

In [8]:
from sklearn.model_selection import cross_val_score

# Cross-validate the baseline classifier on the training split only.
scores = cross_val_score(estimator=clf, X=Xtrain, y=Ytrain)

# Report the mean accuracy together with a spread of two standard deviations.
print(
    "DummyClassifier Accuracy: %0.2f (+/- %0.2f)"
    % (scores.mean(), 2 * scores.std())
)

DummyClassifier Accuracy: 0.78 (+/- 0.00)


### Step 3. Trying out some different classifiers

In [9]:
# Tree-based classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Evaluate several tree-based classifiers using cross-validation.
def evaluate_tree_classifiers(Xtrain, Ytrain, random_state=0):
    """Cross-validate three tree-based classifiers and print their accuracy.

    A decision tree, a random forest and a gradient boosting classifier are
    each scored with ``cross_val_score`` on the given data. For every model
    one line is printed with the mean accuracy and twice the standard
    deviation of the per-fold scores.

    Args:
        Xtrain: Feature matrix used for cross-validation.
        Ytrain: Target labels aligned with ``Xtrain``.
        random_state: Seed passed to every classifier so that repeated runs
            print the same numbers. Pass ``None`` to get the unseeded
            behaviour of the estimators.

    Returns:
        None. The results are only printed.
    """
    # Each estimator gets the same seed; all three are randomised internally.
    classifiers = [
        ("DecisionTreeClassifier",
         DecisionTreeClassifier(random_state=random_state)),
        ("RandomForestClassifier",
         RandomForestClassifier(random_state=random_state)),
        ("GradientBoostingClassifier",
         GradientBoostingClassifier(random_state=random_state)),
    ]

    # Iterate over classifiers and perform cross-validation.
    for name, clf in classifiers:
        scores = cross_val_score(clf, Xtrain, Ytrain)
        # The name goes through %s rather than an f-string so that a single
        # formatting mechanism is used and a '%' in a name cannot break it.
        print("%s Accuracy: %0.2f (+/- %0.2f)"
              % (name, scores.mean(), scores.std() * 2))

# Run the comparison on the training split; the function prints one line per
# classifier and returns nothing.
evaluate_tree_classifiers(Xtrain, Ytrain)

DecisionTreeClassifier Accuracy: 0.92 (+/- 0.03)
RandomForestClassifier Accuracy: 0.94 (+/- 0.02)
GradientBoostingClassifier Accuracy: 0.95 (+/- 0.02)


In [10]:
# Linear classifiers: Perceptron
from sklearn.linear_model import Perceptron

# Cross-validate a Perceptron with default settings on the training split.
clf = Perceptron()
scores = cross_val_score(estimator=clf, X=Xtrain, y=Ytrain)

# Report the mean accuracy together with a spread of two standard deviations.
print(
    "Perceptron Accuracy: %0.2f (+/- %0.2f)"
    % (scores.mean(), 2 * scores.std())
)

Perceptron Accuracy: 0.83 (+/- 0.12)


In [11]:
# Linear classifiers: Logistic Regression
from sklearn.linear_model import LogisticRegression

# Cross-validate logistic regression (newton-cg solver) on the training split.
clf = LogisticRegression(solver='newton-cg')
scores = cross_val_score(estimator=clf, X=Xtrain, y=Ytrain)

# Report the mean accuracy together with a spread of two standard deviations.
print(
    "LogisticRegression Accuracy: %0.2f (+/- %0.2f)"
    % (scores.mean(), 2 * scores.std())
)

LogisticRegression Accuracy: 0.89 (+/- 0.02)


In [12]:
# Linear classifiers: LinearSVC
from sklearn.svm import LinearSVC

# Cross-validate a linear SVM (dual=False) on the training split.
clf = LinearSVC(dual=False)
scores = cross_val_score(estimator=clf, X=Xtrain, y=Ytrain)

# Report the mean accuracy together with a spread of two standard deviations.
print(
    "LinearSVC Accuracy: %0.2f (+/- %0.2f)"
    % (scores.mean(), 2 * scores.std())
)

LinearSVC Accuracy: 0.90 (+/- 0.03)


In [13]:
# Neural Network Classifier
from sklearn.neural_network import MLPClassifier

# Cross-validate a multi-layer perceptron on the training split. The network
# is randomly initialised, so a fixed random_state (the same seed used for
# the shuffle and the train/test split) keeps the printed score reproducible.
clf = MLPClassifier(random_state=0)
scores = cross_val_score(clf, Xtrain, Ytrain)
# Mean accuracy with a spread of two standard deviations over the folds.
print("MLPClassifier Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

MLPClassifier Accuracy: 0.87 (+/- 0.04)


### Step 4. Final evaluation

In [14]:
from sklearn.metrics import accuracy_score
# Final evaluation: fit gradient boosting on the whole training set and
# measure its accuracy once on the held-out test set.
# The estimator is randomised, so a fixed random_state (the same seed used
# for the shuffle and the train/test split) makes the reported score
# reproducible across runs.
clf = GradientBoostingClassifier(random_state=0)
clf.fit(Xtrain, Ytrain)
Yguess = clf.predict(Xtest)
print("GradientBoostingClassifier Accuracy: %0.2f" % accuracy_score(Ytest, Yguess))

GradientBoostingClassifier Accuracy: 0.93


## Task 2: Decision trees for classification