In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_val_score

Defining the features

In [90]:
data = pd.read_csv('data.csv')

# Questionnaire features: column positions 2..181 of the raw table.
X_questions = data.iloc[:, 2:182]
X_questions = X_questions.drop(columns='date_visit')  # The visit date is not relevant
# These positions (counted after 'date_visit' is removed) hold constant columns
X_questions = X_questions.drop(columns=X_questions.columns[[46, 133, 158, 161]])

# Drug features: every column from position 185 onwards.
X_drugs = data.iloc[:, 185:]
# These positions hold constant columns
X_drugs = X_drugs.drop(
    columns=X_drugs.columns[[50, 51, 61, 92, 101, 111, 114, 121, 137, 140, 141, 142, 143, 148, 151, 152]]
)

# Join both feature groups side by side. pd.concat keeps every column's name
# and dtype; a NumPy concatenation would discard the labels and coerce all
# columns to one common dtype, which hides bool columns from dtype checks.
X = pd.concat([X_questions, X_drugs], axis=1)

print(X_questions.shape)
print(X_drugs.shape)
print(X.shape)

(625, 175)
(625, 137)
(625, 312)


Defining the target

In [91]:
Y = data.iloc[:, 182:185]

# Pull each outcome column out as a flat 1-D array, in column order.
y_vas30, y_vas50, y_gic = (
    Y.iloc[:, position:position + 1].values.ravel() for position in range(3)
)

# The patient perceives improvement when either VAS30 or VAS50 is set.
y_perceived = np.logical_or(y_vas30, y_vas50)

# Final target: GIC AND perceived improvement. Both must hold, because the
# patient must perceive improvement and the doctor must agree.
y = np.logical_and(y_gic, y_perceived)

print(f"The percentage of ones inside y_gic is {(np.sum(y_gic)/y_gic.shape[0])*100:.2f}%")
print(f"The percentage of ones inside y_vas30 is {(np.sum(y_vas30)/y_vas30.shape[0])*100:.2f}%")
print(f"The percentage of ones inside y_vas50 is {(np.sum(y_vas50)/y_vas50.shape[0])*100:.2f}%")
print()
print(f"The percentage of ones inside y_perceived is {(np.sum(y_perceived)/y_perceived.shape[0])*100:.2f}%")
print(f"The percentage of ones inside y is {(np.sum(y)/y.shape[0])*100:.2f}%")
# It makes sense for the share of 1's in y to be low: as mentioned in our meeting,
# most patients who suffer from chronic pain show no improvement.


The percentage of ones inside y_gic is 28.96%
The percentage of ones inside y_vas30 is 43.84%
The percentage of ones inside y_vas50 is 35.84%

The percentage of ones inside y_perceived is 43.84%
The percentage of ones inside y is 15.04%


In [92]:
# Agreement of each patient-reported outcome with y_gic (treated as the
# reference labels), expressed as a percentage.
similarity_vas50 = accuracy_score(y_gic, y_vas50) * 100
similarity_vas30 = accuracy_score(y_gic, y_vas30) * 100
similarity_perceived = accuracy_score(y_gic, y_perceived) * 100

print(f"The similarity percentage between y_gic and y_vas50 is {similarity_vas50:.2f}%")
print(f"The similarity percentage between y_gic and y_vas30 is {similarity_vas30:.2f}%")
print(f"The similarity percentage between y_gic and y_perceived is {similarity_perceived:.2f}%")

The similarity percentage between y_gic and y_vas50 is 60.16%
The similarity percentage between y_gic and y_vas30 is 57.28%
The similarity percentage between y_gic and y_perceived is 57.28%


In [93]:
# Agreement of each individual outcome with the combined target y (treated as
# the reference labels), expressed as a percentage.
similarity_gic = accuracy_score(y, y_gic) * 100
similarity_vas50 = accuracy_score(y, y_vas50) * 100
similarity_vas30 = accuracy_score(y, y_vas30) * 100
similarity_perceived = accuracy_score(y, y_perceived) * 100

print(f"The similarity percentage between y and y_gic is {similarity_gic:.2f}%")
print(f"The similarity percentage between y and y_vas50 is {similarity_vas50:.2f}%")
print(f"The similarity percentage between y and y_vas30 is {similarity_vas30:.2f}%")
print(f"The similarity percentage between y and y_perceived is {similarity_perceived:.2f}%")

The similarity percentage between y and y_gic is 86.08%
The similarity percentage between y and y_vas50 is 74.08%
The similarity percentage between y and y_vas30 is 71.20%
The similarity percentage between y and y_perceived is 71.20%


Encoding the Boolean variables

In [94]:
def _bools_to_int(frame):
    """Return a copy of ``frame`` with every bool-dtype column cast to integers.

    False becomes 0 and True becomes 1. Columns of any other dtype (including
    object columns) are copied unchanged. The input frame is not modified.
    """
    encoded = frame.copy()
    for col in encoded.columns:
        if encoded[col].dtype == 'bool':
            encoded[col] = encoded[col].astype(int)
    return encoded


# Kept bound in case later cells still reference `le`; it is no longer needed
# for the feature encoding below.
le = LabelEncoder()

# A plain integer cast is used instead of LabelEncoder: LabelEncoder numbers the
# distinct values it sees, so a column containing only True would come out as 0.
X_questions = _bools_to_int(X_questions)
X_drugs = _bools_to_int(X_drugs)
X = _bools_to_int(X)

Imputing missing values

In [95]:
# Replace missing entries with the per-column mean, using a separately fitted
# imputer for each feature set.
# NOTE(review): the means are computed over all rows, i.e. before any
# train/test split is made.
X_questions = SimpleImputer(strategy='mean').fit_transform(X_questions)
X_drugs = SimpleImputer(strategy='mean').fit_transform(X_drugs)

# The imputer fitted on the combined matrix stays available as `imp`.
imp = SimpleImputer(strategy='mean')
X = imp.fit_transform(X)

In [120]:
def trainReferenceMLP(X, x_name, y, test_size=0.25, cv=5, hidden_units=137,
                      scoring=None, random_state=42):
    """Cross-validate a single-hidden-layer MLP on the training part of (X, y).

    The data is split into a training and a test partition, and the classifier
    is cross-validated on the training partition only; the test partition is
    not used by this function. The per-fold scores are printed together with
    their mean, standard deviation and variance, and are also returned.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    x_name : str
        Label for the feature set; used only in the printed report.
    y : array-like
        Target labels, one per row of ``X``.
    test_size : float, default 0.25
        Share of the samples set aside by ``train_test_split``.
    cv : int, default 5
        Number of cross-validation folds.
    hidden_units : int, default 137
        Width of the single hidden layer. 137 is the minimum number of
        features (presumably the width of the smallest feature set -- confirm).
    scoring : str, callable or None, default None
        Forwarded to ``cross_val_score``. ``None`` uses the classifier's
        default score (accuracy); pass e.g. ``'balanced_accuracy'`` when the
        classes are imbalanced.
    random_state : int, default 42
        Seed used for both the split and the classifier.

    Returns
    -------
    numpy.ndarray
        One score per cross-validation fold.
    """
    # Only the training partition is needed here; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # NOTE(review): the features are passed to the network as given, without scaling.
    clf = MLPClassifier(hidden_layer_sizes=(hidden_units,), solver='sgd', learning_rate_init=0.001,
                        max_iter=2000, random_state=random_state, verbose=False)

    # Perform k-fold cross-validation on the training partition
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring=scoring)
    mean = scores.mean()
    std_deviation = scores.std()
    variance = scores.var()

    print(f'Cross-validation scores for y using {x_name}: {scores}')
    print(f'mean: {mean: .4f}, std_deviation: {std_deviation: .4f}, variance: {variance: .4f}')
    print()

    return scores

In [121]:
# Evaluate the same reference model on each feature set in turn.
for features, label in (
    (X_questions, 'X_questions'),
    (X_drugs, 'X_drugs'),
    (X, 'X_questions and X_drugs'),
):
    trainReferenceMLP(features, label, y)

Cross-validation scores for y using X_questions: [0.85106383 0.82978723 0.84042553 0.83870968 0.84946237]
mean:  0.8419, std_deviation:  0.0077, variance:  0.0001

Cross-validation scores for y using X_drugs: [0.85106383 0.84042553 0.84042553 0.84946237 0.84946237]
mean:  0.8462, std_deviation:  0.0047, variance:  0.0000

Cross-validation scores for y using X_questions and X_drugs: [0.85106383 0.84042553 0.81914894 0.83870968 0.84946237]
mean:  0.8398, std_deviation:  0.0114, variance:  0.0001



In [122]:
def featureSelection(X, y, k):
    """Keep the ``k`` features of ``X`` that score best under ``f_classif`` against ``y``.

    Returns the reduced feature matrix produced by ``SelectKBest``.
    """
    selector = SelectKBest(score_func=f_classif, k=k)
    return selector.fit_transform(X, y)

In [123]:
# Reduce each feature set to its single best-scoring feature (k=1).
X_questions_selected, X_drugs_selected, X_selected = (
    featureSelection(features, y, 1) for features in (X_questions, X_drugs, X)
)

# Disabled follow-up: evaluate the reference MLP on the reduced feature sets.
# trainReferenceMLP(X_questions_selected, 'X_questions_selected', y)
# trainReferenceMLP(X_drugs_selected, 'X_drugs_selected', y)
# trainReferenceMLP(X_selected, 'X_questions_selected and X_drugs_selected', y)