In [12]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split, KFold, cross_val_score 
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression   
from sklearn.impute import SimpleImputer  

In [10]:
url = "http://dibresources.jcbose.ac.in/ssaha4/pulmopred/public/training.csv.txt"
ds = pd.read_csv(url)
ds = ds[ds['Label'] != 'Non-obstructive']
X_without_diagnosis = ds.drop('Diagnosis', axis = 1)

X = X_without_diagnosis
Y = ds['Diagnosis']

label_encoder = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X[column] = label_encoder.fit_transform(X[column])

In [13]:
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

In [20]:
classifier = LogisticRegression(max_iter=1000)
k = 5
k_fold = KFold(n_splits=k)
scores = cross_val_score(classifier, X_imputed, Y, cv=k_fold) 

In [3]:
X_without_diagnosis.head()

Unnamed: 0,FEV1 Pre-BD Value,FEV1 Pre-BD Pred,FEV1 Post-BD Value,FEV1 Post-BD Pred,FVC Pre-BD Value,FVC Pre-BD Pred,FVC Post-BD Value,FVC Post-BD Pred,FEF Pre-BD Value,FEF Pre-BD Pred,FEF Post-BD Value,FEF Post-BD Pred,Label
0,1.09,36.0,1.21,40.0,2.34,62.0,2.64,70.0,0.42,11.0,0.41,10.0,Obstructive
1,2.7,93.0,2.77,95.0,3.11,95.0,3.12,95.0,3.09,114.0,3.43,127.0,Obstructive
2,0.42,13.56,0.48,14.29,0.89,23.35,1.12,25.84,0.34,,0.27,,Obstructive
3,1.96,82.63,2.2,92.63,2.33,84.21,2.56,92.52,1.96,55.95,2.3,65.66,Obstructive
4,2.56,91.72,2.83,101.39,2.78,87.99,3.04,96.22,2.74,81.82,3.88,115.87,Obstructive


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1) # 70% training and 30% test

In [17]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)


In [18]:
X_test

Unnamed: 0,FEV1 Pre-BD Value,FEV1 Pre-BD Pred,FEV1 Post-BD Value,FEV1 Post-BD Pred,FVC Pre-BD Value,FVC Pre-BD Pred,FVC Post-BD Value,FVC Post-BD Pred,FEF Pre-BD Value,FEF Pre-BD Pred,FEF Post-BD Value,FEF Post-BD Pred,Label
911,0.70,35.0,0.73,37.0,1.09,46.0,1.16,49.0,0.52,17.0,0.51,17.0,0
842,0.62,19.0,0.69,22.0,1.95,50.0,2.14,55.0,0.30,11.0,0.32,12.0,0
293,2.40,131.0,1.51,63.0,1.59,56.0,1.86,65.0,1.33,53.0,1.63,65.0,0
554,1.86,64.0,2.02,70.0,2.93,79.0,3.15,85.0,0.95,25.0,1.00,28.0,0
581,4.45,111.0,4.48,112.0,5.45,116.0,5.27,113.0,4.32,91.0,4.56,96.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,0.84,45.0,0.91,49.0,1.36,55.0,1.45,59.0,0.41,13.0,0.45,15.0,0
950,1.69,60.0,1.56,55.0,2.70,75.0,2.58,71.0,0.87,24.0,0.75,20.0,0
523,2.96,94.0,3.11,99.0,3.88,99.0,3.90,100.0,2.37,51.0,2.78,60.0,0
61,0.82,31.0,0.88,33.0,1.44,42.0,1.52,45.0,0.42,15.0,0.46,17.0,0


In [22]:
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
#print(y_pred)
print("Precision promedio: ", scores)

Accuracy: 0.7086092715231788
Precision promedio:  [0.69306931 0.76616915 0.67661692 0.53731343 0.59701493]


In [17]:
X_test.iloc[0]

FEV1 Pre-BD Value      0.70
FEV1 Pre-BD Pred      35.00
FEV1 Post-BD Value     0.73
FEV1 Post-BD Pred     37.00
FVC Pre-BD Value       1.09
FVC Pre-BD Pred       46.00
FVC Post-BD Value      1.16
FVC Post-BD Pred      49.00
FEF Pre-BD Value       0.52
FEF Pre-BD Pred       17.00
FEF Post-BD Value      0.51
FEF Post-BD Pred      17.00
Label                  0.00
Name: 911, dtype: float64

In [21]:
X_test.iloc[0:1]

Unnamed: 0,FEV1 Pre-BD Value,FEV1 Pre-BD Pred,FEV1 Post-BD Value,FEV1 Post-BD Pred,FVC Pre-BD Value,FVC Pre-BD Pred,FVC Post-BD Value,FVC Post-BD Pred,FEF Pre-BD Value,FEF Pre-BD Pred,FEF Post-BD Value,FEF Post-BD Pred,Label
911,0.7,35.0,0.73,37.0,1.09,46.0,1.16,49.0,0.52,17.0,0.51,17.0,0
842,0.62,19.0,0.69,22.0,1.95,50.0,2.14,55.0,0.3,11.0,0.32,12.0,0
293,2.4,131.0,1.51,63.0,1.59,56.0,1.86,65.0,1.33,53.0,1.63,65.0,0
554,1.86,64.0,2.02,70.0,2.93,79.0,3.15,85.0,0.95,25.0,1.0,28.0,0
581,4.45,111.0,4.48,112.0,5.45,116.0,5.27,113.0,4.32,91.0,4.56,96.0,0
248,1.16,58.0,1.28,65.0,1.58,64.0,1.72,70.0,0.88,45.0,0.99,51.0,0
806,0.64,20.0,0.69,21.0,1.68,42.0,1.72,43.0,0.27,6.0,0.27,6.0,0
555,1.47,57.0,1.75,67.0,1.81,58.0,2.23,71.0,1.47,42.0,1.62,46.0,0
329,2.72,70.0,3.13,81.0,3.77,83.0,4.34,85.0,1.92,44.0,2.28,50.0,0
49,2.62,60.0,3.13,72.0,3.4,63.0,3.68,69.0,2.19,48.0,3.33,73.0,0


In [23]:
clf.predict(X_test.iloc[0:1])

array(['Asthma'], dtype=object)

In [24]:
clf.predict([[0.65, 40, 0.5, 70, 2, 30, 4, 90, 0.3, 23, 0.4, 33, 1]])



array(['Asthma'], dtype=object)