## Data Loading

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import numpy as np

# Read the semicolon-separated dataset into a DataFrame
dataframe = pd.read_csv("data.csv", sep=";")

# Report the size of what was loaded
n_rows, n_cols = dataframe.shape
print(f"We got {n_rows} rows and {n_cols} columns")

## Continuous feature


### Data cleaning

In [None]:
# Select the continuous measurements (by column position) and the label column.
features = dataframe.iloc[:, [2,4,6,8,10,12,14,16,18,20,21,24,26,28,30,32,33,35,36,37,38,41,42]]
target = dataframe.iloc[:, [43]]

# CRP cells whose text starts with '<' are replaced by 0 below
# (presumably readings under the assay's detection limit -- TODO confirm).
# The header is used exactly as it appears in the file, trailing spaces included.
crp_col = 'CRP    '
# astype(str) makes the test safe for missing or already-numeric cells, which
# would otherwise produce NaN in the mask and make it unusable for indexing.
crp_below_limit = features[crp_col].astype(str).str.startswith('<')

# Treat any non-numeric text as NaN. apply() returns a brand-new frame, so the
# writes below never touch a slice of `dataframe`; no SettingWithCopyWarning is
# raised and the global warning suppression is no longer needed.
features = features.apply(pd.to_numeric, errors='coerce')

# The column is numeric at this point, so the zero is written into a float column.
features.loc[crp_below_limit, crp_col] = 0

# Replace the remaining NaN by the column median
imp = SimpleImputer(missing_values=pd.NA, strategy='median')
# Keep the original row index so that `features` stays aligned with `target`.
features = pd.DataFrame(
    imp.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
features

### PCA for fun

In [None]:
# The feature space has more than 10 dimensions, which exposes the analysis to the
# curse of dimensionality (https://en.wikipedia.org/wiki/Curse_of_dimensionality).
# PCA is used here to look for the most relevant directions.

# Standardise every feature column before running PCA
scaledFeatures = pd.DataFrame(
    data=preprocessing.scale(features),
    columns=features.columns,
)

# Keep all components (no n_components given)
pca = PCA()
principalComponents = pca.fit_transform(scaledFeatures)

# Cumulative share of variance explained as components are added
cumulative_variance = pca.explained_variance_ratio_.cumsum()
print (f"percentage of precision with the number of components :\n {cumulative_variance}")

# To inspect how the original features load on each component:
#   pd.DataFrame(pca.components_, columns=features.columns)

# Projected samples side by side with the label column
principalDf = pd.DataFrame(data=principalComponents)
finalDf = pd.concat([principalDf, target], axis=1)

In [None]:
# Plot the first 2 principal components, coloured by the target label
fig = plt.figure()
ax = fig.add_subplot()
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)

# The last column of finalDf is the target; selecting it with a scalar
# position yields a boolean Series directly (no squeeze() needed).
cond = finalDf.iloc[:, -1] == 0
subset_a = finalDf[cond]
subset_b = finalDf[~cond]

# Columns 0 and 1 hold the first and second principal components. The previous
# code plotted columns 1 and 2 (components 2 and 3), contradicting the labels.
ax.scatter(subset_a.iloc[:, 0], subset_a.iloc[:, 1], s=60, c='b', label='Sterile')
ax.scatter(subset_b.iloc[:, 0], subset_b.iloc[:, 1], s=60, c='r', label='Infected')
ax.legend()
ax.grid()

### Feature selection technique

In [None]:
# Correlation matrix of the standardised features together with the target
import seaborn as sns

# Work on a copy so that scaledFeatures itself does not gain a 'target' column
scaledFeatures2 = scaledFeatures.copy()
scaledFeatures2['target'] = target
cor = scaledFeatures2.corr()

# Large canvas so the per-cell annotations stay readable
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 12))
sns.heatmap(cor, annot=True, ax=heatmap_ax)

In [None]:
# ExhaustiveFeatureSelector: left disabled because the exhaustive search takes an eternity to run
# from mlxtend.feature_selection import ExhaustiveFeatureSelector
# from sklearn.ensemble import RandomForestClassifier
# X_train, X_test, Y_train,Y_test = train_test_split(scaledFeatures, target, test_size=0.2)
# efs=ExhaustiveFeatureSelector(RandomForestClassifier(),min_features=2, max_features=4,scoring = 'roc_auc',print_progress=True)
# efs = efs.fit(scaledFeatures, target.values.ravel())
# print('Best accuracy score: %.2f' % efs.best_score_)
# print('Best subset (indices):', efs.best_idx_)
# print('Best subset (corresponding names):', efs.best_feature_names_)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter (threshold 0.1) on the unscaled features;
# fit() returns the selector itself, so construction and fitting are chained.
v_Treshold = VarianceThreshold(threshold=0.1).fit(features)

# Boolean mask, one entry per feature column, of what the selector keeps
v_Treshold.get_support()

### Model fitting

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

# Sequential feature selection driven by a 10-nearest-neighbours classifier;
# with n_features_to_select='auto', `tol` controls when the search stops.
knn = KNeighborsClassifier(n_neighbors=10)
ffs = SequentialFeatureSelector(
    estimator=knn,
    n_features_to_select='auto',
    tol=1e-6,
).fit(scaledFeatures.to_numpy(), target.to_numpy().ravel())

# Boolean mask of the selected feature columns
ffs.get_support()

In [None]:
# Let's try to classify anyway, following https://medium.com/thrive-in-ai/classification-algorithms-in-python-5f58a7a27b88

from sklearn.model_selection import train_test_split

In [None]:
# Using Logistic Regression (a classifier, despite the "regression" in its name)
from sklearn.linear_model import LogisticRegression

# Fixed seed so the split, and therefore the reported score, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Learn the standardisation from the training rows only and reuse it on the
# held-out rows. Scaling each split with its own mean/std puts train and test
# on different scales and lets test-set statistics leak into the evaluation.
scaler = preprocessing.StandardScaler().fit(X_train)

# NOTE: the variable name `lr_clf` is shared by all model cells in this notebook.
lr_clf = LogisticRegression().fit(scaler.transform(X_train), Y_train.values.ravel())

# Mean accuracy on the held-out 20 %
lr_clf.score(scaler.transform(X_test), Y_test.values.ravel())

In [None]:
# Using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the split, and therefore the reported score, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(scaledFeatures, target, test_size=0.2, random_state=42)

# Re-standardise using the training rows only and apply the same transform to
# the held-out rows. Scaling each split with its own mean/std is inconsistent,
# and re-fitting on the training rows also discards the full-dataset statistics
# that were used to build scaledFeatures.
scaler = preprocessing.StandardScaler().fit(X_train)

# NOTE: the name `lr_clf` is kept for compatibility with the other model cells,
# although this estimator is a random forest. The forest is seeded as well.
lr_clf = RandomForestClassifier(random_state=42).fit(scaler.transform(X_train), Y_train.values.ravel())

# Mean accuracy on the held-out 20 %
lr_clf.score(scaler.transform(X_test), Y_test.values.ravel())

In [None]:
# Using Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fixed seed so the split, and therefore the reported score, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(scaledFeatures, target, test_size=0.2, random_state=42)

# Re-standardise using the training rows only and apply the same transform to
# the held-out rows. Scaling each split with its own mean/std is inconsistent,
# and re-fitting on the training rows also discards the full-dataset statistics
# that were used to build scaledFeatures.
scaler = preprocessing.StandardScaler().fit(X_train)

# NOTE: the name `lr_clf` is kept for compatibility with the other model cells,
# although this estimator is a naive Bayes classifier.
lr_clf = GaussianNB().fit(scaler.transform(X_train), Y_train.values.ravel())

# Mean accuracy on the held-out 20 %
lr_clf.score(scaler.transform(X_test), Y_test.values.ravel())

In [None]:
# Using Nearest Neighbours Classification
from sklearn import neighbors

# Fixed seed so the split, and therefore the reported score, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(scaledFeatures, target, test_size=0.2, random_state=42)

# Re-standardise using the training rows only and apply the same transform to
# the held-out rows. Scaling each split with its own mean/std is inconsistent,
# and re-fitting on the training rows also discards the full-dataset statistics
# that were used to build scaledFeatures.
scaler = preprocessing.StandardScaler().fit(X_train)

# NOTE: the name `lr_clf` is kept for compatibility with the other model cells,
# although this estimator is a k-nearest-neighbours classifier.
lr_clf = neighbors.KNeighborsClassifier().fit(scaler.transform(X_train), Y_train.values.ravel())

# Mean accuracy on the held-out 20 %
lr_clf.score(scaler.transform(X_test), Y_test.values.ravel())

In [None]:
# Using SVM Classifier (linear kernel)
from sklearn import svm

# Fixed seed so the split, and therefore the reported score, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(scaledFeatures, target, test_size=0.2, random_state=42)

# Re-standardise using the training rows only and apply the same transform to
# the held-out rows. Scaling each split with its own mean/std is inconsistent,
# and re-fitting on the training rows also discards the full-dataset statistics
# that were used to build scaledFeatures.
scaler = preprocessing.StandardScaler().fit(X_train)

# NOTE: the name `lr_clf` is kept for compatibility with the other model cells,
# although this estimator is a support vector classifier.
lr_clf = svm.SVC(kernel = 'linear').fit(scaler.transform(X_train), Y_train.values.ravel())

# Mean accuracy on the held-out 20 %
lr_clf.score(scaler.transform(X_test), Y_test.values.ravel())

In [None]:
# Using gradient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier

# Fixed seed so the split, and therefore the reported score, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(scaledFeatures, target, test_size=0.2, random_state=42)

# Re-standardise using the training rows only and apply the same transform to
# the held-out rows. Scaling each split with its own mean/std is inconsistent,
# and re-fitting on the training rows also discards the full-dataset statistics
# that were used to build scaledFeatures.
scaler = preprocessing.StandardScaler().fit(X_train)

# NOTE: the name `lr_clf` is kept for compatibility with the other model cells,
# although this estimator is a gradient boosting classifier. It is seeded as well.
lr_clf = GradientBoostingClassifier(random_state=42).fit(scaler.transform(X_train), Y_train.values.ravel())

# Mean accuracy on the held-out 20 %
lr_clf.score(scaler.transform(X_test), Y_test.values.ravel())

In [None]:
# Using a decision tree classifier
from sklearn import tree

# Fixed seed so the split, and therefore the reported score, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(scaledFeatures, target, test_size=0.2, random_state=42)

# Re-standardise using the training rows only and apply the same transform to
# the held-out rows. Scaling each split with its own mean/std is inconsistent,
# and re-fitting on the training rows also discards the full-dataset statistics
# that were used to build scaledFeatures.
scaler = preprocessing.StandardScaler().fit(X_train)

# NOTE: the name `lr_clf` is kept for compatibility with the other model cells,
# although this estimator is a decision tree. The tree is seeded as well.
lr_clf = tree.DecisionTreeClassifier(random_state=42).fit(scaler.transform(X_train), Y_train.values.ravel())

# The score used to be a bare expression in the middle of the cell, so it was
# computed and then silently discarded; print it explicitly.
print(f"Test accuracy: {lr_clf.score(scaler.transform(X_test), Y_test.values.ravel())}")

# Label the split nodes with the feature names instead of positional indices.
tree.plot_tree(lr_clf, feature_names=list(X_train.columns))